{
 "metadata": {
  "generated_at": "2026-06-12T19:17:01+00:00",
  "generator": "arena/scripts/generate_sweep_queue.py",
  "open_items": 2002,
  "corpora": 143,
  "models": [
   "google/gemini-3.5-flash",
   "anthropic/claude-haiku-4.5",
   "anthropic/claude-fable-5",
   "anthropic/claude-opus-4.8",
   "anthropic/claude-sonnet-4.6",
   "openai/gpt-5.5",
   "google/gemini-3.1-pro-preview"
  ],
  "conditions": [
   "naive",
   "coached"
  ],
  "coverage_source": "queried public run_cards (read-only) at generation time; 0 scored runs on the board",
  "priority_model": "expected-chain-value v3: items are ranked by ECV = ΔΦ / cost — the expected gain in quality-weighted mesh efficiency Φ from publishing this run, per estimated dollar. Φ averages, over all ordered language pairs, the best chain strength Q(u,v) = max over paths of λ^(hops−1)·Π(chrF++/100 per edge). Each item's predicted score is pair prior + model offset + condition offset + UCB exploration bonus, and every component is published on the item so the ranking can be re-derived by hand. Normative definition, philosophy, and citations: https://mtevalarena.org/docs/specifications/queue-construction",
  "priority_parameters": {
   "formula_version": "ecv-v3",
   "lambda_junction_discount": 0.9,
   "kappa_exploration_scale": 0.05,
   "strength_cap": 0.95,
   "cost_floor_usd": 0.01,
   "prior_fallback": 0.5,
   "reliability_thresholds": {
    "n_full": 100,
    "effective_words_healthy": 5.0,
    "ci_half_noise_floor": 5.0,
    "runs_full": 2
   },
   "phi_current": 0.0,
   "scored_runs_used": 0,
   "scored_edges": 0,
   "languages_in_graph": 59
  },
  "cost_basis": "Cost estimates come from the 2026-06 baseline sweep manifest (arena/eval/logs/sweep_manifest.json: 457 successful runs, $61.51 total). 'observed' = exact cost of the same corpus x model naive run; 'extrapolated' = that model's sweep-average cost per entry x corpus entry count. Your cost varies with provider pricing at run time.",
  "how_to_run": "Install: curl -fsSL champollion.dev/harness | bash ; set OPENROUTER_API_KEY; then paste any item's run_command. Items with corpus_fetch are not hosted anywhere by us: run them from your arena checkout and the harness downloads the pinned Tatoeba Challenge export (~169 MB, cached once for all pairs), rebuilds the corpus locally, and verifies its sha256 against the registry. Coached items: write your own coaching file first — see https://mtevalarena.org/docs/tutorials/coached-llm-prompting",
  "how_to_publish": "mt-eval publish <report.json> — sign in via OAuth when prompted. Community submissions land at the 'self-benchmarked' trust tier with your name attached; that is the trust model working as designed.",
  "dedupe_note": "No claim-locking: pick any open item. Run-card fingerprints (SHA-256 of dataset hash + model + condition + system prompt) deduplicate identical runs on publish, and independent replications of the same item are scientifically useful, not wasted.",
  "license_note": "Queued corpora are CC-BY family (Tatoeba-derived) and carry do_not_train: true — they are evaluation sets, not training data. NC-licensed and quarantined corpora are excluded from this queue."
 },
 "items": [
  {
   "priority": 1,
   "id": "eng-zul-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0104,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.01404936,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Zulu\" --yes"
  },
  {
   "priority": 2,
   "id": "spa-que-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0084,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.01017534,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Quechua\" --yes"
  },
  {
   "priority": 3,
   "id": "por-glg-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0068,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00993571,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Galician\" --yes"
  },
  {
   "priority": 4,
   "id": "eng-hau-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0095,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00987726,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 5,
   "id": "eng-xho-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0071,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00949153,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Xhosa\" --yes"
  },
  {
   "priority": 6,
   "id": "eng-war-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0092,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.0094097,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Waray\" --yes"
  },
  {
   "priority": 7,
   "id": "eng-mlt-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0102,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.0091679,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 8,
   "id": "eng-kaz-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0094,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.0091467,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Kazakh\" --yes"
  },
  {
   "priority": 9,
   "id": "eng-amh-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0098,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00911455,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Amharic\" --yes"
  },
  {
   "priority": 10,
   "id": "eng-zsm-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0107,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00909453,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Standard Malay\" --yes"
  },
  {
   "priority": 11,
   "id": "dan-fao-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0159,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00907925,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Faroese\" --yes"
  },
  {
   "priority": 12,
   "id": "deu-por-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00896401,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model anthropic/claude-haiku-4.5 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 13,
   "id": "eng-glg-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00896401,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model anthropic/claude-haiku-4.5 --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 14,
   "id": "spa-ita-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00896401,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model anthropic/claude-haiku-4.5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 15,
   "id": "deu-por-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00896401,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model anthropic/claude-haiku-4.5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 16,
   "id": "eng-glg-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00896401,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model anthropic/claude-haiku-4.5 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 17,
   "id": "spa-ita-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00896401,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model anthropic/claude-haiku-4.5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 18,
   "id": "eng-ilo-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0077,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00891292,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Ilocano\" --yes"
  },
  {
   "priority": 19,
   "id": "spa-glg-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.016,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00885995,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model anthropic/claude-haiku-4.5 --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 20,
   "id": "spa-glg-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.016,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00885995,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model anthropic/claude-haiku-4.5 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 21,
   "id": "eng-eus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0165,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00885536,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 22,
   "id": "jpn-kor-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0165,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00885536,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 23,
   "id": "nld-rus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0165,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00885536,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 24,
   "id": "eng-eus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0165,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00885536,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 25,
   "id": "jpn-kor-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0165,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00885536,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 26,
   "id": "nld-rus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0165,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00885536,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 27,
   "id": "deu-cat-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0167,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0087493,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 28,
   "id": "deu-cat-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0167,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0087493,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 29,
   "id": "nld-jpn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0168,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00869723,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 30,
   "id": "nld-jpn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0168,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00869723,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 31,
   "id": "eng-ceb-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0092,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00862069,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Cebuano\" --yes"
  },
  {
   "priority": 32,
   "id": "ita-nld-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.017,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0085949,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 33,
   "id": "ita-nld-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.017,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0085949,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 34,
   "id": "cmn-vie-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0171,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00854464,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 35,
   "id": "cmn-vie-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0171,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00854464,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 36,
   "id": "ita-dan-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0149,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00851184,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 37,
   "id": "nld-cmn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0149,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00851184,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 38,
   "id": "ita-dan-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0149,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00851184,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 39,
   "id": "nld-cmn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0149,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00851184,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 40,
   "id": "fra-dan-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0147,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.0084865,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 41,
   "id": "fra-dan-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0147,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.0084865,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 42,
   "id": "fra-vie-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0144,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00837717,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 43,
   "id": "por-dan-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0144,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00837717,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 44,
   "id": "fra-vie-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0144,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00837717,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 45,
   "id": "por-dan-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0144,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00837717,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 46,
   "id": "deu-tgl-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0175,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00834934,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model anthropic/claude-haiku-4.5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 47,
   "id": "deu-tgl-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0175,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00834934,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model anthropic/claude-haiku-4.5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 48,
   "id": "eng-cat-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0176,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0083019,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 49,
   "id": "eng-cat-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0176,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0083019,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 50,
   "id": "deu-cmn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0139,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00823701,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 51,
   "id": "spa-cmn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0139,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00823701,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 52,
   "id": "deu-cmn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0139,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00823701,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 53,
   "id": "spa-cmn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0139,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00823701,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 54,
   "id": "dan-jpn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0137,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00821008,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 55,
   "id": "dan-jpn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0137,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00821008,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 56,
   "id": "cmn-jpn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0178,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00820862,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 57,
   "id": "cmn-jpn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0178,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00820862,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 58,
   "id": "spa-cat-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0136,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00812433,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 59,
   "id": "spa-cat-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0136,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00812433,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 60,
   "id": "eng-tuk-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0181,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00807256,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Turkmen\" --yes"
  },
  {
   "priority": 61,
   "id": "spa-dan-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0181,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00807256,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 62,
   "id": "spa-dan-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0181,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00807256,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 63,
   "id": "fra-kor-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0132,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.00806944,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 64,
   "id": "fra-kor-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0132,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.00806944,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 65,
   "id": "deu-nld-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0183,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00798434,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 66,
   "id": "deu-nld-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0183,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00798434,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 67,
   "id": "eng-zul-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0183,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00798434,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Zulu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 68,
   "id": "deu-kor-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0131,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00798158,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 69,
   "id": "nld-dan-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0131,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00798158,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 70,
   "id": "por-ita-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0131,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00798158,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model anthropic/claude-haiku-4.5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 71,
   "id": "por-jpn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0131,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00798158,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 72,
   "id": "deu-kor-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0131,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00798158,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 73,
   "id": "nld-dan-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0131,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00798158,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 74,
   "id": "por-ita-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0131,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00798158,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model anthropic/claude-haiku-4.5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 75,
   "id": "por-jpn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0131,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00798158,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 76,
   "id": "fra-hau-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0115,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00795365,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 77,
   "id": "por-nld-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0129,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00795355,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 78,
   "id": "por-nld-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0129,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00795355,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 79,
   "id": "deu-dan-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0184,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00794094,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 80,
   "id": "deu-rus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0184,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00794094,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 81,
   "id": "deu-dan-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0184,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00794094,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 82,
   "id": "deu-rus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0184,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00794094,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 83,
   "id": "eng-que-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0126,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00783446,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model anthropic/claude-haiku-4.5 --target-lang \"Quechua\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 84,
   "id": "ita-vie-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0126,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00783446,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 85,
   "id": "eng-que-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0126,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00783446,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model anthropic/claude-haiku-4.5 --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 86,
   "id": "ita-vie-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0126,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00783446,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 87,
   "id": "eng-uzb-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0118,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00782573,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 88,
   "id": "eng-fry-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0124,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00780764,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model anthropic/claude-haiku-4.5 --target-lang \"Western Frisian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 89,
   "id": "eng-fry-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0124,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00780764,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model anthropic/claude-haiku-4.5 --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 90,
   "id": "spa-eus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0122,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00777994,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 91,
   "id": "eng-xho-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0122,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00777994,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Xhosa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 92,
   "id": "spa-eus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0122,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00777994,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 93,
   "id": "dan-cmn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0189,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00773087,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 94,
   "id": "dan-cmn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0189,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00773087,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 95,
   "id": "spa-kor-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0121,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00768725,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 96,
   "id": "spa-kor-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0121,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00768725,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 97,
   "id": "fra-tgl-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0119,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00765929,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model anthropic/claude-haiku-4.5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 98,
   "id": "eng-amh-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0119,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00765929,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Amharic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 99,
   "id": "fra-tgl-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0119,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00765929,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model anthropic/claude-haiku-4.5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 100,
   "id": "eng-nld-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0191,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00764992,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes"
  },
  {
   "priority": 101,
   "id": "deu-vie-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0193,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00757064,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 102,
   "id": "deu-vie-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0193,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00757064,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 103,
   "id": "eng-tuk-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0193,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00757064,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Turkmen\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 104,
   "id": "deu-ceb-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0118,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.0075657,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model anthropic/claude-haiku-4.5 --target-lang \"Cebuano\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 105,
   "id": "nld-cat-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0118,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.0075657,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 106,
   "id": "deu-ceb-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0118,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.0075657,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model anthropic/claude-haiku-4.5 --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 107,
   "id": "nld-cat-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0118,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.0075657,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 108,
   "id": "eng-tel-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0116,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00753492,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Telugu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 109,
   "id": "eng-ltz-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0114,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00750561,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model anthropic/claude-haiku-4.5 --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 110,
   "id": "por-tgl-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0114,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00750561,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model anthropic/claude-haiku-4.5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 111,
   "id": "eng-ltz-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0114,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00750561,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model anthropic/claude-haiku-4.5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 112,
   "id": "por-tgl-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0114,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00750561,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model anthropic/claude-haiku-4.5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 113,
   "id": "eng-lug-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0123,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00748386,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Ganda\" --yes"
  },
  {
   "priority": 114,
   "id": "eng-tel-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0117,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00747052,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Telugu\" --yes"
  },
  {
   "priority": 115,
   "id": "spa-vie-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0113,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.0074117,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 116,
   "id": "spa-vie-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0113,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.0074117,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 117,
   "id": "por-cat-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0111,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00738202,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 118,
   "id": "eng-lao-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0111,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00738202,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Lao\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 119,
   "id": "eng-pan-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0111,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00738202,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Panjabi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 120,
   "id": "por-cat-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0111,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00738202,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 121,
   "id": "eng-por-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0198,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00737946,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Portuguese (Brazilian)\" --yes"
  },
  {
   "priority": 122,
   "id": "jpn-eus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0199,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00734238,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 123,
   "id": "rus-eus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0199,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00734238,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 124,
   "id": "jpn-eus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0199,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00734238,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 125,
   "id": "rus-eus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0199,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00734238,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 126,
   "id": "deu-ita-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0201,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00726932,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model anthropic/claude-haiku-4.5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 127,
   "id": "deu-ita-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0201,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00726932,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model anthropic/claude-haiku-4.5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 128,
   "id": "por-cmn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0106,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00722296,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 129,
   "id": "por-cmn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0106,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00722296,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 130,
   "id": "deu-jpn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00716242,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 131,
   "id": "ita-jpn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00716242,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 132,
   "id": "por-rus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00716242,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 133,
   "id": "deu-jpn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00716242,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 134,
   "id": "ita-jpn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00716242,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 135,
   "id": "por-rus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00716242,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 136,
   "id": "ita-cat-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0103,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00709288,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 137,
   "id": "ita-cat-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0103,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00709288,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 138,
   "id": "eng-fao-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0101,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.00706263,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model anthropic/claude-haiku-4.5 --target-lang \"Faroese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 139,
   "id": "eng-fao-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0101,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.00706263,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model anthropic/claude-haiku-4.5 --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 140,
   "id": "ita-cmn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0207,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00705862,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 141,
   "id": "ita-cmn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0207,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00705862,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 142,
   "id": "eng-cmn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0209,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00699107,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes"
  },
  {
   "priority": 143,
   "id": "spa-tgl-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.01,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00696084,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model anthropic/claude-haiku-4.5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 144,
   "id": "spa-tgl-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.01,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00696084,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model anthropic/claude-haiku-4.5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 145,
   "id": "eng-spa-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0212,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00689214,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Spanish\" --yes"
  },
  {
   "priority": 146,
   "id": "eng-kan-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.01,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00689071,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Kannada\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 147,
   "id": "spa-fra-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0214,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00682773,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model anthropic/claude-haiku-4.5 --target-lang \"French\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 148,
   "id": "spa-fra-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0214,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00682773,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model anthropic/claude-haiku-4.5 --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 149,
   "id": "eng-tgl-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0215,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00679597,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Filipino\" --yes"
  },
  {
   "priority": 150,
   "id": "eng-pag-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0057,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00679135,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Pangasinan\" --yes"
  },
  {
   "priority": 151,
   "id": "eng-pag-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0098,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00679135,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Pangasinan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 152,
   "id": "eng-fra-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0217,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00673334,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"French\" --yes"
  },
  {
   "priority": 153,
   "id": "spa-nld-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0217,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00673334,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 154,
   "id": "spa-nld-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0217,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00673334,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 155,
   "id": "spa-deu-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.022,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00664152,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model anthropic/claude-haiku-4.5 --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 156,
   "id": "spa-deu-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.022,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00664152,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model anthropic/claude-haiku-4.5 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 157,
   "id": "deu-ltz-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0126,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00663308,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 158,
   "id": "eng-mon-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0126,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00663308,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Mongolian\" --yes"
  },
  {
   "priority": 159,
   "id": "spa-que-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0155,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00656473,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 160,
   "id": "fra-nld-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0224,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00652292,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 161,
   "id": "fra-nld-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0224,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00652292,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 162,
   "id": "fra-ita-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0225,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00649393,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model anthropic/claude-haiku-4.5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 163,
   "id": "fra-ita-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0225,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00649393,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model anthropic/claude-haiku-4.5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 164,
   "id": "eng-yor-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0066,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00645529,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Yoruba\" --yes"
  },
  {
   "priority": 165,
   "id": "spa-rus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0227,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00643671,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 166,
   "id": "spa-rus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0227,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00643671,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 167,
   "id": "fra-ltz-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0135,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00642899,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 168,
   "id": "eng-sin-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0091,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00639684,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Sinhala\" --yes"
  },
  {
   "priority": 169,
   "id": "eng-deu-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0229,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0063805,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"German\" --yes"
  },
  {
   "priority": 170,
   "id": "rus-dan-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.023,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00635276,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 171,
   "id": "rus-dan-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.023,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00635276,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 172,
   "id": "eng-haw-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0181,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00634503,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Hawaiian\" --yes"
  },
  {
   "priority": 173,
   "id": "fra-cmn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0232,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00629799,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 174,
   "id": "spa-jpn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0232,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00629799,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 175,
   "id": "fra-cmn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0232,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00629799,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 176,
   "id": "spa-jpn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0232,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00629799,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 177,
   "id": "ita-mlt-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0142,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00629728,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 178,
   "id": "fra-por-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0233,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00627096,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model anthropic/claude-haiku-4.5 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 179,
   "id": "spa-por-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0233,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00627096,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model anthropic/claude-haiku-4.5 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 180,
   "id": "fra-por-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0233,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00627096,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model anthropic/claude-haiku-4.5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 181,
   "id": "spa-por-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0233,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00627096,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model anthropic/claude-haiku-4.5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 182,
   "id": "eng-hil-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0051,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00612215,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Hiligaynon\" --yes"
  },
  {
   "priority": 183,
   "id": "nld-ltz-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0091,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00612215,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model anthropic/claude-haiku-4.5 --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 184,
   "id": "eng-hil-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0091,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00612215,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Hiligaynon\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 185,
   "id": "nld-ltz-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0091,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00612215,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model anthropic/claude-haiku-4.5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 186,
   "id": "eng-guj-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.024,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00608806,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Gujarati\" --yes"
  },
  {
   "priority": 187,
   "id": "eng-sme-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0052,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00594974,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Northern Sámi\" --yes"
  },
  {
   "priority": 188,
   "id": "eng-sme-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0095,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00594974,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Northern Sámi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 189,
   "id": "por-glg-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0167,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00594953,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 190,
   "id": "fra-deu-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0247,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00591552,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model anthropic/claude-haiku-4.5 --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 191,
   "id": "fra-deu-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0247,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00591552,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model anthropic/claude-haiku-4.5 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 192,
   "id": "rus-cmn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0251,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00582125,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 193,
   "id": "rus-cmn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0251,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00582125,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 194,
   "id": "eng-yor-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0111,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00581558,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Yoruba\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 195,
   "id": "fra-jpn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0253,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00577523,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 196,
   "id": "fra-jpn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0253,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00577523,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 197,
   "id": "ita-rus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0256,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00570755,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 198,
   "id": "ita-rus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0256,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00570755,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 199,
   "id": "eng-sin-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0113,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00566092,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Sinhala\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 200,
   "id": "deu-eus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0261,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00559821,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 201,
   "id": "deu-eus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0261,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00559821,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 202,
   "id": "eng-arb-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0262,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00557685,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Standard Arabic\" --yes"
  },
  {
   "priority": 203,
   "id": "rus-vie-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0263,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00555564,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 204,
   "id": "rus-vie-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0263,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00555564,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 205,
   "id": "fra-rus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0266,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00549298,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 206,
   "id": "fra-rus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0266,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00549298,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 207,
   "id": "eng-dan-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0269,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00543172,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 208,
   "id": "eng-dan-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0269,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00543172,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model anthropic/claude-haiku-4.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 209,
   "id": "eng-guj-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0269,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00543172,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Gujarati\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 210,
   "id": "rus-jpn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0274,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00533261,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 211,
   "id": "rus-jpn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0274,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00533261,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 212,
   "id": "dan-fao-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0274,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00526861,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 213,
   "id": "eng-jpn-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0279,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00523704,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes"
  },
  {
   "priority": 214,
   "id": "eng-ilo-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0171,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00521223,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Ilocano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 215,
   "id": "eng-kor-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0284,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00514484,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes"
  },
  {
   "priority": 216,
   "id": "eng-mya-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0111,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00512318,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Burmese\" --yes"
  },
  {
   "priority": 217,
   "id": "cmn-kor-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0289,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00505583,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 218,
   "id": "cmn-kor-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0289,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00505583,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 219,
   "id": "eng-tha-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.029,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00503839,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Thai\" --yes"
  },
  {
   "priority": 220,
   "id": "eng-kaz-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0184,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00497103,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 221,
   "id": "eng-vie-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0298,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00490313,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes"
  },
  {
   "priority": 222,
   "id": "eng-ita-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.03,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00487045,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model anthropic/claude-haiku-4.5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 223,
   "id": "eng-ita-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.03,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00487045,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model anthropic/claude-haiku-4.5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 224,
   "id": "por-glg-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0205,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00484669,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Galician\" --yes"
  },
  {
   "priority": 225,
   "id": "eng-tir-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0088,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00482466,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Tigrinya\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 226,
   "id": "eng-rus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0304,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00480636,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 227,
   "id": "eng-rus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0304,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00480636,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model anthropic/claude-haiku-4.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 228,
   "id": "eng-tam-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0204,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00479882,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Tamil\" --yes"
  },
  {
   "priority": 229,
   "id": "eng-mal-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.009,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00473992,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Malayalam\" --yes"
  },
  {
   "priority": 230,
   "id": "eng-mal-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0096,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00473992,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Malayalam\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 231,
   "id": "eng-zul-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0313,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00466816,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Zulu\" --yes"
  },
  {
   "priority": 232,
   "id": "eng-urd-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0192,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00462692,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Urdu\" --yes"
  },
  {
   "priority": 233,
   "id": "nld-fry-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0044,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00460842,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Western Frisian\" --yes"
  },
  {
   "priority": 234,
   "id": "nld-fry-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0095,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00460842,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 235,
   "id": "eng-bos-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0041,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.0045938,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Bosnian\" --yes"
  },
  {
   "priority": 236,
   "id": "eng-xho-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0209,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.0045414,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Xhosa\" --yes"
  },
  {
   "priority": 237,
   "id": "rus-kor-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0323,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00452363,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 238,
   "id": "rus-kor-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0323,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00452363,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 239,
   "id": "eng-mya-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0126,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00451328,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Burmese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 240,
   "id": "jpn-vie-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 241,
   "id": "rus-kaz-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model anthropic/claude-haiku-4.5 --target-lang \"Kazakh\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 242,
   "id": "eng-arb-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Standard Arabic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 243,
   "id": "eng-cmn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 244,
   "id": "eng-deu-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 245,
   "id": "eng-fra-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 246,
   "id": "eng-jpn-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 247,
   "id": "eng-kor-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 248,
   "id": "eng-nld-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 249,
   "id": "eng-por-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 250,
   "id": "eng-spa-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Spanish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 251,
   "id": "eng-tgl-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 252,
   "id": "eng-tha-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Thai\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 253,
   "id": "eng-vie-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 254,
   "id": "jpn-vie-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model anthropic/claude-haiku-4.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 255,
   "id": "rus-kaz-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0044683,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model anthropic/claude-haiku-4.5 --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 256,
   "id": "eng-mlt-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0211,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00443188,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 257,
   "id": "eng-bos-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0104,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00441712,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Bosnian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 258,
   "id": "eng-war-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0214,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00439706,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Waray\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 259,
   "id": "deu-por-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0337,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00433571,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 260,
   "id": "eng-glg-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0337,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00433571,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 261,
   "id": "spa-ita-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0337,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00433571,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 262,
   "id": "deu-por-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0337,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00433571,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 263,
   "id": "eng-glg-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0337,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00433571,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 264,
   "id": "spa-ita-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0337,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00433571,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 265,
   "id": "eng-sna-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0039,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.0043308,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Shona\" --yes"
  },
  {
   "priority": 266,
   "id": "eng-sna-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0077,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.0043308,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Shona\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 267,
   "id": "eng-hau-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0229,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00431322,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 268,
   "id": "eng-eus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.034,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00429745,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 269,
   "id": "jpn-kor-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.034,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00429745,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 270,
   "id": "nld-rus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.034,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00429745,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 271,
   "id": "eng-eus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.034,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00429745,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 272,
   "id": "jpn-kor-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.034,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00429745,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 273,
   "id": "nld-rus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.034,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00429745,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 274,
   "id": "spa-glg-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.033,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00429573,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 275,
   "id": "spa-glg-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.033,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00429573,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 276,
   "id": "deu-cat-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0344,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00424748,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 277,
   "id": "deu-cat-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0344,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00424748,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 278,
   "id": "nld-jpn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0347,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00421076,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 279,
   "id": "nld-jpn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0347,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00421076,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 280,
   "id": "eng-kan-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0165,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00417619,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Kannada\" --yes"
  },
  {
   "priority": 281,
   "id": "ita-nld-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.035,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00417467,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 282,
   "id": "ita-nld-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.035,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00417467,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 283,
   "id": "ita-dan-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0307,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00413115,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 284,
   "id": "nld-cmn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0307,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00413115,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 285,
   "id": "ita-dan-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0307,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00413115,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 286,
   "id": "nld-cmn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0307,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00413115,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 287,
   "id": "cmn-vie-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0354,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0041275,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 288,
   "id": "cmn-vie-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0354,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0041275,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 289,
   "id": "fra-dan-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0303,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00411721,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 290,
   "id": "fra-dan-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0303,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00411721,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 291,
   "id": "fra-vie-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0297,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00406166,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 292,
   "id": "por-dan-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0297,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00406166,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 293,
   "id": "fra-vie-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0297,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00406166,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 294,
   "id": "por-dan-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0297,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00406166,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 295,
   "id": "deu-tgl-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0361,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00404746,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 296,
   "id": "deu-tgl-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0361,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00404746,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 297,
   "id": "eng-zsm-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0242,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00402114,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Standard Malay\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 298,
   "id": "eng-cat-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0364,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0040141,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 299,
   "id": "eng-cat-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0364,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0040141,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 300,
   "id": "deu-cmn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0286,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.0040033,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 301,
   "id": "spa-cmn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0286,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.0040033,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 302,
   "id": "deu-cmn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0286,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.0040033,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 303,
   "id": "spa-cmn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0286,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.0040033,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 304,
   "id": "eng-ceb-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0216,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00399106,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 305,
   "id": "cmn-jpn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0367,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00398129,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 306,
   "id": "cmn-jpn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0367,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00398129,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 307,
   "id": "dan-jpn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0283,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00397449,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 308,
   "id": "dan-jpn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0283,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00397449,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 309,
   "id": "eng-pag-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0171,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00397155,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Pangasinan\" --yes"
  },
  {
   "priority": 310,
   "id": "eng-hil-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0155,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00394977,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Hiligaynon\" --yes"
  },
  {
   "priority": 311,
   "id": "spa-cat-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.028,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00394611,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 312,
   "id": "spa-cat-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.028,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00394611,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 313,
   "id": "eng-tam-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.025,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00391584,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Tamil\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 314,
   "id": "eng-sme-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0152,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.0039143,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Northern Sámi\" --yes"
  },
  {
   "priority": 315,
   "id": "spa-dan-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0374,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00390677,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 316,
   "id": "spa-dan-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0374,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00390677,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 317,
   "id": "fra-kor-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0273,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.00390171,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 318,
   "id": "fra-kor-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0273,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.00390171,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 319,
   "id": "eng-ilo-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0229,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.0038921,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Ilocano\" --yes"
  },
  {
   "priority": 320,
   "id": "deu-nld-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0377,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00387569,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 321,
   "id": "deu-nld-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0377,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00387569,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 322,
   "id": "eng-zul-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0377,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00387569,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Zulu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 323,
   "id": "deu-kor-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.027,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00387255,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 324,
   "id": "nld-dan-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.027,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00387255,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 325,
   "id": "por-ita-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.027,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00387255,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 326,
   "id": "por-jpn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.027,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00387255,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 327,
   "id": "deu-kor-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.027,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00387255,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 328,
   "id": "nld-dan-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.027,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00387255,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 329,
   "id": "por-ita-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.027,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00387255,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 330,
   "id": "por-jpn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.027,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00387255,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 331,
   "id": "por-nld-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0266,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00385717,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 332,
   "id": "por-nld-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0266,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00385717,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 333,
   "id": "spa-que-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0265,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00383975,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Quechua\" --yes"
  },
  {
   "priority": 334,
   "id": "deu-dan-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0381,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.003835,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 335,
   "id": "deu-rus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0381,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.003835,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 336,
   "id": "deu-dan-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0381,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.003835,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 337,
   "id": "deu-rus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0381,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.003835,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 338,
   "id": "eng-sna-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0113,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00383257,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Shona\" --yes"
  },
  {
   "priority": 339,
   "id": "eng-que-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0259,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00381136,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Quechua\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 340,
   "id": "ita-vie-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0259,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00381136,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 341,
   "id": "eng-que-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0259,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00381136,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 342,
   "id": "ita-vie-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0259,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00381136,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 343,
   "id": "eng-fry-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0256,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00378183,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Western Frisian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 344,
   "id": "eng-fry-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0256,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00378183,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 345,
   "id": "eng-bos-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0122,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00376541,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Bosnian\" --yes"
  },
  {
   "priority": 346,
   "id": "spa-eus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0253,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00375159,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 347,
   "id": "eng-xho-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0253,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00375159,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Xhosa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 348,
   "id": "spa-eus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0253,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00375159,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 349,
   "id": "dan-cmn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0391,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00373692,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 350,
   "id": "dan-cmn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0391,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00373692,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 351,
   "id": "spa-kor-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0249,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00373557,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 352,
   "id": "spa-kor-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0249,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00373557,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 353,
   "id": "eng-mon-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0225,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00371453,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Mongolian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 354,
   "id": "fra-tgl-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0246,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.0037051,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 355,
   "id": "eng-amh-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0246,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.0037051,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Amharic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 356,
   "id": "fra-tgl-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0246,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.0037051,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 357,
   "id": "deu-ceb-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0243,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00367388,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Cebuano\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 358,
   "id": "nld-cat-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0243,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00367388,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 359,
   "id": "deu-ceb-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0243,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00367388,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 360,
   "id": "nld-cat-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0243,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00367388,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 361,
   "id": "deu-vie-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0398,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00367119,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 362,
   "id": "deu-vie-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0398,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00367119,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 363,
   "id": "eng-tuk-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0398,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00367119,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Turkmen\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 364,
   "id": "eng-tel-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0239,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00365711,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Telugu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 365,
   "id": "eng-ltz-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0236,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00362559,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 366,
   "id": "por-tgl-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0236,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00362559,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 367,
   "id": "eng-ltz-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0236,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00362559,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 368,
   "id": "por-tgl-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0236,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00362559,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 369,
   "id": "eng-haw-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0317,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00362287,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Hawaiian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 370,
   "id": "spa-vie-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0233,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.00359451,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 371,
   "id": "spa-vie-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0233,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.00359451,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 372,
   "id": "por-cat-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0229,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00357818,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 373,
   "id": "eng-lao-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0229,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00357818,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Lao\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 374,
   "id": "eng-pan-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0229,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00357818,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Panjabi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 375,
   "id": "por-cat-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0229,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00357818,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 376,
   "id": "jpn-eus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0411,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00355507,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 377,
   "id": "rus-eus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0411,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00355507,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 378,
   "id": "jpn-eus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0411,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00355507,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 379,
   "id": "rus-eus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0411,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00355507,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 380,
   "id": "deu-ita-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0414,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00352931,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 381,
   "id": "deu-ita-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0414,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00352931,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 382,
   "id": "eng-lao-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0233,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00351675,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Lao\" --yes"
  },
  {
   "priority": 383,
   "id": "rus-uzb-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0044,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00351257,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 384,
   "id": "rus-uzb-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0083,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00351257,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 385,
   "id": "por-cmn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0219,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00349605,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 386,
   "id": "por-cmn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0219,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00349605,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 387,
   "id": "deu-jpn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0421,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00347063,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 388,
   "id": "ita-jpn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0421,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00347063,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 389,
   "id": "por-rus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0421,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00347063,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 390,
   "id": "deu-jpn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0421,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00347063,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 391,
   "id": "ita-jpn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0421,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00347063,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 392,
   "id": "por-rus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0421,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00347063,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 393,
   "id": "ita-cat-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0212,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00344607,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 394,
   "id": "ita-cat-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0212,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00344607,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 395,
   "id": "nld-fry-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0134,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00343912,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Western Frisian\" --yes"
  },
  {
   "priority": 396,
   "id": "ita-cmn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0428,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00341386,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 397,
   "id": "ita-cmn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0428,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00341386,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 398,
   "id": "eng-fao-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0209,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.00341304,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Faroese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 399,
   "id": "eng-fao-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0209,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.00341304,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 400,
   "id": "eng-uzb-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0273,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00338255,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 401,
   "id": "spa-tgl-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0206,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00337905,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 402,
   "id": "spa-tgl-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0206,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00337905,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 403,
   "id": "eng-pag-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0202,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00336205,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Pangasinan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 404,
   "id": "eng-kan-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0206,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.003345,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Kannada\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 405,
   "id": "fra-hau-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0274,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00333821,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 406,
   "id": "spa-fra-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0441,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00331323,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model anthropic/claude-sonnet-4.6 --target-lang \"French\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 407,
   "id": "spa-fra-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0441,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00331323,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model anthropic/claude-sonnet-4.6 --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 408,
   "id": "fra-eus-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0036,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00331093,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes"
  },
  {
   "priority": 409,
   "id": "fra-eus-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0096,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00331093,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 410,
   "id": "eng-hau-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.03,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00329242,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 411,
   "id": "eng-war-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0287,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00327864,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Waray\" --yes"
  },
  {
   "priority": 412,
   "id": "spa-nld-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0448,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00326146,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 413,
   "id": "spa-nld-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0448,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00326146,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 414,
   "id": "nld-ltz-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0189,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00323923,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 415,
   "id": "eng-hil-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0189,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00323923,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Hiligaynon\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 416,
   "id": "nld-ltz-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0189,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00323923,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 417,
   "id": "eng-pam-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0034,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00322618,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Kapampangan\" --yes"
  },
  {
   "priority": 418,
   "id": "eng-pam-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0078,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00322618,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Kapampangan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 419,
   "id": "spa-deu-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0455,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00321128,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model anthropic/claude-sonnet-4.6 --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 420,
   "id": "spa-deu-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0455,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00321128,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model anthropic/claude-sonnet-4.6 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 421,
   "id": "spa-que-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.032,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00317979,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 422,
   "id": "eng-pam-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0102,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00316293,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Kapampangan\" --yes"
  },
  {
   "priority": 423,
   "id": "fra-nld-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0462,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00316263,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 424,
   "id": "fra-nld-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0462,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00316263,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 425,
   "id": "eng-ceb-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0273,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00315776,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Cebuano\" --yes"
  },
  {
   "priority": 426,
   "id": "eng-kaz-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.029,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00315403,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Kazakh\" --yes"
  },
  {
   "priority": 427,
   "id": "fra-ita-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0465,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00314222,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 428,
   "id": "fra-ita-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0465,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00314222,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 429,
   "id": "spa-rus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0468,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00312208,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 430,
   "id": "spa-rus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0468,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00312208,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 431,
   "id": "eng-yor-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0208,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.0031035,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Yoruba\" --yes"
  },
  {
   "priority": 432,
   "id": "eng-lug-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0299,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00307864,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Ganda\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 433,
   "id": "rus-dan-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0475,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00307607,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 434,
   "id": "rus-dan-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0475,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00307607,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 435,
   "id": "eng-amh-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0298,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00305857,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Amharic\" --yes"
  },
  {
   "priority": 436,
   "id": "eng-sme-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00305115,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Northern Sámi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 437,
   "id": "eng-cym-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0037,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00305085,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Welsh\" --yes"
  },
  {
   "priority": 438,
   "id": "eng-cym-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0077,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00305085,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Welsh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 439,
   "id": "fra-cmn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0479,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00305038,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 440,
   "id": "spa-jpn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0479,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00305038,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 441,
   "id": "fra-cmn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0479,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00305038,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 442,
   "id": "spa-jpn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0479,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00305038,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 443,
   "id": "ita-mlt-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0294,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00304154,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 444,
   "id": "fra-eus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0109,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00303755,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes"
  },
  {
   "priority": 445,
   "id": "eng-mlt-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0308,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00303612,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 446,
   "id": "fra-por-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0482,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0030314,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 447,
   "id": "spa-por-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0482,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0030314,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 448,
   "id": "fra-por-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0482,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0030314,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 449,
   "id": "spa-por-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0482,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0030314,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 450,
   "id": "eng-ibo-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.004,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00302455,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Igbo\" --yes"
  },
  {
   "priority": 451,
   "id": "eng-ibo-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0057,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00302455,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Igbo\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 452,
   "id": "eng-zsm-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0324,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00300344,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Standard Malay\" --yes"
  },
  {
   "priority": 453,
   "id": "eng-urd-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0296,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00300125,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Urdu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 454,
   "id": "eng-pan-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0274,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00299053,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Panjabi\" --yes"
  },
  {
   "priority": 455,
   "id": "fra-cat-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.0034,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00295441,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes"
  },
  {
   "priority": 456,
   "id": "fra-cat-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0093,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00295441,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 457,
   "id": "dan-fao-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.049,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00294612,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Faroese\" --yes"
  },
  {
   "priority": 458,
   "id": "por-glg-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0344,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00288829,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 459,
   "id": "fra-deu-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0509,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0028706,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model anthropic/claude-sonnet-4.6 --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 460,
   "id": "fra-deu-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0509,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0028706,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model anthropic/claude-sonnet-4.6 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 461,
   "id": "deu-ltz-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.0292,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00286222,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 462,
   "id": "fra-cat-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0104,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00284078,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes"
  },
  {
   "priority": 463,
   "id": "eng-yor-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0229,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.0028189,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Yoruba\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 464,
   "id": "rus-cmn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0519,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00281529,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 465,
   "id": "rus-cmn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0519,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00281529,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 466,
   "id": "fra-jpn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0522,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00279911,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 467,
   "id": "fra-jpn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0522,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00279911,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 468,
   "id": "ita-rus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0529,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00276207,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 469,
   "id": "ita-rus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0529,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00276207,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 470,
   "id": "eng-sin-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0233,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00274543,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Sinhala\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 471,
   "id": "eng-sna-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0158,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00274101,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Shona\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 472,
   "id": "fra-ltz-dev-v1__anthropic_claude-haiku-4.5__coached",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "coached",
   "est_cost_usd": 0.032,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00271223,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 473,
   "id": "deu-eus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0539,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00271082,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 474,
   "id": "deu-eus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0539,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00271082,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 475,
   "id": "eng-cym-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0113,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00269987,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Welsh\" --yes"
  },
  {
   "priority": 476,
   "id": "rus-vie-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0543,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00269085,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 477,
   "id": "rus-vie-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0543,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00269085,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 478,
   "id": "eng-tuk-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0549,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00266145,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Turkmen\" --yes"
  },
  {
   "priority": 479,
   "id": "fra-rus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0549,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00266145,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 480,
   "id": "fra-rus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0549,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00266145,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 481,
   "id": "eng-tir-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0182,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00265091,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Tigrinya\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 482,
   "id": "eng-dan-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0556,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00262794,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 483,
   "id": "eng-dan-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0556,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00262794,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 484,
   "id": "eng-guj-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0556,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00262794,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Gujarati\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 485,
   "id": "eng-uzb-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0353,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00261597,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 486,
   "id": "eng-sin-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0245,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00261096,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Sinhala\" --yes"
  },
  {
   "priority": 487,
   "id": "fra-hau-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0354,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00258381,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 488,
   "id": "rus-jpn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0566,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00258151,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 489,
   "id": "rus-jpn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0566,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00258151,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 490,
   "id": "eng-ibo-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0118,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00256318,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Igbo\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 491,
   "id": "dan-fao-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0566,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00255053,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 492,
   "id": "rus-uzb-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0138,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00254534,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 493,
   "id": "eng-nld-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0578,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00252791,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes"
  },
  {
   "priority": 494,
   "id": "eng-tel-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0346,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00252616,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Telugu\" --yes"
  },
  {
   "priority": 495,
   "id": "eng-ilo-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0354,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00251777,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Ilocano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 496,
   "id": "eng-lug-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0366,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00251507,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Ganda\" --yes"
  },
  {
   "priority": 497,
   "id": "por-glg-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0397,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.0025027,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Galician\" --yes"
  },
  {
   "priority": 498,
   "id": "cmn-kor-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0596,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00245157,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 499,
   "id": "cmn-kor-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0596,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00245157,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 500,
   "id": "eng-por-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0598,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00244337,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Portuguese (Brazilian)\" --yes"
  },
  {
   "priority": 501,
   "id": "eng-kaz-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0381,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00240071,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 502,
   "id": "eng-mal-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0199,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00238187,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Malayalam\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 503,
   "id": "nld-fry-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00236329,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 504,
   "id": "eng-ita-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.062,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00235667,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 505,
   "id": "eng-ita-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.062,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00235667,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 506,
   "id": "deu-por-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0627,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00233036,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model anthropic/claude-opus-4.8 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 507,
   "id": "eng-glg-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0627,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00233036,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model anthropic/claude-opus-4.8 --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 508,
   "id": "eng-rus-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0627,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00233036,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 509,
   "id": "spa-ita-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0627,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00233036,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model anthropic/claude-opus-4.8 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 510,
   "id": "deu-por-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0627,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00233036,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model anthropic/claude-opus-4.8 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 511,
   "id": "eng-glg-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0627,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00233036,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model anthropic/claude-opus-4.8 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 512,
   "id": "eng-rus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0627,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00233036,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 513,
   "id": "spa-ita-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0627,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00233036,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model anthropic/claude-opus-4.8 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 514,
   "id": "eng-cmn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.063,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00231926,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes"
  },
  {
   "priority": 515,
   "id": "eng-mon-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0362,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00230875,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Mongolian\" --yes"
  },
  {
   "priority": 516,
   "id": "eng-eus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0633,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00230827,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 517,
   "id": "jpn-kor-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0633,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00230827,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 518,
   "id": "nld-rus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0633,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00230827,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 519,
   "id": "eng-eus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0633,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00230827,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 520,
   "id": "jpn-kor-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0633,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00230827,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 521,
   "id": "nld-rus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0633,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00230827,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 522,
   "id": "spa-glg-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0615,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00230503,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model anthropic/claude-opus-4.8 --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 523,
   "id": "spa-glg-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0615,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00230503,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model anthropic/claude-opus-4.8 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 524,
   "id": "eng-zul-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0638,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00229018,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Zulu\" --yes"
  },
  {
   "priority": 525,
   "id": "eng-spa-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0639,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00228659,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Spanish\" --yes"
  },
  {
   "priority": 526,
   "id": "deu-cat-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.064,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00228302,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 527,
   "id": "deu-cat-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.064,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00228302,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 528,
   "id": "eng-ibo-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0133,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.0022741,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Igbo\" --yes"
  },
  {
   "priority": 529,
   "id": "eng-tgl-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0645,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00226532,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Filipino\" --yes"
  },
  {
   "priority": 530,
   "id": "nld-jpn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0646,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00226182,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 531,
   "id": "nld-jpn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0646,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00226182,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 532,
   "id": "eng-xho-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0423,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00224386,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Xhosa\" --yes"
  },
  {
   "priority": 533,
   "id": "ita-nld-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0652,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.002241,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 534,
   "id": "ita-nld-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0652,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.002241,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 535,
   "id": "eng-fra-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0654,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00223415,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"French\" --yes"
  },
  {
   "priority": 536,
   "id": "deu-ltz-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0376,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00222279,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 537,
   "id": "ita-dan-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0571,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00222113,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 538,
   "id": "nld-cmn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0571,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00222113,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 539,
   "id": "ita-dan-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0571,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00222113,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 540,
   "id": "nld-cmn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0571,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00222113,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 541,
   "id": "cmn-vie-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0659,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0022172,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 542,
   "id": "cmn-vie-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0659,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0022172,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 543,
   "id": "fra-dan-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0564,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00221191,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 544,
   "id": "fra-dan-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0564,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00221191,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 545,
   "id": "eng-mya-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0259,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00219565,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Burmese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 546,
   "id": "rus-kor-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0667,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00219061,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 547,
   "id": "rus-kor-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0667,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00219061,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 548,
   "id": "ita-mlt-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0409,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00218634,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 549,
   "id": "fra-vie-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0552,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00218535,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 550,
   "id": "por-dan-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0552,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00218535,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 551,
   "id": "fra-vie-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0552,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00218535,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 552,
   "id": "por-dan-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0552,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00218535,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 553,
   "id": "deu-tgl-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0671,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00217755,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model anthropic/claude-opus-4.8 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 554,
   "id": "deu-tgl-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0671,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00217755,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model anthropic/claude-opus-4.8 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 555,
   "id": "jpn-vie-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 556,
   "id": "rus-kaz-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Kazakh\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 557,
   "id": "eng-arb-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Standard Arabic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 558,
   "id": "eng-cmn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 559,
   "id": "eng-deu-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 560,
   "id": "eng-fra-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 561,
   "id": "eng-jpn-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 562,
   "id": "eng-kor-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 563,
   "id": "eng-nld-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 564,
   "id": "eng-por-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 565,
   "id": "eng-spa-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Spanish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 566,
   "id": "eng-tgl-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 567,
   "id": "eng-tha-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Thai\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 568,
   "id": "eng-vie-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 569,
   "id": "jpn-vie-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 570,
   "id": "rus-kaz-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0674,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00216785,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model anthropic/claude-sonnet-4.6 --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 571,
   "id": "eng-tir-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0223,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00216353,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Tigrinya\" --yes"
  },
  {
   "priority": 572,
   "id": "eng-cat-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0677,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00215825,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 573,
   "id": "eng-cat-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0677,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00215825,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 574,
   "id": "eng-mlt-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0435,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00214971,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 575,
   "id": "deu-cmn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0533,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00214811,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 576,
   "id": "spa-cmn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0533,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00214811,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 577,
   "id": "deu-cmn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0533,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00214811,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 578,
   "id": "spa-cmn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0533,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00214811,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 579,
   "id": "cmn-jpn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0684,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00213616,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 580,
   "id": "cmn-jpn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0684,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00213616,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 581,
   "id": "dan-jpn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0527,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00213431,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 582,
   "id": "dan-jpn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0527,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00213431,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 583,
   "id": "eng-war-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0441,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00213372,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Waray\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 584,
   "id": "eng-bos-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0216,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00212676,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Bosnian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 585,
   "id": "spa-cat-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0521,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00212075,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 586,
   "id": "spa-cat-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0521,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00212075,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 587,
   "id": "fra-ltz-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.041,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00211686,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 588,
   "id": "eng-deu-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0692,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00211147,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"German\" --yes"
  },
  {
   "priority": 589,
   "id": "spa-dan-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0696,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00209933,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 590,
   "id": "spa-dan-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0696,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00209933,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 591,
   "id": "fra-kor-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0508,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.00209678,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 592,
   "id": "fra-kor-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0508,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.00209678,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 593,
   "id": "eng-hau-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0472,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00209264,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 594,
   "id": "deu-kor-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0502,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00208284,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 595,
   "id": "nld-dan-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0502,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00208284,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 596,
   "id": "por-ita-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0502,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00208284,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model anthropic/claude-opus-4.8 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 597,
   "id": "por-jpn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0502,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00208284,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 598,
   "id": "deu-kor-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0502,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00208284,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 599,
   "id": "nld-dan-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0502,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00208284,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 600,
   "id": "por-ita-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0502,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00208284,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model anthropic/claude-opus-4.8 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 601,
   "id": "por-jpn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0502,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00208284,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 602,
   "id": "deu-nld-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0702,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00208139,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 603,
   "id": "deu-nld-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0702,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00208139,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 604,
   "id": "eng-zul-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0702,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00208139,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Zulu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 605,
   "id": "por-nld-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0495,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00207274,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 606,
   "id": "por-nld-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0495,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00207274,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 607,
   "id": "deu-dan-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0709,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00206084,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 608,
   "id": "deu-rus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0709,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00206084,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 609,
   "id": "deu-dan-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0709,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00206084,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 610,
   "id": "deu-rus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0709,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00206084,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 611,
   "id": "eng-haw-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0558,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00205816,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Hawaiian\" --yes"
  },
  {
   "priority": 612,
   "id": "eng-que-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0483,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00204377,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model anthropic/claude-opus-4.8 --target-lang \"Quechua\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 613,
   "id": "ita-vie-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0483,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00204377,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 614,
   "id": "eng-que-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0483,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00204377,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model anthropic/claude-opus-4.8 --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 615,
   "id": "ita-vie-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0483,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00204377,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 616,
   "id": "rus-uzb-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0172,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00204219,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 617,
   "id": "eng-sme-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0292,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00203758,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Northern Sámi\" --yes"
  },
  {
   "priority": 618,
   "id": "eng-guj-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0718,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00203501,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Gujarati\" --yes"
  },
  {
   "priority": 619,
   "id": "eng-fry-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0477,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00202966,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model anthropic/claude-opus-4.8 --target-lang \"Western Frisian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 620,
   "id": "eng-fry-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0477,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00202966,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model anthropic/claude-opus-4.8 --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 621,
   "id": "spa-eus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.047,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00201947,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 622,
   "id": "eng-xho-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.047,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00201947,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Xhosa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 623,
   "id": "spa-eus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.047,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00201947,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 624,
   "id": "dan-cmn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0728,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00200705,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 625,
   "id": "dan-cmn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0728,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00200705,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 626,
   "id": "spa-kor-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0464,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00200465,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 627,
   "id": "spa-kor-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0464,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00200465,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 628,
   "id": "eng-pam-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0162,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00199147,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Kapampangan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 629,
   "id": "fra-tgl-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0458,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00199008,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model anthropic/claude-opus-4.8 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 630,
   "id": "eng-amh-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0458,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00199008,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Amharic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 631,
   "id": "fra-tgl-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0458,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00199008,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model anthropic/claude-opus-4.8 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 632,
   "id": "deu-ceb-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0452,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00197512,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model anthropic/claude-opus-4.8 --target-lang \"Cebuano\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 633,
   "id": "nld-cat-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0452,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00197512,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 634,
   "id": "deu-ceb-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0452,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00197512,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model anthropic/claude-opus-4.8 --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 635,
   "id": "nld-cat-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0452,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00197512,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 636,
   "id": "deu-vie-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.074,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00197451,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 637,
   "id": "deu-vie-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.074,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00197451,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 638,
   "id": "eng-tuk-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.074,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00197451,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Turkmen\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 639,
   "id": "eng-tel-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0445,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00196416,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Telugu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 640,
   "id": "eng-zsm-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0499,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00195013,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Standard Malay\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 641,
   "id": "eng-ltz-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0439,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00194907,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model anthropic/claude-opus-4.8 --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 642,
   "id": "por-tgl-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0439,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00194907,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model anthropic/claude-opus-4.8 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 643,
   "id": "eng-ltz-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0439,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00194907,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model anthropic/claude-opus-4.8 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 644,
   "id": "por-tgl-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0439,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00194907,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model anthropic/claude-opus-4.8 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 645,
   "id": "eng-pag-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0349,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00194595,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Pangasinan\" --yes"
  },
  {
   "priority": 646,
   "id": "spa-que-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0524,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00194186,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Quechua\" --yes"
  },
  {
   "priority": 647,
   "id": "eng-ceb-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0445,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00193723,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 648,
   "id": "spa-vie-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0433,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.00193423,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 649,
   "id": "spa-vie-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0433,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.00193423,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 650,
   "id": "eng-cym-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0158,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00193092,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Welsh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 651,
   "id": "por-cat-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0427,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00191898,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 652,
   "id": "eng-lao-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0427,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00191898,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Lao\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 653,
   "id": "eng-pan-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0427,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00191898,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Panjabi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 654,
   "id": "por-cat-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0427,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00191898,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 655,
   "id": "jpn-eus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0765,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00190998,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 656,
   "id": "rus-eus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0765,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00190998,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 657,
   "id": "jpn-eus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0765,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00190998,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 658,
   "id": "rus-eus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0765,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00190998,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 659,
   "id": "eng-tam-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0516,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00189721,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Tamil\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 660,
   "id": "deu-ita-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0771,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00189512,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model anthropic/claude-opus-4.8 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 661,
   "id": "deu-ita-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0771,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00189512,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model anthropic/claude-opus-4.8 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 662,
   "id": "eng-hil-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0326,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00187796,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Hiligaynon\" --yes"
  },
  {
   "priority": 663,
   "id": "por-cmn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0408,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00187655,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 664,
   "id": "por-cmn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0408,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00187655,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 665,
   "id": "eng-sna-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0231,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00187481,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Shona\" --yes"
  },
  {
   "priority": 666,
   "id": "eng-ilo-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0478,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00186463,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Ilocano\" --yes"
  },
  {
   "priority": 667,
   "id": "deu-jpn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0784,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00186369,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 668,
   "id": "ita-jpn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0784,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00186369,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 669,
   "id": "por-rus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0784,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00186369,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 670,
   "id": "deu-jpn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0784,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00186369,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 671,
   "id": "ita-jpn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0784,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00186369,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 672,
   "id": "por-rus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0784,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00186369,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 673,
   "id": "eng-kaz-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0491,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00186287,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Kazakh\" --yes"
  },
  {
   "priority": 674,
   "id": "eng-bos-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0247,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00185984,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Bosnian\" --yes"
  },
  {
   "priority": 675,
   "id": "ita-cat-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0395,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00184954,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 676,
   "id": "ita-cat-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0395,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00184954,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 677,
   "id": "eng-fao-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0389,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.00183374,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model anthropic/claude-opus-4.8 --target-lang \"Faroese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 678,
   "id": "eng-fao-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0389,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.00183374,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model anthropic/claude-opus-4.8 --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 679,
   "id": "ita-cmn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0797,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00183329,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 680,
   "id": "ita-cmn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0797,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00183329,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 681,
   "id": "eng-arb-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0799,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0018287,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Standard Arabic\" --yes"
  },
  {
   "priority": 682,
   "id": "spa-tgl-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0383,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00181745,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model anthropic/claude-opus-4.8 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 683,
   "id": "spa-tgl-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0383,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00181745,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model anthropic/claude-opus-4.8 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 684,
   "id": "eng-pag-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0376,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00180621,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Pangasinan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 685,
   "id": "eng-mal-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0263,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00180225,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Malayalam\" --yes"
  },
  {
   "priority": 686,
   "id": "eng-kan-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0383,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00179914,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Kannada\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 687,
   "id": "eng-mon-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0465,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00179735,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Mongolian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 688,
   "id": "eng-jpn-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0819,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00178405,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Japanese\" --yes"
  },
  {
   "priority": 689,
   "id": "spa-fra-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0822,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00177754,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model anthropic/claude-opus-4.8 --target-lang \"French\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 690,
   "id": "spa-fra-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0822,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00177754,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model anthropic/claude-opus-4.8 --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 691,
   "id": "eng-amh-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0515,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00176982,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Amharic\" --yes"
  },
  {
   "priority": 692,
   "id": "eng-haw-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0654,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00175604,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Hawaiian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 693,
   "id": "spa-nld-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0834,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00175196,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 694,
   "id": "spa-nld-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0834,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00175196,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 695,
   "id": "nld-ltz-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0351,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.0017442,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model anthropic/claude-opus-4.8 --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 696,
   "id": "eng-hil-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0351,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.0017442,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Hiligaynon\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 697,
   "id": "nld-ltz-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0351,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.0017442,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model anthropic/claude-opus-4.8 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 698,
   "id": "nld-fry-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0265,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00173902,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Western Frisian\" --yes"
  },
  {
   "priority": 699,
   "id": "spa-deu-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0847,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00172507,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model anthropic/claude-opus-4.8 --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 700,
   "id": "spa-deu-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0847,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00172507,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model anthropic/claude-opus-4.8 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 701,
   "id": "eng-kor-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.085,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00171898,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Korean\" --yes"
  },
  {
   "priority": 702,
   "id": "spa-que-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0596,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00170727,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 703,
   "id": "fra-nld-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0859,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00170097,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 704,
   "id": "fra-nld-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0859,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00170097,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 705,
   "id": "eng-mya-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0335,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00169753,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Burmese\" --yes"
  },
  {
   "priority": 706,
   "id": "fra-ita-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0866,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00168722,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model anthropic/claude-opus-4.8 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 707,
   "id": "fra-ita-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0866,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00168722,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model anthropic/claude-opus-4.8 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 708,
   "id": "eng-tha-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0872,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00167561,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Thai\" --yes"
  },
  {
   "priority": 709,
   "id": "spa-rus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0872,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00167561,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 710,
   "id": "spa-rus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0872,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00167561,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 711,
   "id": "fra-eus-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0199,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00166378,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 712,
   "id": "rus-dan-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0884,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00165287,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 713,
   "id": "rus-dan-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0884,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00165287,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 714,
   "id": "eng-yor-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0391,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00165097,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Yoruba\" --yes"
  },
  {
   "priority": 715,
   "id": "eng-tam-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0593,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00165086,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Tamil\" --yes"
  },
  {
   "priority": 716,
   "id": "eng-uzb-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0563,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00164021,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 717,
   "id": "eng-vie-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0891,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00163988,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Vietnamese\" --yes"
  },
  {
   "priority": 718,
   "id": "fra-cmn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0891,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00163988,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 719,
   "id": "spa-jpn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0891,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00163988,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 720,
   "id": "fra-cmn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0891,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00163988,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 721,
   "id": "spa-jpn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0891,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00163988,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 722,
   "id": "eng-sme-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0364,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00163454,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Northern Sámi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 723,
   "id": "fra-por-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0897,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00162891,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model anthropic/claude-opus-4.8 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 724,
   "id": "spa-por-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0897,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00162891,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model anthropic/claude-opus-4.8 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 725,
   "id": "fra-por-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0897,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00162891,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model anthropic/claude-opus-4.8 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 726,
   "id": "spa-por-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0897,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00162891,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model anthropic/claude-opus-4.8 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 727,
   "id": "eng-hau-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0607,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00162723,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 728,
   "id": "eng-pam-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0199,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.0016212,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Kapampangan\" --yes"
  },
  {
   "priority": 729,
   "id": "fra-hau-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0566,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00161602,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 730,
   "id": "fra-eus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0208,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00159179,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes"
  },
  {
   "priority": 731,
   "id": "eng-mlt-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0588,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00159035,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 732,
   "id": "eng-war-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0594,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00158412,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Waray\" --yes"
  },
  {
   "priority": 733,
   "id": "eng-ceb-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.055,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.0015674,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Cebuano\" --yes"
  },
  {
   "priority": 734,
   "id": "por-glg-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.064,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00155245,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 735,
   "id": "fra-deu-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0947,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00154291,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model anthropic/claude-opus-4.8 --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 736,
   "id": "fra-deu-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0947,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00154291,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model anthropic/claude-opus-4.8 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 737,
   "id": "fra-cat-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0192,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00153876,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 738,
   "id": "eng-urd-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0582,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00152641,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Urdu\" --yes"
  },
  {
   "priority": 739,
   "id": "fra-cat-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0195,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00151508,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes"
  },
  {
   "priority": 740,
   "id": "rus-cmn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0966,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00151256,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 741,
   "id": "rus-cmn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0966,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00151256,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 742,
   "id": "eng-yor-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0427,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00151178,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Yoruba\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 743,
   "id": "eng-sin-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0424,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00150869,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Sinhala\" --yes"
  },
  {
   "priority": 744,
   "id": "fra-jpn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0972,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00150322,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 745,
   "id": "fra-jpn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0972,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00150322,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 746,
   "id": "eng-tel-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0584,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00149666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Telugu\" --yes"
  },
  {
   "priority": 747,
   "id": "eng-lug-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0617,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00149192,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Ganda\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 748,
   "id": "dan-fao-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0968,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00149132,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Faroese\" --yes"
  },
  {
   "priority": 749,
   "id": "eng-zsm-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0653,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00149022,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Standard Malay\" --yes"
  },
  {
   "priority": 750,
   "id": "ita-rus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00148338,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 751,
   "id": "ita-rus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00148338,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 752,
   "id": "eng-sin-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0433,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00147733,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Sinhala\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 753,
   "id": "ita-mlt-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0607,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00147317,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 754,
   "id": "eng-sna-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0295,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00146807,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Shona\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 755,
   "id": "eng-urd-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.061,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00145634,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Urdu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 756,
   "id": "deu-eus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1004,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00145531,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 757,
   "id": "deu-eus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1004,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00145531,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 758,
   "id": "rus-vie-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.101,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00144667,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 759,
   "id": "rus-vie-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.101,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00144667,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 760,
   "id": "fra-rus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1022,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00142968,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 761,
   "id": "fra-rus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1022,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00142968,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 762,
   "id": "eng-tir-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0339,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.0014232,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Tigrinya\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 763,
   "id": "eng-dan-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1035,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00141172,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 764,
   "id": "eng-dan-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1035,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00141172,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model anthropic/claude-opus-4.8 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 765,
   "id": "eng-guj-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1035,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00141172,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Gujarati\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 766,
   "id": "rus-jpn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1054,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00138627,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 767,
   "id": "rus-jpn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1054,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00138627,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 768,
   "id": "deu-ltz-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.0603,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00138602,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 769,
   "id": "eng-ibo-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.022,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00137479,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Igbo\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 770,
   "id": "dan-fao-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1054,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00136964,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 771,
   "id": "eng-ilo-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0659,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00135249,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Ilocano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 772,
   "id": "eng-kan-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0511,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00134847,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Kannada\" --yes"
  },
  {
   "priority": 773,
   "id": "rus-uzb-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0261,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00134581,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 774,
   "id": "eng-mon-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0626,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00133509,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Mongolian\" --yes"
  },
  {
   "priority": 775,
   "id": "eng-cmn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1095,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00133437,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes"
  },
  {
   "priority": 776,
   "id": "eng-amh-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0684,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00133254,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Amharic\" --yes"
  },
  {
   "priority": 777,
   "id": "deu-por-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00132589,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model google/gemini-3.5-flash --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 778,
   "id": "eng-glg-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00132589,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model google/gemini-3.5-flash --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 779,
   "id": "eng-tuk-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1102,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00132589,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Turkmen\" --yes"
  },
  {
   "priority": 780,
   "id": "spa-ita-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00132589,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model google/gemini-3.5-flash --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 781,
   "id": "deu-por-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00132589,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model google/gemini-3.5-flash --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 782,
   "id": "eng-glg-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00132589,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model google/gemini-3.5-flash --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 783,
   "id": "spa-ita-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00132589,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model google/gemini-3.5-flash --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 784,
   "id": "cmn-kor-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.111,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00131634,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 785,
   "id": "cmn-kor-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.111,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00131634,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 786,
   "id": "fra-ltz-dev-v1__anthropic_claude-sonnet-4.6__coached",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "coached",
   "est_cost_usd": 0.066,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00131502,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 787,
   "id": "eng-uzb-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0703,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00131357,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 788,
   "id": "eng-eus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1113,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00131279,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model google/gemini-3.5-flash --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 789,
   "id": "jpn-kor-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1113,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00131279,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 790,
   "id": "nld-rus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1113,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00131279,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 791,
   "id": "eng-eus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1113,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00131279,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model google/gemini-3.5-flash --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 792,
   "id": "jpn-kor-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1113,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00131279,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 793,
   "id": "nld-rus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1113,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00131279,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 794,
   "id": "spa-glg-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.108,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00131259,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model google/gemini-3.5-flash --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 795,
   "id": "spa-glg-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.108,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00131259,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model google/gemini-3.5-flash --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 796,
   "id": "eng-cmn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1118,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00130692,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes"
  },
  {
   "priority": 797,
   "id": "deu-cat-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1124,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00129994,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 798,
   "id": "deu-cat-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1124,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00129994,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 799,
   "id": "eng-cym-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0236,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00129273,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Welsh\" --yes"
  },
  {
   "priority": 800,
   "id": "eng-kaz-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0709,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00129008,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 801,
   "id": "nld-jpn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1135,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00128734,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 802,
   "id": "nld-jpn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1135,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00128734,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 803,
   "id": "fra-hau-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0713,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00128285,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 804,
   "id": "eng-mal-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.037,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00128106,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Malayalam\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 805,
   "id": "ita-nld-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1146,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00127499,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 806,
   "id": "ita-nld-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1146,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00127499,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 807,
   "id": "eng-ita-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1154,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00126615,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model anthropic/claude-opus-4.8 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 808,
   "id": "eng-ita-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1154,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00126615,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model anthropic/claude-opus-4.8 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 809,
   "id": "nld-fry-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0364,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00126605,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 810,
   "id": "ita-dan-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1003,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00126447,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 811,
   "id": "nld-cmn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1003,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00126447,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 812,
   "id": "ita-dan-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1003,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00126447,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 813,
   "id": "nld-cmn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1003,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00126447,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 814,
   "id": "cmn-vie-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1157,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00126286,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 815,
   "id": "cmn-vie-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1157,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00126286,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 816,
   "id": "eng-lug-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0729,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00126271,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Ganda\" --yes"
  },
  {
   "priority": 817,
   "id": "fra-dan-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0992,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00125758,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 818,
   "id": "fra-dan-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0992,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00125758,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 819,
   "id": "eng-xho-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0758,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00125218,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Xhosa\" --yes"
  },
  {
   "priority": 820,
   "id": "eng-rus-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1167,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00125204,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 821,
   "id": "eng-rus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1167,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00125204,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model anthropic/claude-opus-4.8 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 822,
   "id": "fra-vie-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.097,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00124362,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 823,
   "id": "por-dan-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.097,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00124362,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 824,
   "id": "fra-vie-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.097,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00124362,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 825,
   "id": "por-dan-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.097,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00124362,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 826,
   "id": "deu-tgl-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1179,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0012393,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model google/gemini-3.5-flash --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 827,
   "id": "deu-tgl-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1179,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0012393,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model google/gemini-3.5-flash --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 828,
   "id": "eng-cat-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.119,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00122784,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 829,
   "id": "eng-cat-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.119,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00122784,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 830,
   "id": "deu-cmn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0937,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00122193,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 831,
   "id": "spa-cmn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0937,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00122193,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 832,
   "id": "deu-cmn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0937,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00122193,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 833,
   "id": "spa-cmn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0937,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00122193,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 834,
   "id": "cmn-jpn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1201,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0012166,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 835,
   "id": "cmn-jpn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1201,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0012166,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 836,
   "id": "dan-jpn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0926,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00121467,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 837,
   "id": "dan-jpn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0926,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00121467,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 838,
   "id": "spa-cat-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0915,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00120755,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 839,
   "id": "spa-cat-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0915,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00120755,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 840,
   "id": "spa-dan-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1223,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00119471,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 841,
   "id": "spa-dan-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1223,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00119471,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 842,
   "id": "fra-kor-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0893,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.0011928,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 843,
   "id": "fra-kor-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0893,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.0011928,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 844,
   "id": "eng-tir-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0405,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00119128,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Tigrinya\" --yes"
  },
  {
   "priority": 845,
   "id": "eng-guj-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1227,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00119082,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Gujarati\" --yes"
  },
  {
   "priority": 846,
   "id": "deu-kor-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0882,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00118547,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 847,
   "id": "nld-dan-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0882,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00118547,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 848,
   "id": "por-ita-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0882,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00118547,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model google/gemini-3.5-flash --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 849,
   "id": "por-jpn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0882,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00118547,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 850,
   "id": "deu-kor-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0882,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00118547,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 851,
   "id": "nld-dan-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0882,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00118547,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 852,
   "id": "por-ita-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0882,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00118547,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model google/gemini-3.5-flash --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 853,
   "id": "por-jpn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0882,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00118547,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 854,
   "id": "deu-nld-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1234,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00118406,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 855,
   "id": "deu-nld-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1234,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00118406,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 856,
   "id": "eng-zul-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1234,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00118406,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Zulu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 857,
   "id": "eng-ibo-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0256,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00118146,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Igbo\" --yes"
  },
  {
   "priority": 858,
   "id": "por-nld-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0871,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00117797,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 859,
   "id": "por-nld-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0871,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00117797,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 860,
   "id": "eng-mya-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0483,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00117738,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Burmese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 861,
   "id": "rus-kor-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1242,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00117644,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 862,
   "id": "rus-kor-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1242,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00117644,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 863,
   "id": "eng-zul-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1244,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00117454,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Zulu\" --yes"
  },
  {
   "priority": 864,
   "id": "deu-dan-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1245,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0011736,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 865,
   "id": "deu-rus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1245,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0011736,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 866,
   "id": "deu-dan-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1245,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0011736,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 867,
   "id": "deu-rus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1245,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0011736,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 868,
   "id": "jpn-vie-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 869,
   "id": "rus-kaz-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model anthropic/claude-opus-4.8 --target-lang \"Kazakh\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 870,
   "id": "eng-arb-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Standard Arabic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 871,
   "id": "eng-cmn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 872,
   "id": "eng-deu-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 873,
   "id": "eng-fra-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 874,
   "id": "eng-jpn-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 875,
   "id": "eng-kor-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 876,
   "id": "eng-nld-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 877,
   "id": "eng-por-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 878,
   "id": "eng-spa-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Spanish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 879,
   "id": "eng-tgl-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 880,
   "id": "eng-tha-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Thai\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 881,
   "id": "eng-vie-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 882,
   "id": "jpn-vie-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 883,
   "id": "rus-kaz-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1254,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00116518,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model anthropic/claude-opus-4.8 --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 884,
   "id": "eng-tel-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0751,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00116385,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Telugu\" --yes"
  },
  {
   "priority": 885,
   "id": "eng-que-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0849,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00116271,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model google/gemini-3.5-flash --target-lang \"Quechua\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 886,
   "id": "ita-vie-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0849,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00116271,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 887,
   "id": "eng-que-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0849,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00116271,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model google/gemini-3.5-flash --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 888,
   "id": "ita-vie-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0849,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00116271,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 889,
   "id": "eng-nld-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1257,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0011624,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Dutch\" --yes"
  },
  {
   "priority": 890,
   "id": "eng-lao-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0707,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00115899,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Lao\" --yes"
  },
  {
   "priority": 891,
   "id": "eng-mlt-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0809,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.0011559,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 892,
   "id": "eng-mya-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0492,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00115584,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Burmese\" --yes"
  },
  {
   "priority": 893,
   "id": "eng-fry-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0838,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00115531,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model google/gemini-3.5-flash --target-lang \"Western Frisian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 894,
   "id": "eng-fry-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0838,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00115531,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model google/gemini-3.5-flash --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 895,
   "id": "spa-eus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0827,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00114771,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model google/gemini-3.5-flash --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 896,
   "id": "eng-xho-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0827,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00114771,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Xhosa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 897,
   "id": "spa-eus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0827,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00114771,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model google/gemini-3.5-flash --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 898,
   "id": "eng-bos-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0401,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00114559,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Bosnian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 899,
   "id": "eng-war-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0822,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00114473,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Waray\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 900,
   "id": "dan-cmn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1279,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0011424,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 901,
   "id": "dan-cmn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1279,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0011424,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 902,
   "id": "eng-haw-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1007,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00114047,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Hawaiian\" --yes"
  },
  {
   "priority": 903,
   "id": "spa-kor-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0816,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.0011399,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 904,
   "id": "spa-kor-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0816,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.0011399,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 905,
   "id": "fra-tgl-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0805,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00113224,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model google/gemini-3.5-flash --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 906,
   "id": "eng-amh-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0805,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00113224,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Amharic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 907,
   "id": "fra-tgl-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0805,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00113224,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model google/gemini-3.5-flash --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 908,
   "id": "eng-hau-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0878,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00112497,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 909,
   "id": "deu-ceb-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0794,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00112437,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model google/gemini-3.5-flash --target-lang \"Cebuano\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 910,
   "id": "nld-cat-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0794,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00112437,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 911,
   "id": "deu-ceb-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0794,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00112437,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model google/gemini-3.5-flash --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 912,
   "id": "nld-cat-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0794,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00112437,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 913,
   "id": "deu-vie-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1301,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00112309,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 914,
   "id": "deu-vie-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1301,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00112309,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 915,
   "id": "eng-tuk-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1301,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00112309,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Turkmen\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 916,
   "id": "eng-por-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1304,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0011205,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Portuguese (Brazilian)\" --yes"
  },
  {
   "priority": 917,
   "id": "eng-tel-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0783,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00111628,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Telugu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 918,
   "id": "deu-por-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1314,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00111197,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model google/gemini-3.1-pro-preview --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 919,
   "id": "eng-glg-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1314,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00111197,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model google/gemini-3.1-pro-preview --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 920,
   "id": "spa-ita-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1314,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00111197,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model google/gemini-3.1-pro-preview --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 921,
   "id": "deu-por-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1314,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00111197,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model google/gemini-3.1-pro-preview --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 922,
   "id": "eng-glg-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1314,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00111197,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model google/gemini-3.1-pro-preview --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 923,
   "id": "spa-ita-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1314,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00111197,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model google/gemini-3.1-pro-preview --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 924,
   "id": "ita-mlt-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0805,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00111082,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 925,
   "id": "eng-ltz-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0772,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00110834,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model google/gemini-3.5-flash --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 926,
   "id": "por-tgl-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0772,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00110834,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model google/gemini-3.5-flash --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 927,
   "id": "eng-ltz-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0772,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00110834,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model google/gemini-3.5-flash --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 928,
   "id": "por-tgl-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0772,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00110834,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model google/gemini-3.5-flash --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 929,
   "id": "spa-vie-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.076,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.001102,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 930,
   "id": "spa-vie-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.076,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.001102,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 931,
   "id": "spa-glg-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1287,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00110147,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model google/gemini-3.1-pro-preview --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 932,
   "id": "spa-glg-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1287,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00110147,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model google/gemini-3.1-pro-preview --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 933,
   "id": "eng-eus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00110108,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 934,
   "id": "jpn-kor-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00110108,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 935,
   "id": "nld-rus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00110108,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 936,
   "id": "eng-eus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00110108,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 937,
   "id": "jpn-kor-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00110108,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 938,
   "id": "nld-rus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00110108,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 939,
   "id": "eng-tel-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0795,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00109943,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model openai/gpt-5.5 --target-lang \"Telugu\" --yes"
  },
  {
   "priority": 940,
   "id": "rus-uzb-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.032,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00109768,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 941,
   "id": "por-cat-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0749,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.001094,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 942,
   "id": "eng-lao-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0749,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.001094,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Lao\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 943,
   "id": "eng-pan-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0749,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.001094,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Panjabi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 944,
   "id": "por-cat-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0749,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.001094,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 945,
   "id": "deu-cat-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.134,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0010904,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 946,
   "id": "deu-cat-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.134,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0010904,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 947,
   "id": "jpn-eus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1345,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00108634,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model google/gemini-3.5-flash --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 948,
   "id": "rus-eus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1345,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00108634,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model google/gemini-3.5-flash --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 949,
   "id": "jpn-eus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1345,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00108634,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model google/gemini-3.5-flash --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 950,
   "id": "rus-eus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1345,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00108634,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model google/gemini-3.5-flash --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 951,
   "id": "nld-jpn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1353,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00107992,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 952,
   "id": "nld-jpn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1353,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00107992,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 953,
   "id": "deu-ita-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1356,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00107753,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model google/gemini-3.5-flash --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 954,
   "id": "deu-ita-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1356,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00107753,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model google/gemini-3.5-flash --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 955,
   "id": "eng-pam-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0301,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00107182,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Kapampangan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 956,
   "id": "ita-nld-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1366,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00106964,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 957,
   "id": "ita-nld-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1366,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00106964,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 958,
   "id": "por-cmn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0716,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00106932,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 959,
   "id": "por-cmn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0716,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00106932,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 960,
   "id": "eng-arb-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.137,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00106652,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Standard Arabic\" --yes"
  },
  {
   "priority": 961,
   "id": "eng-mal-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0445,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00106515,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Malayalam\" --yes"
  },
  {
   "priority": 962,
   "id": "ita-dan-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1196,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00106042,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 963,
   "id": "nld-cmn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1196,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00106042,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 964,
   "id": "ita-dan-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1196,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00106042,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 965,
   "id": "nld-cmn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1196,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00106042,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 966,
   "id": "deu-jpn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1378,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00106033,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 967,
   "id": "ita-jpn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1378,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00106033,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 968,
   "id": "por-rus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1378,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00106033,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 969,
   "id": "deu-jpn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1378,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00106033,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 970,
   "id": "ita-jpn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1378,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00106033,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 971,
   "id": "por-rus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1378,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00106033,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 972,
   "id": "cmn-vie-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1379,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00105956,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 973,
   "id": "cmn-vie-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1379,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00105956,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 974,
   "id": "eng-spa-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1383,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0010565,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Spanish\" --yes"
  },
  {
   "priority": 975,
   "id": "fra-dan-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1182,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00105543,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 976,
   "id": "fra-dan-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1182,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00105543,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 977,
   "id": "ita-cat-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0694,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00105269,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 978,
   "id": "ita-cat-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0694,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00105269,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 979,
   "id": "eng-fra-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.139,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00105118,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"French\" --yes"
  },
  {
   "priority": 980,
   "id": "eng-tgl-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.139,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00105118,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Filipino\" --yes"
  },
  {
   "priority": 981,
   "id": "eng-zsm-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0928,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00104862,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Standard Malay\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 982,
   "id": "eng-fao-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0683,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.0010444,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model google/gemini-3.5-flash --target-lang \"Faroese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 983,
   "id": "eng-fao-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0683,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.0010444,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model google/gemini-3.5-flash --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 984,
   "id": "ita-cmn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.14,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00104367,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 985,
   "id": "ita-cmn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.14,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00104367,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 986,
   "id": "fra-vie-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1156,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00104352,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 987,
   "id": "por-dan-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1156,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00104352,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 988,
   "id": "fra-vie-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1156,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00104352,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 989,
   "id": "por-dan-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1156,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00104352,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 990,
   "id": "fra-ltz-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0833,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00104191,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 991,
   "id": "eng-ceb-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0828,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00104115,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 992,
   "id": "deu-tgl-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1406,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00103921,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model google/gemini-3.1-pro-preview --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 993,
   "id": "deu-tgl-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1406,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00103921,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model google/gemini-3.1-pro-preview --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 994,
   "id": "eng-jpn-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1407,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00103847,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Japanese\" --yes"
  },
  {
   "priority": 995,
   "id": "deu-ltz-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0805,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00103822,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 996,
   "id": "spa-tgl-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0672,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00103584,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model google/gemini-3.5-flash --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 997,
   "id": "spa-tgl-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0672,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00103584,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model google/gemini-3.5-flash --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 998,
   "id": "eng-cym-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0295,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00103419,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Welsh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 999,
   "id": "eng-cat-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1419,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00102969,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1000,
   "id": "eng-cat-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1419,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00102969,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1001,
   "id": "eng-kan-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.067,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00102846,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model openai/gpt-5.5 --target-lang \"Kannada\" --yes"
  },
  {
   "priority": 1002,
   "id": "eng-pag-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0661,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00102744,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Pangasinan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1003,
   "id": "eng-zul-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1423,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0010268,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Zulu\" --yes"
  },
  {
   "priority": 1004,
   "id": "eng-tha-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1424,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00102608,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model openai/gpt-5.5 --target-lang \"Thai\" --yes"
  },
  {
   "priority": 1005,
   "id": "eng-kan-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0672,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.0010254,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Kannada\" --yes"
  },
  {
   "priority": 1006,
   "id": "eng-kan-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0672,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.0010254,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Kannada\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1007,
   "id": "deu-cmn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1117,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00102502,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1008,
   "id": "spa-cmn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1117,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00102502,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1009,
   "id": "deu-cmn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1117,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00102502,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1010,
   "id": "spa-cmn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1117,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00102502,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1011,
   "id": "cmn-jpn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1432,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00102034,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1012,
   "id": "eng-kor-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1432,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00102034,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Korean\" --yes"
  },
  {
   "priority": 1013,
   "id": "cmn-jpn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1432,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00102034,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1014,
   "id": "eng-tam-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.096,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00101975,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Tamil\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1015,
   "id": "dan-jpn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1104,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00101882,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1016,
   "id": "dan-jpn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1104,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00101882,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1017,
   "id": "spa-cat-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.109,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00101368,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1018,
   "id": "spa-cat-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.109,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00101368,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1019,
   "id": "spa-fra-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1444,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00101187,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model google/gemini-3.5-flash --target-lang \"French\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1020,
   "id": "spa-fra-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1444,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00101187,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model google/gemini-3.5-flash --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1021,
   "id": "eng-pan-dev-v1__anthropic_claude-sonnet-4.6__naive",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-sonnet-4.6",
   "condition": "naive",
   "est_cost_usd": 0.0814,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00100664,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model anthropic/claude-sonnet-4.6 --target-lang \"Panjabi\" --yes"
  },
  {
   "priority": 1022,
   "id": "spa-dan-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1458,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00100215,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1023,
   "id": "spa-dan-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1458,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00100215,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1024,
   "id": "fra-kor-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1064,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.0010011,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1025,
   "id": "fra-kor-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1064,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.0010011,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1026,
   "id": "spa-nld-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1466,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00099668,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1027,
   "id": "spa-nld-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1466,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00099668,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1028,
   "id": "eng-xho-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0954,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00099492,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Xhosa\" --yes"
  },
  {
   "priority": 1029,
   "id": "deu-kor-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1051,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00099485,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1030,
   "id": "nld-dan-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1051,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00099485,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1031,
   "id": "por-ita-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1051,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00099485,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model google/gemini-3.1-pro-preview --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1032,
   "id": "por-jpn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1051,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00099485,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1033,
   "id": "deu-kor-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1051,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00099485,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1034,
   "id": "nld-dan-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1051,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00099485,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1035,
   "id": "por-ita-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1051,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00099485,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model google/gemini-3.1-pro-preview --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1036,
   "id": "por-jpn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1051,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00099485,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1037,
   "id": "deu-nld-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1471,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00099329,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1038,
   "id": "deu-nld-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1471,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00099329,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1039,
   "id": "eng-zul-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1471,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00099329,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Zulu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1040,
   "id": "nld-ltz-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0617,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00099224,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model google/gemini-3.5-flash --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1041,
   "id": "eng-hil-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0617,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00099224,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Hiligaynon\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1042,
   "id": "nld-ltz-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0617,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00099224,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model google/gemini-3.5-flash --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1043,
   "id": "eng-spa-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1477,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00098926,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model openai/gpt-5.5 --target-lang \"Spanish\" --yes"
  },
  {
   "priority": 1044,
   "id": "por-nld-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1038,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00098845,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1045,
   "id": "por-nld-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1038,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00098845,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1046,
   "id": "deu-dan-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1485,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00098393,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1047,
   "id": "deu-rus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1485,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00098393,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1048,
   "id": "deu-dan-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1485,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00098393,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1049,
   "id": "deu-rus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1485,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00098393,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1050,
   "id": "eng-tha-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1486,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00098327,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Thai\" --yes"
  },
  {
   "priority": 1051,
   "id": "spa-deu-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1488,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00098194,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model google/gemini-3.5-flash --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1052,
   "id": "spa-deu-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1488,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00098194,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model google/gemini-3.5-flash --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1053,
   "id": "eng-que-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00097544,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model google/gemini-3.1-pro-preview --target-lang \"Quechua\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1054,
   "id": "ita-vie-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00097544,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1055,
   "id": "eng-que-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00097544,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model google/gemini-3.1-pro-preview --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1056,
   "id": "ita-vie-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00097544,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1057,
   "id": "spa-que-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1047,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00097186,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1058,
   "id": "eng-kaz-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0942,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00097099,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model openai/gpt-5.5 --target-lang \"Kazakh\" --yes"
  },
  {
   "priority": 1059,
   "id": "eng-fry-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0998,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00097009,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model google/gemini-3.1-pro-preview --target-lang \"Western Frisian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1060,
   "id": "eng-fry-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0998,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00097009,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model google/gemini-3.1-pro-preview --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1061,
   "id": "eng-por-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1507,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00096956,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model openai/gpt-5.5 --target-lang \"Portuguese (Brazilian)\" --yes"
  },
  {
   "priority": 1062,
   "id": "fra-nld-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.151,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00096764,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1063,
   "id": "fra-nld-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.151,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00096764,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1064,
   "id": "eng-mon-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0866,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00096509,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Mongolian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1065,
   "id": "spa-eus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00096361,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1066,
   "id": "eng-xho-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00096361,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Xhosa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1067,
   "id": "spa-eus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00096361,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1068,
   "id": "eng-tam-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1018,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00096165,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Tamil\" --yes"
  },
  {
   "priority": 1069,
   "id": "eng-bos-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0478,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00096105,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Bosnian\" --yes"
  },
  {
   "priority": 1070,
   "id": "fra-ita-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1521,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00096064,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model google/gemini-3.5-flash --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1071,
   "id": "fra-ita-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1521,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00096064,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model google/gemini-3.5-flash --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1072,
   "id": "eng-amh-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.095,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00095943,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Amharic\" --yes"
  },
  {
   "priority": 1073,
   "id": "dan-cmn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1524,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00095875,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1074,
   "id": "dan-cmn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1524,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00095875,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1075,
   "id": "spa-kor-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0972,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00095695,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1076,
   "id": "spa-kor-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0972,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00095695,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1077,
   "id": "spa-rus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1532,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00095374,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1078,
   "id": "spa-rus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1532,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00095374,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1079,
   "id": "fra-tgl-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0959,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00095042,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model google/gemini-3.1-pro-preview --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1080,
   "id": "eng-amh-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0959,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00095042,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Amharic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1081,
   "id": "fra-tgl-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0959,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00095042,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model google/gemini-3.1-pro-preview --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1082,
   "id": "eng-kaz-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0965,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00094784,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Kazakh\" --yes"
  },
  {
   "priority": 1083,
   "id": "deu-ceb-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0946,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00094371,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model google/gemini-3.1-pro-preview --target-lang \"Cebuano\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1084,
   "id": "nld-cat-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0946,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00094371,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1085,
   "id": "deu-ceb-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0946,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00094371,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model google/gemini-3.1-pro-preview --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1086,
   "id": "nld-cat-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0946,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00094371,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1087,
   "id": "eng-haw-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1217,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00094367,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Hawaiian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1088,
   "id": "deu-vie-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.155,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00094267,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1089,
   "id": "deu-vie-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.155,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00094267,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1090,
   "id": "eng-tuk-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.155,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00094267,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Turkmen\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1091,
   "id": "rus-dan-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1554,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00094024,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1092,
   "id": "rus-dan-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1554,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00094024,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1093,
   "id": "eng-deu-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1555,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00093964,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"German\" --yes"
  },
  {
   "priority": 1094,
   "id": "eng-tel-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0933,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00093682,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Telugu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1095,
   "id": "fra-cmn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1565,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00093363,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1096,
   "id": "spa-jpn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1565,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00093363,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1097,
   "id": "fra-cmn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1565,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00093363,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1098,
   "id": "spa-jpn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1565,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00093363,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1099,
   "id": "eng-sme-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0639,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.0009311,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Northern Sámi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1100,
   "id": "eng-ltz-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.092,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00093004,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model google/gemini-3.1-pro-preview --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1101,
   "id": "por-tgl-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.092,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00093004,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model google/gemini-3.1-pro-preview --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1102,
   "id": "eng-ltz-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.092,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00093004,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model google/gemini-3.1-pro-preview --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1103,
   "id": "por-tgl-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.092,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00093004,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model google/gemini-3.1-pro-preview --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1104,
   "id": "fra-por-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1576,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00092712,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model google/gemini-3.5-flash --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1105,
   "id": "spa-por-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1576,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00092712,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model google/gemini-3.5-flash --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1106,
   "id": "fra-por-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1576,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00092712,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model google/gemini-3.5-flash --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1107,
   "id": "spa-por-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1576,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00092712,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model google/gemini-3.5-flash --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1108,
   "id": "spa-vie-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0906,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.00092442,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1109,
   "id": "spa-vie-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0906,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.00092442,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1110,
   "id": "eng-tel-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.095,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00092005,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Telugu\" --yes"
  },
  {
   "priority": 1111,
   "id": "eng-pan-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0891,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00091965,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Panjabi\" --yes"
  },
  {
   "priority": 1112,
   "id": "por-cat-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0893,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00091759,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1113,
   "id": "eng-lao-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0893,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00091759,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Lao\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1114,
   "id": "eng-pan-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0893,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00091759,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Panjabi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1115,
   "id": "por-cat-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0893,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00091759,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1116,
   "id": "spa-que-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1109,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00091752,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Quechua\" --yes"
  },
  {
   "priority": 1117,
   "id": "deu-por-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1598,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00091435,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model openai/gpt-5.5 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1118,
   "id": "eng-glg-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1598,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00091435,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model openai/gpt-5.5 --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1119,
   "id": "spa-ita-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1598,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00091435,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model openai/gpt-5.5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1120,
   "id": "deu-por-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1598,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00091435,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model openai/gpt-5.5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1121,
   "id": "eng-glg-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1598,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00091435,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model openai/gpt-5.5 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1122,
   "id": "spa-ita-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1598,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00091435,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model openai/gpt-5.5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1123,
   "id": "eng-vie-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1601,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00091264,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes"
  },
  {
   "priority": 1124,
   "id": "jpn-eus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1603,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0009115,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1125,
   "id": "rus-eus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1603,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0009115,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1126,
   "id": "jpn-eus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1603,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0009115,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1127,
   "id": "rus-eus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1603,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0009115,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1128,
   "id": "eng-kan-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0756,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00091147,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Kannada\" --yes"
  },
  {
   "priority": 1129,
   "id": "eng-tha-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.161,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00090754,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Thai\" --yes"
  },
  {
   "priority": 1130,
   "id": "eng-eus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1614,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00090529,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model openai/gpt-5.5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1131,
   "id": "jpn-kor-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1614,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00090529,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1132,
   "id": "nld-rus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1614,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00090529,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1133,
   "id": "eng-eus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1614,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00090529,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model openai/gpt-5.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1134,
   "id": "jpn-kor-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1614,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00090529,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1135,
   "id": "nld-rus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1614,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00090529,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1136,
   "id": "spa-glg-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1566,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00090523,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model openai/gpt-5.5 --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1137,
   "id": "spa-glg-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1566,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00090523,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model openai/gpt-5.5 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1138,
   "id": "deu-ita-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1616,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00090417,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model google/gemini-3.1-pro-preview --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1139,
   "id": "deu-ita-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1616,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00090417,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model google/gemini-3.1-pro-preview --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1140,
   "id": "eng-vie-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1623,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00090027,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Vietnamese\" --yes"
  },
  {
   "priority": 1141,
   "id": "eng-urd-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0989,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00089825,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Urdu\" --yes"
  },
  {
   "priority": 1142,
   "id": "eng-vie-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1629,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00089695,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes"
  },
  {
   "priority": 1143,
   "id": "por-cmn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0854,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00089653,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1144,
   "id": "por-cmn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0854,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00089653,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1145,
   "id": "deu-cat-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0008964,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1146,
   "id": "deu-cat-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0008964,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1147,
   "id": "fra-eus-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.037,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00089485,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1148,
   "id": "eng-nld-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1637,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00089257,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes"
  },
  {
   "priority": 1149,
   "id": "deu-jpn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1642,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00088985,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1150,
   "id": "ita-jpn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1642,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00088985,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1151,
   "id": "por-rus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1642,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00088985,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1152,
   "id": "deu-jpn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1642,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00088985,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1153,
   "id": "ita-jpn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1642,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00088985,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1154,
   "id": "por-rus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1642,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00088985,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1155,
   "id": "nld-fry-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0518,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00088966,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Western Frisian\" --yes"
  },
  {
   "priority": 1156,
   "id": "eng-bos-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.0517,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00088855,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Bosnian\" --yes"
  },
  {
   "priority": 1157,
   "id": "eng-amh-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1026,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00088836,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Amharic\" --yes"
  },
  {
   "priority": 1158,
   "id": "nld-jpn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1646,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00088769,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1159,
   "id": "nld-jpn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1646,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00088769,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1160,
   "id": "por-glg-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1124,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00088396,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1161,
   "id": "ita-cat-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0828,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00088233,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1162,
   "id": "ita-cat-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0828,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00088233,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1163,
   "id": "eng-sna-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0491,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00088204,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Shona\" --yes"
  },
  {
   "priority": 1164,
   "id": "eng-uzb-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1047,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00088198,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1165,
   "id": "eng-pag-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0772,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00087971,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Pangasinan\" --yes"
  },
  {
   "priority": 1166,
   "id": "ita-nld-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1662,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00087914,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model openai/gpt-5.5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1167,
   "id": "ita-nld-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1662,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00087914,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model openai/gpt-5.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1168,
   "id": "fra-deu-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1664,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00087809,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model google/gemini-3.5-flash --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1169,
   "id": "fra-deu-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1664,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00087809,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model google/gemini-3.5-flash --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1170,
   "id": "eng-tuk-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1667,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00087651,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Turkmen\" --yes"
  },
  {
   "priority": 1171,
   "id": "ita-cmn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1668,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00087598,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1172,
   "id": "ita-cmn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1668,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00087598,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1173,
   "id": "eng-fao-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0815,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.00087525,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model google/gemini-3.1-pro-preview --target-lang \"Faroese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1174,
   "id": "eng-fao-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0815,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.00087525,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model google/gemini-3.1-pro-preview --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1175,
   "id": "eng-yor-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0739,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00087352,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Yoruba\" --yes"
  },
  {
   "priority": 1176,
   "id": "ita-dan-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1454,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00087226,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1177,
   "id": "nld-cmn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1454,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00087226,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1178,
   "id": "ita-dan-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1454,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00087226,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1179,
   "id": "nld-cmn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1454,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00087226,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1180,
   "id": "cmn-vie-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1678,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00087076,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1181,
   "id": "cmn-vie-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1678,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00087076,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1182,
   "id": "spa-tgl-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0801,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00086902,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model google/gemini-3.1-pro-preview --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1183,
   "id": "spa-tgl-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0801,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00086902,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model google/gemini-3.1-pro-preview --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1184,
   "id": "fra-hau-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1054,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00086781,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1185,
   "id": "fra-dan-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1438,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00086754,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1186,
   "id": "fra-dan-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1438,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00086754,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1187,
   "id": "eng-mal-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0547,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00086653,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Malayalam\" --yes"
  },
  {
   "priority": 1188,
   "id": "eng-fra-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1693,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00086304,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model openai/gpt-5.5 --target-lang \"French\" --yes"
  },
  {
   "priority": 1189,
   "id": "eng-yor-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0749,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00086185,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Yoruba\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1190,
   "id": "eng-pag-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0788,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00086185,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Pangasinan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1191,
   "id": "rus-cmn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1697,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00086101,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1192,
   "id": "rus-cmn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1697,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00086101,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1193,
   "id": "eng-kan-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0801,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00086026,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Kannada\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1194,
   "id": "fra-vie-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1406,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00085797,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1195,
   "id": "por-dan-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1406,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00085797,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1196,
   "id": "fra-vie-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1406,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00085797,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1197,
   "id": "por-dan-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1406,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00085797,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1198,
   "id": "fra-jpn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1708,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00085546,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1199,
   "id": "fra-jpn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1708,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00085546,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1200,
   "id": "deu-tgl-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.171,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00085446,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model openai/gpt-5.5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1201,
   "id": "deu-tgl-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.171,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00085446,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model openai/gpt-5.5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1202,
   "id": "eng-cmn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1712,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00085347,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes"
  },
  {
   "priority": 1203,
   "id": "spa-fra-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1721,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.000849,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model google/gemini-3.1-pro-preview --target-lang \"French\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1204,
   "id": "spa-fra-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1721,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.000849,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model google/gemini-3.1-pro-preview --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1205,
   "id": "eng-cat-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1726,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00084654,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1206,
   "id": "eng-cat-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1726,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00084654,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1207,
   "id": "ita-rus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.173,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00084459,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1208,
   "id": "ita-rus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.173,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00084459,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1209,
   "id": "por-glg-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1178,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00084344,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Galician\" --yes"
  },
  {
   "priority": 1210,
   "id": "deu-cmn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1358,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00084311,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1211,
   "id": "spa-cmn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1358,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00084311,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1212,
   "id": "deu-cmn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1358,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00084311,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1213,
   "id": "spa-cmn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1358,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00084311,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1214,
   "id": "eng-sin-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.076,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00084169,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Sinhala\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1215,
   "id": "eng-hil-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0729,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.0008398,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Hiligaynon\" --yes"
  },
  {
   "priority": 1216,
   "id": "cmn-jpn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1742,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00083877,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1217,
   "id": "cmn-jpn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1742,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00083877,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1218,
   "id": "dan-jpn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1342,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00083814,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1219,
   "id": "dan-jpn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1342,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00083814,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1220,
   "id": "eng-pag-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0812,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00083637,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Pangasinan\" --yes"
  },
  {
   "priority": 1221,
   "id": "spa-nld-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1747,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00083637,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1222,
   "id": "spa-nld-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1747,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00083637,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1223,
   "id": "eng-sna-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0518,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00083606,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Shona\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1224,
   "id": "eng-kor-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1749,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00083541,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model openai/gpt-5.5 --target-lang \"Korean\" --yes"
  },
  {
   "priority": 1225,
   "id": "eng-mal-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0568,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00083449,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model openai/gpt-5.5 --target-lang \"Malayalam\" --yes"
  },
  {
   "priority": 1226,
   "id": "spa-cat-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1326,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00083327,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1227,
   "id": "spa-cat-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1326,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00083327,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1228,
   "id": "nld-ltz-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0736,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00083181,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model google/gemini-3.1-pro-preview --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1229,
   "id": "eng-hil-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0736,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00083181,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Hiligaynon\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1230,
   "id": "nld-ltz-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0736,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00083181,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model google/gemini-3.1-pro-preview --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1231,
   "id": "deu-eus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1763,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00082878,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model google/gemini-3.5-flash --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1232,
   "id": "deu-eus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1763,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00082878,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model google/gemini-3.5-flash --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1233,
   "id": "eng-por-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1767,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0008269,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Portuguese (Brazilian)\" --yes"
  },
  {
   "priority": 1234,
   "id": "fra-cat-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.0358,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00082525,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1235,
   "id": "spa-dan-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1773,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0008241,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1236,
   "id": "spa-dan-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1773,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0008241,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1237,
   "id": "rus-vie-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1774,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00082364,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1238,
   "id": "spa-deu-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1774,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00082364,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model google/gemini-3.1-pro-preview --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1239,
   "id": "rus-vie-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1774,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00082364,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1240,
   "id": "spa-deu-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1774,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00082364,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model google/gemini-3.1-pro-preview --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1241,
   "id": "fra-kor-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1294,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.00082316,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1242,
   "id": "fra-kor-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1294,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.00082316,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1243,
   "id": "deu-kor-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1278,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00081814,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1244,
   "id": "nld-dan-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1278,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00081814,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1245,
   "id": "por-ita-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1278,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00081814,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model openai/gpt-5.5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1246,
   "id": "por-jpn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1278,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00081814,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1247,
   "id": "deu-kor-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1278,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00081814,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1248,
   "id": "nld-dan-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1278,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00081814,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1249,
   "id": "por-ita-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1278,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00081814,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model openai/gpt-5.5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1250,
   "id": "por-jpn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1278,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00081814,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1251,
   "id": "deu-nld-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1789,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00081673,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model openai/gpt-5.5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1252,
   "id": "deu-nld-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1789,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00081673,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model openai/gpt-5.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1253,
   "id": "eng-zul-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1789,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00081673,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model openai/gpt-5.5 --target-lang \"Zulu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1254,
   "id": "spa-que-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1248,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00081533,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1255,
   "id": "fra-rus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1797,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0008131,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1256,
   "id": "fra-rus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1797,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0008131,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1257,
   "id": "por-nld-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1262,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.000813,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model openai/gpt-5.5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1258,
   "id": "por-nld-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1262,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.000813,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model openai/gpt-5.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1259,
   "id": "fra-nld-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.18,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00081174,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1260,
   "id": "fra-nld-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.18,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00081174,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1261,
   "id": "eng-hil-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0755,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00081088,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Hiligaynon\" --yes"
  },
  {
   "priority": 1262,
   "id": "eng-tir-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0595,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00081087,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Tigrinya\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1263,
   "id": "eng-kan-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.0851,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00080972,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Kannada\" --yes"
  },
  {
   "priority": 1264,
   "id": "deu-dan-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1805,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080949,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1265,
   "id": "deu-rus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1805,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080949,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1266,
   "id": "deu-dan-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1805,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080949,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1267,
   "id": "deu-rus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1805,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080949,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1268,
   "id": "fra-ita-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1813,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080592,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model google/gemini-3.1-pro-preview --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1269,
   "id": "fra-ita-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1813,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080592,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model google/gemini-3.1-pro-preview --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1270,
   "id": "eng-deu-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1814,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080548,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model google/gemini-3.5-flash --target-lang \"German\" --yes"
  },
  {
   "priority": 1271,
   "id": "eng-dan-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1819,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080326,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1272,
   "id": "eng-dan-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1819,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080326,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model google/gemini-3.5-flash --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1273,
   "id": "eng-guj-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1819,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080326,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Gujarati\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1274,
   "id": "eng-que-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.123,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00080255,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model openai/gpt-5.5 --target-lang \"Quechua\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1275,
   "id": "ita-vie-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.123,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00080255,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1276,
   "id": "eng-que-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.123,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00080255,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model openai/gpt-5.5 --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1277,
   "id": "ita-vie-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.123,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00080255,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1278,
   "id": "eng-arb-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1822,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080194,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Standard Arabic\" --yes"
  },
  {
   "priority": 1279,
   "id": "eng-lug-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1148,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00080184,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Ganda\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1280,
   "id": "eng-sin-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0798,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00080161,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Sinhala\" --yes"
  },
  {
   "priority": 1281,
   "id": "eng-zsm-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1215,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00080092,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Standard Malay\" --yes"
  },
  {
   "priority": 1282,
   "id": "spa-rus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1826,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080018,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1283,
   "id": "spa-rus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1826,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00080018,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1284,
   "id": "eng-fry-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1214,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00079749,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model openai/gpt-5.5 --target-lang \"Western Frisian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1285,
   "id": "eng-fry-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1214,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00079749,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model openai/gpt-5.5 --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1286,
   "id": "spa-eus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1198,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00079228,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model openai/gpt-5.5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1287,
   "id": "eng-xho-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1198,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00079228,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model openai/gpt-5.5 --target-lang \"Xhosa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1288,
   "id": "spa-eus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1198,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00079228,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model openai/gpt-5.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1289,
   "id": "ita-mlt-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1129,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00079204,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1290,
   "id": "rus-dan-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1852,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078895,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1291,
   "id": "rus-jpn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1852,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078895,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1292,
   "id": "rus-dan-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1852,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078895,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1293,
   "id": "rus-jpn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1852,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078895,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1294,
   "id": "dan-cmn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1853,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078852,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1295,
   "id": "dan-cmn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1853,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078852,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1296,
   "id": "eng-nld-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1856,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078725,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes"
  },
  {
   "priority": 1297,
   "id": "eng-pan-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1041,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00078713,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Panjabi\" --yes"
  },
  {
   "priority": 1298,
   "id": "spa-kor-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1182,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00078694,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1299,
   "id": "spa-kor-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1182,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00078694,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1300,
   "id": "eng-tam-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1245,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00078631,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Tamil\" --yes"
  },
  {
   "priority": 1301,
   "id": "eng-nld-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1862,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078471,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model openai/gpt-5.5 --target-lang \"Dutch\" --yes"
  },
  {
   "priority": 1302,
   "id": "eng-ibo-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0386,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00078356,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Igbo\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1303,
   "id": "fra-cmn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1866,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078303,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1304,
   "id": "spa-jpn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1866,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078303,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1305,
   "id": "fra-cmn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1866,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078303,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1306,
   "id": "spa-jpn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1866,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00078303,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1307,
   "id": "eng-urd-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1135,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.0007827,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Urdu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1308,
   "id": "eng-ilo-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.114,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00078183,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Ilocano\" --yes"
  },
  {
   "priority": 1309,
   "id": "fra-tgl-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1166,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00078169,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model openai/gpt-5.5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1310,
   "id": "eng-amh-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1166,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00078169,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model openai/gpt-5.5 --target-lang \"Amharic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1311,
   "id": "fra-tgl-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1166,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00078169,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model openai/gpt-5.5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1312,
   "id": "eng-sme-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0762,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00078081,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Northern Sámi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1313,
   "id": "dan-fao-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1852,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00077948,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1314,
   "id": "eng-guj-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1879,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00077761,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Gujarati\" --yes"
  },
  {
   "priority": 1315,
   "id": "fra-por-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1879,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00077761,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model google/gemini-3.1-pro-preview --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1316,
   "id": "spa-por-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1879,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00077761,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model google/gemini-3.1-pro-preview --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1317,
   "id": "fra-por-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1879,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00077761,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model google/gemini-3.1-pro-preview --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1318,
   "id": "spa-por-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1879,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00077761,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model google/gemini-3.1-pro-preview --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1319,
   "id": "deu-ceb-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.115,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00077631,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model openai/gpt-5.5 --target-lang \"Cebuano\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1320,
   "id": "nld-cat-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.115,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00077631,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1321,
   "id": "deu-ceb-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.115,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00077631,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model openai/gpt-5.5 --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1322,
   "id": "nld-cat-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.115,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00077631,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1323,
   "id": "deu-vie-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1885,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00077514,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1324,
   "id": "deu-vie-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1885,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00077514,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1325,
   "id": "eng-tuk-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1885,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00077514,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model openai/gpt-5.5 --target-lang \"Turkmen\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1326,
   "id": "eng-tel-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1134,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00077077,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model openai/gpt-5.5 --target-lang \"Telugu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1327,
   "id": "eng-ilo-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1157,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00077035,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Ilocano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1328,
   "id": "eng-hau-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.129,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00076568,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 1329,
   "id": "eng-ltz-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1118,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00076533,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model openai/gpt-5.5 --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1330,
   "id": "por-tgl-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1118,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00076533,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model openai/gpt-5.5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1331,
   "id": "eng-ltz-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1118,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00076533,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model openai/gpt-5.5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1332,
   "id": "por-tgl-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1118,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00076533,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model openai/gpt-5.5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1333,
   "id": "eng-xho-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1244,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00076298,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model openai/gpt-5.5 --target-lang \"Xhosa\" --yes"
  },
  {
   "priority": 1334,
   "id": "eng-guj-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1919,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0007614,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model openai/gpt-5.5 --target-lang \"Gujarati\" --yes"
  },
  {
   "priority": 1335,
   "id": "spa-vie-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.00076,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1336,
   "id": "spa-vie-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.00076,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1337,
   "id": "eng-tuk-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1925,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00075903,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Turkmen\" --yes"
  },
  {
   "priority": 1338,
   "id": "por-cat-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1086,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00075452,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1339,
   "id": "eng-lao-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1086,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00075452,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model openai/gpt-5.5 --target-lang \"Lao\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1340,
   "id": "eng-pan-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1086,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00075452,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model openai/gpt-5.5 --target-lang \"Panjabi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1341,
   "id": "por-cat-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1086,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00075452,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1342,
   "id": "eng-sin-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0852,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.0007508,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model openai/gpt-5.5 --target-lang \"Sinhala\" --yes"
  },
  {
   "priority": 1343,
   "id": "eng-spa-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1948,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00075007,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Spanish\" --yes"
  },
  {
   "priority": 1344,
   "id": "jpn-eus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1949,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00074968,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model openai/gpt-5.5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1345,
   "id": "rus-eus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1949,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00074968,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model openai/gpt-5.5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1346,
   "id": "jpn-eus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1949,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00074968,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model openai/gpt-5.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1347,
   "id": "rus-eus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1949,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00074968,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model openai/gpt-5.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1348,
   "id": "eng-tel-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1166,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.00074961,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Telugu\" --yes"
  },
  {
   "priority": 1349,
   "id": "cmn-kor-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1951,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00074892,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1350,
   "id": "cmn-kor-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1951,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00074892,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1351,
   "id": "eng-zsm-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1302,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.0007474,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model openai/gpt-5.5 --target-lang \"Standard Malay\" --yes"
  },
  {
   "priority": 1352,
   "id": "eng-tam-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.131,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.0007473,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model openai/gpt-5.5 --target-lang \"Tamil\" --yes"
  },
  {
   "priority": 1353,
   "id": "deu-ltz-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1123,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00074423,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1354,
   "id": "deu-ita-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1965,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00074358,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model openai/gpt-5.5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1355,
   "id": "deu-ita-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1965,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00074358,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model openai/gpt-5.5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1356,
   "id": "por-glg-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.134,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00074147,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1357,
   "id": "por-cmn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1039,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.0007369,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1358,
   "id": "por-cmn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1039,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.0007369,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1359,
   "id": "fra-deu-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1984,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00073646,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model google/gemini-3.1-pro-preview --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1360,
   "id": "fra-deu-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1984,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00073646,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model google/gemini-3.1-pro-preview --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1361,
   "id": "eng-bos-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0624,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00073619,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Bosnian\" --yes"
  },
  {
   "priority": 1362,
   "id": "eng-cym-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0415,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00073514,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Welsh\" --yes"
  },
  {
   "priority": 1363,
   "id": "eng-kaz-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1245,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00073467,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1364,
   "id": "deu-por-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1992,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0007335,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model anthropic/claude-fable-5 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1365,
   "id": "eng-fra-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1992,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0007335,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model google/gemini-3.5-flash --target-lang \"French\" --yes"
  },
  {
   "priority": 1366,
   "id": "eng-glg-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1992,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0007335,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model anthropic/claude-fable-5 --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1367,
   "id": "spa-ita-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1992,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0007335,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model anthropic/claude-fable-5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1368,
   "id": "deu-por-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-deu-por-dev",
   "corpus_file": "datasets/curated/deu-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1992,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0007335,
   "run_command": "mt-eval run --corpus tatoeba-deu-por-dev --model anthropic/claude-fable-5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1369,
   "id": "eng-glg-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-eng-glg-dev",
   "corpus_file": "datasets/curated/eng-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1992,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0007335,
   "run_command": "mt-eval run --corpus tatoeba-eng-glg-dev --model anthropic/claude-fable-5 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1370,
   "id": "spa-ita-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-spa-ita-dev",
   "corpus_file": "datasets/curated/spa-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 100,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1992,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0007335,
   "run_command": "mt-eval run --corpus tatoeba-spa-ita-dev --model anthropic/claude-fable-5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1371,
   "id": "deu-jpn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1997,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00073166,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1372,
   "id": "ita-jpn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1997,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00073166,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1373,
   "id": "por-rus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1997,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00073166,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1374,
   "id": "deu-jpn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1997,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00073166,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1375,
   "id": "ita-jpn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1997,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00073166,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1376,
   "id": "por-rus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1997,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00073166,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1377,
   "id": "eng-mal-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.065,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00072922,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Malayalam\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1378,
   "id": "eng-mal-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0651,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.0007281,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Malayalam\" --yes"
  },
  {
   "priority": 1379,
   "id": "spa-glg-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1952,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00072623,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model anthropic/claude-fable-5 --target-lang \"Galician\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1380,
   "id": "spa-glg-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-spa-glg-dev",
   "corpus_file": "datasets/curated/spa-glg-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 98,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1952,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4851,
   "predicted_effective": 0.2425,
   "expected_mesh_gain": 0.00014176,
   "ecv_per_usd": 0.00072623,
   "run_command": "mt-eval run --corpus tatoeba-spa-glg-dev --model anthropic/claude-fable-5 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1381,
   "id": "eng-eus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072621,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1382,
   "id": "jpn-kor-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072621,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1383,
   "id": "nld-rus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072621,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1384,
   "id": "eng-eus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-eng-eus-dev",
   "corpus_file": "datasets/curated/eng-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072621,
   "run_command": "mt-eval run --corpus tatoeba-eng-eus-dev --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1385,
   "id": "jpn-kor-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "jpn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-jpn-kor-dev",
   "corpus_file": "datasets/curated/jpn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072621,
   "run_command": "mt-eval run --corpus tatoeba-jpn-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1386,
   "id": "nld-rus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "nld>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-nld-rus-dev",
   "corpus_file": "datasets/curated/nld-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 101,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072621,
   "run_command": "mt-eval run --corpus tatoeba-nld-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1387,
   "id": "ita-cat-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1007,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00072549,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1388,
   "id": "ita-cat-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1007,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00072549,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model openai/gpt-5.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1389,
   "id": "eng-hil-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0844,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00072537,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model openai/gpt-5.5 --target-lang \"Hiligaynon\" --yes"
  },
  {
   "priority": 1390,
   "id": "eng-ceb-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.119,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00072443,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Cebuano\" --yes"
  },
  {
   "priority": 1391,
   "id": "eng-yor-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0893,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00072288,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Yoruba\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1392,
   "id": "eng-jpn-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.2023,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072226,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes"
  },
  {
   "priority": 1393,
   "id": "rus-cmn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2023,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072226,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1394,
   "id": "rus-cmn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2023,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072226,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1395,
   "id": "nld-fry-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0639,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00072119,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1396,
   "id": "eng-ita-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.2028,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072048,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model google/gemini-3.5-flash --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1397,
   "id": "eng-ita-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2028,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072048,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model google/gemini-3.5-flash --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1398,
   "id": "ita-cmn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2029,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072013,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1399,
   "id": "ita-cmn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2029,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00072013,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1400,
   "id": "eng-fao-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0991,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.0007198,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model openai/gpt-5.5 --target-lang \"Faroese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1401,
   "id": "eng-fao-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0991,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.0007198,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model openai/gpt-5.5 --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1402,
   "id": "deu-cat-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2032,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00071906,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1403,
   "id": "deu-cat-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-deu-cat-dev",
   "corpus_file": "datasets/curated/deu-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2032,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00071906,
   "run_command": "mt-eval run --corpus tatoeba-deu-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1404,
   "id": "eng-yor-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0899,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00071805,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model openai/gpt-5.5 --target-lang \"Yoruba\" --yes"
  },
  {
   "priority": 1405,
   "id": "fra-jpn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2036,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00071765,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1406,
   "id": "fra-jpn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2036,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00071765,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1407,
   "id": "spa-tgl-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0975,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00071393,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model openai/gpt-5.5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1408,
   "id": "spa-tgl-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0975,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00071393,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model openai/gpt-5.5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1409,
   "id": "eng-rus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.205,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00071275,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1410,
   "id": "eng-rus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.205,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00071275,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model google/gemini-3.5-flash --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1411,
   "id": "nld-jpn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2052,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00071205,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1412,
   "id": "nld-jpn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "nld>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-nld-jpn-dev",
   "corpus_file": "datasets/curated/nld-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 103,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2052,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00071205,
   "run_command": "mt-eval run --corpus tatoeba-nld-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1413,
   "id": "ita-rus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2063,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00070826,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1414,
   "id": "ita-rus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2063,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00070826,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1415,
   "id": "eng-pag-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0959,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00070817,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model openai/gpt-5.5 --target-lang \"Pangasinan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1416,
   "id": "eng-kan-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0975,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00070674,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model openai/gpt-5.5 --target-lang \"Kannada\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1417,
   "id": "fra-ltz-dev-v1__anthropic_claude-opus-4.8__coached",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "anthropic/claude-opus-4.8",
   "condition": "coached",
   "est_cost_usd": 0.1229,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00070619,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1418,
   "id": "eng-sin-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0906,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00070605,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Sinhala\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1419,
   "id": "ita-nld-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2072,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00070518,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1420,
   "id": "ita-nld-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "ita>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-ita-nld-dev",
   "corpus_file": "datasets/curated/ita-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 104,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2072,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00070518,
   "run_command": "mt-eval run --corpus tatoeba-ita-nld-dev --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1421,
   "id": "eng-sna-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0617,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00070191,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Shona\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1422,
   "id": "eng-tir-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0688,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00070126,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Tigrinya\" --yes"
  },
  {
   "priority": 1423,
   "id": "eng-mya-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0812,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00070034,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Burmese\" --yes"
  },
  {
   "priority": 1424,
   "id": "eng-pam-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0461,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00069982,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Kapampangan\" --yes"
  },
  {
   "priority": 1425,
   "id": "ita-dan-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1813,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00069954,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1426,
   "id": "nld-cmn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1813,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00069954,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1427,
   "id": "ita-dan-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "ita>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-ita-dan-dev",
   "corpus_file": "datasets/curated/ita-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1813,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00069954,
   "run_command": "mt-eval run --corpus tatoeba-ita-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1428,
   "id": "nld-cmn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "nld>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-nld-cmn-dev",
   "corpus_file": "datasets/curated/nld-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 91,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1813,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.434,
   "predicted_effective": 0.217,
   "expected_mesh_gain": 0.00012683,
   "ecv_per_usd": 0.00069954,
   "run_command": "mt-eval run --corpus tatoeba-nld-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1429,
   "id": "eng-kaz-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1308,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00069929,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Kazakh\" --yes"
  },
  {
   "priority": 1430,
   "id": "cmn-vie-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2092,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00069844,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1431,
   "id": "cmn-vie-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "cmn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-cmn-vie-dev",
   "corpus_file": "datasets/curated/cmn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2092,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00069844,
   "run_command": "mt-eval run --corpus tatoeba-cmn-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1432,
   "id": "spa-fra-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2093,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00069811,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model openai/gpt-5.5 --target-lang \"French\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1433,
   "id": "spa-fra-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2093,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00069811,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model openai/gpt-5.5 --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1434,
   "id": "eng-urd-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1276,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00069621,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Urdu\" --yes"
  },
  {
   "priority": 1435,
   "id": "fra-dan-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1793,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00069577,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1436,
   "id": "fra-dan-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-fra-dan-dev",
   "corpus_file": "datasets/curated/fra-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 90,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1793,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4269,
   "predicted_effective": 0.2135,
   "expected_mesh_gain": 0.00012475,
   "ecv_per_usd": 0.00069577,
   "run_command": "mt-eval run --corpus tatoeba-fra-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1437,
   "id": "deu-eus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00069512,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1438,
   "id": "deu-eus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00069512,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1439,
   "id": "eng-arb-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2109,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00069281,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model openai/gpt-5.5 --target-lang \"Standard Arabic\" --yes"
  },
  {
   "priority": 1440,
   "id": "eng-lao-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1184,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00069206,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Lao\" --yes"
  },
  {
   "priority": 1441,
   "id": "eng-bos-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0664,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00069184,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model openai/gpt-5.5 --target-lang \"Bosnian\" --yes"
  },
  {
   "priority": 1442,
   "id": "rus-vie-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2115,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00069084,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1443,
   "id": "rus-vie-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2115,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00069084,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1444,
   "id": "fra-vie-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1753,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00068814,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1445,
   "id": "por-dan-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1753,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00068814,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1446,
   "id": "fra-vie-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-fra-vie-dev",
   "corpus_file": "datasets/curated/fra-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1753,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00068814,
   "run_command": "mt-eval run --corpus tatoeba-fra-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1447,
   "id": "por-dan-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "por>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-por-dan-dev",
   "corpus_file": "datasets/curated/por-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 88,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1753,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.4128,
   "predicted_effective": 0.2064,
   "expected_mesh_gain": 0.00012063,
   "ecv_per_usd": 0.00068814,
   "run_command": "mt-eval run --corpus tatoeba-por-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1448,
   "id": "spa-nld-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2125,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00068759,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model openai/gpt-5.5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1449,
   "id": "spa-nld-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2125,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00068759,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model openai/gpt-5.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1450,
   "id": "deu-tgl-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2132,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00068533,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model anthropic/claude-fable-5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1451,
   "id": "deu-tgl-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-deu-tgl-dev",
   "corpus_file": "datasets/curated/deu-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 107,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2132,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00068533,
   "run_command": "mt-eval run --corpus tatoeba-deu-tgl-dev --model anthropic/claude-fable-5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1452,
   "id": "nld-ltz-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0895,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00068404,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model openai/gpt-5.5 --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1453,
   "id": "eng-hil-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0895,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00068404,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model openai/gpt-5.5 --target-lang \"Hiligaynon\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1454,
   "id": "nld-ltz-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0895,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00068404,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model openai/gpt-5.5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1455,
   "id": "fra-rus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2141,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00068245,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1456,
   "id": "fra-rus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2141,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00068245,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1457,
   "id": "eng-sna-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0635,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00068202,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model openai/gpt-5.5 --target-lang \"Shona\" --yes"
  },
  {
   "priority": 1458,
   "id": "eng-tir-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0709,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00068049,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Tigrinya\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1459,
   "id": "eng-ilo-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1311,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00067986,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Ilocano\" --yes"
  },
  {
   "priority": 1460,
   "id": "eng-kor-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.215,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0006796,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Korean\" --yes"
  },
  {
   "priority": 1461,
   "id": "eng-cat-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2152,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067897,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1462,
   "id": "eng-cat-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-eng-cat-dev",
   "corpus_file": "datasets/curated/eng-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 108,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2152,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067897,
   "run_command": "mt-eval run --corpus tatoeba-eng-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1463,
   "id": "eng-sin-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0944,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00067763,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Sinhala\" --yes"
  },
  {
   "priority": 1464,
   "id": "spa-deu-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2157,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067739,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model openai/gpt-5.5 --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1465,
   "id": "spa-deu-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2157,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067739,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model openai/gpt-5.5 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1466,
   "id": "eng-tgl-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2159,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067676,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model openai/gpt-5.5 --target-lang \"Filipino\" --yes"
  },
  {
   "priority": 1467,
   "id": "deu-cmn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1693,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00067628,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1468,
   "id": "spa-cmn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1693,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00067628,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1469,
   "id": "deu-cmn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-deu-cmn-dev",
   "corpus_file": "datasets/curated/deu-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1693,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00067628,
   "run_command": "mt-eval run --corpus tatoeba-deu-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1470,
   "id": "spa-cmn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-spa-cmn-dev",
   "corpus_file": "datasets/curated/spa-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 85,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1693,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3918,
   "predicted_effective": 0.1959,
   "expected_mesh_gain": 0.00011449,
   "ecv_per_usd": 0.00067628,
   "run_command": "mt-eval run --corpus tatoeba-spa-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1471,
   "id": "eng-dan-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2168,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067395,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1472,
   "id": "eng-dan-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2168,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067395,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model google/gemini-3.1-pro-preview --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1473,
   "id": "eng-guj-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2168,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067395,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Gujarati\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1474,
   "id": "cmn-jpn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2172,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067271,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1475,
   "id": "cmn-jpn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "cmn>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-cmn-jpn-dev",
   "corpus_file": "datasets/curated/cmn-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 109,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2172,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067271,
   "run_command": "mt-eval run --corpus tatoeba-cmn-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1476,
   "id": "dan-jpn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1673,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00067231,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1477,
   "id": "dan-jpn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "dan>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-dan-jpn-dev",
   "corpus_file": "datasets/curated/dan-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 84,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1673,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3849,
   "predicted_effective": 0.1925,
   "expected_mesh_gain": 0.00011248,
   "ecv_per_usd": 0.00067231,
   "run_command": "mt-eval run --corpus tatoeba-dan-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1478,
   "id": "eng-tgl-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.2174,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00067209,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Filipino\" --yes"
  },
  {
   "priority": 1479,
   "id": "spa-que-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1518,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00067031,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model openai/gpt-5.5 --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1480,
   "id": "eng-mya-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0849,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00066982,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Burmese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1481,
   "id": "rus-kor-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.2182,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066963,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1482,
   "id": "rus-kor-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2182,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066963,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model google/gemini-3.5-flash --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1483,
   "id": "eng-war-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1407,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00066878,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Waray\" --yes"
  },
  {
   "priority": 1484,
   "id": "eng-tuk-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2187,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0006681,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model openai/gpt-5.5 --target-lang \"Turkmen\" --yes"
  },
  {
   "priority": 1485,
   "id": "spa-cat-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1654,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00066802,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1486,
   "id": "spa-cat-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-spa-cat-dev",
   "corpus_file": "datasets/curated/spa-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 83,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1654,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3781,
   "predicted_effective": 0.189,
   "expected_mesh_gain": 0.00011049,
   "ecv_per_usd": 0.00066802,
   "run_command": "mt-eval run --corpus tatoeba-spa-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1487,
   "id": "fra-nld-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2189,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066749,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model openai/gpt-5.5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1488,
   "id": "fra-nld-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2189,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066749,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model openai/gpt-5.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1489,
   "id": "eng-por-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2203,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066325,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Portuguese (Brazilian)\" --yes"
  },
  {
   "priority": 1490,
   "id": "jpn-vie-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1491,
   "id": "rus-kaz-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model google/gemini-3.5-flash --target-lang \"Kazakh\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1492,
   "id": "eng-arb-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Standard Arabic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1493,
   "id": "eng-cmn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1494,
   "id": "eng-deu-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model google/gemini-3.5-flash --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1495,
   "id": "eng-fra-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model google/gemini-3.5-flash --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1496,
   "id": "eng-jpn-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1497,
   "id": "eng-kor-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1498,
   "id": "eng-nld-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1499,
   "id": "eng-por-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1500,
   "id": "eng-spa-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Spanish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1501,
   "id": "eng-tgl-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1502,
   "id": "eng-tha-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Thai\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1503,
   "id": "eng-vie-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1504,
   "id": "jpn-vie-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model google/gemini-3.5-flash --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1505,
   "id": "rus-kaz-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2204,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066295,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model google/gemini-3.5-flash --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1506,
   "id": "fra-ita-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2205,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066265,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model openai/gpt-5.5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1507,
   "id": "fra-ita-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2205,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066265,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model openai/gpt-5.5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1508,
   "id": "rus-jpn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2207,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066205,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1509,
   "id": "rus-jpn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2207,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066205,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1510,
   "id": "eng-cmn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2208,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066175,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes"
  },
  {
   "priority": 1511,
   "id": "eng-jpn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2208,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066175,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model openai/gpt-5.5 --target-lang \"Japanese\" --yes"
  },
  {
   "priority": 1512,
   "id": "spa-dan-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2211,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066085,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1513,
   "id": "spa-dan-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-spa-dan-dev",
   "corpus_file": "datasets/curated/spa-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 111,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2211,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00066085,
   "run_command": "mt-eval run --corpus tatoeba-spa-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1514,
   "id": "fra-kor-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1614,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.00065995,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1515,
   "id": "fra-kor-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-fra-kor-dev",
   "corpus_file": "datasets/curated/fra-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 81,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1614,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3645,
   "predicted_effective": 0.1822,
   "expected_mesh_gain": 0.00010652,
   "ecv_per_usd": 0.00065995,
   "run_command": "mt-eval run --corpus tatoeba-fra-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1516,
   "id": "eng-zul-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2219,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00065847,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model openai/gpt-5.5 --target-lang \"Zulu\" --yes"
  },
  {
   "priority": 1517,
   "id": "spa-rus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2221,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00065787,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1518,
   "id": "spa-rus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2221,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00065787,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1519,
   "id": "eng-mlt-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1422,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00065761,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1520,
   "id": "eng-ibo-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.046,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00065751,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Igbo\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1521,
   "id": "eng-guj-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2225,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00065669,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Gujarati\" --yes"
  },
  {
   "priority": 1522,
   "id": "deu-kor-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1594,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00065595,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1523,
   "id": "nld-dan-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1594,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00065595,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1524,
   "id": "por-ita-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1594,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00065595,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model anthropic/claude-fable-5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1525,
   "id": "por-jpn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1594,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00065595,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1526,
   "id": "deu-kor-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-deu-kor-dev",
   "corpus_file": "datasets/curated/deu-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1594,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00065595,
   "run_command": "mt-eval run --corpus tatoeba-deu-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1527,
   "id": "nld-dan-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "nld>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-nld-dan-dev",
   "corpus_file": "datasets/curated/nld-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1594,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00065595,
   "run_command": "mt-eval run --corpus tatoeba-nld-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1528,
   "id": "por-ita-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "por>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-por-ita-dev",
   "corpus_file": "datasets/curated/por-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1594,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00065595,
   "run_command": "mt-eval run --corpus tatoeba-por-ita-dev --model anthropic/claude-fable-5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1529,
   "id": "por-jpn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "por>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-por-jpn-dev",
   "corpus_file": "datasets/curated/por-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 80,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1594,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3578,
   "predicted_effective": 0.1789,
   "expected_mesh_gain": 0.00010456,
   "ecv_per_usd": 0.00065595,
   "run_command": "mt-eval run --corpus tatoeba-por-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1530,
   "id": "por-glg-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1515,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00065582,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Galician\" --yes"
  },
  {
   "priority": 1531,
   "id": "deu-nld-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2231,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00065492,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1532,
   "id": "deu-nld-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-deu-nld-dev",
   "corpus_file": "datasets/curated/deu-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2231,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00065492,
   "run_command": "mt-eval run --corpus tatoeba-deu-nld-dev --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1533,
   "id": "eng-zul-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2231,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00065492,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Zulu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1534,
   "id": "eng-vie-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2233,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00065434,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes"
  },
  {
   "priority": 1535,
   "id": "dan-fao-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2207,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.0006541,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1536,
   "id": "por-nld-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1574,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00065185,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1537,
   "id": "por-nld-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "por>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-por-nld-dev",
   "corpus_file": "datasets/curated/por-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 79,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1574,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3511,
   "predicted_effective": 0.1756,
   "expected_mesh_gain": 0.0001026,
   "ecv_per_usd": 0.00065185,
   "run_command": "mt-eval run --corpus tatoeba-por-nld-dev --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1538,
   "id": "eng-war-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1444,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00065164,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Waray\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1539,
   "id": "eng-bos-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0705,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.0006516,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Bosnian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1540,
   "id": "eng-mya-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0876,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00064917,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Burmese\" --yes"
  },
  {
   "priority": 1541,
   "id": "deu-dan-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2251,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0006491,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1542,
   "id": "deu-rus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2251,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0006491,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1543,
   "id": "deu-dan-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-deu-dan-dev",
   "corpus_file": "datasets/curated/deu-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2251,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0006491,
   "run_command": "mt-eval run --corpus tatoeba-deu-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1544,
   "id": "deu-rus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-deu-rus-dev",
   "corpus_file": "datasets/curated/deu-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2251,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0006491,
   "run_command": "mt-eval run --corpus tatoeba-deu-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1545,
   "id": "rus-dan-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2253,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00064853,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1546,
   "id": "rus-dan-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2253,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00064853,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1547,
   "id": "eng-ilo-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1379,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00064633,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Ilocano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1548,
   "id": "eng-cmn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2269,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00064395,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes"
  },
  {
   "priority": 1549,
   "id": "fra-cmn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2269,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00064395,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1550,
   "id": "spa-jpn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2269,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00064395,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1551,
   "id": "fra-cmn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2269,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00064395,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1552,
   "id": "spa-jpn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2269,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00064395,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1553,
   "id": "eng-que-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1534,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00064351,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model anthropic/claude-fable-5 --target-lang \"Quechua\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1554,
   "id": "ita-vie-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1534,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00064351,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1555,
   "id": "eng-que-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-eng-que-dev",
   "corpus_file": "datasets/curated/eng-que-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1534,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00064351,
   "run_command": "mt-eval run --corpus tatoeba-eng-que-dev --model anthropic/claude-fable-5 --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1556,
   "id": "ita-vie-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "ita>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-ita-vie-dev",
   "corpus_file": "datasets/curated/ita-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1534,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3378,
   "predicted_effective": 0.1689,
   "expected_mesh_gain": 9.871e-05,
   "ecv_per_usd": 0.00064351,
   "run_command": "mt-eval run --corpus tatoeba-ita-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1557,
   "id": "eng-hau-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1535,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00064347,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 1558,
   "id": "eng-ceb-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1342,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00064238,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Cebuano\" --yes"
  },
  {
   "priority": 1559,
   "id": "eng-tha-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2276,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00064197,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Thai\" --yes"
  },
  {
   "priority": 1560,
   "id": "eng-sme-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0927,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00064183,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model openai/gpt-5.5 --target-lang \"Northern Sámi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1561,
   "id": "eng-yor-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1007,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00064104,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Yoruba\" --yes"
  },
  {
   "priority": 1562,
   "id": "eng-uzb-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1442,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00064039,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 1563,
   "id": "eng-hau-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1543,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00064013,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1564,
   "id": "eng-fry-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1514,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00063946,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model anthropic/claude-fable-5 --target-lang \"Western Frisian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1565,
   "id": "eng-fry-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-eng-fry-dev",
   "corpus_file": "datasets/curated/eng-fry-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 76,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1514,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3313,
   "predicted_effective": 0.1656,
   "expected_mesh_gain": 9.681e-05,
   "ecv_per_usd": 0.00063946,
   "run_command": "mt-eval run --corpus tatoeba-eng-fry-dev --model anthropic/claude-fable-5 --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1566,
   "id": "fra-por-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2285,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00063945,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model openai/gpt-5.5 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1567,
   "id": "spa-por-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2285,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00063945,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model openai/gpt-5.5 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1568,
   "id": "fra-por-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2285,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00063945,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model openai/gpt-5.5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1569,
   "id": "spa-por-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2285,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00063945,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model openai/gpt-5.5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1570,
   "id": "eng-mon-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1308,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00063897,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Mongolian\" --yes"
  },
  {
   "priority": 1571,
   "id": "spa-eus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1494,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00063531,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1572,
   "id": "eng-xho-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1494,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00063531,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Xhosa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1573,
   "id": "spa-eus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-spa-eus-dev",
   "corpus_file": "datasets/curated/spa-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1494,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00063531,
   "run_command": "mt-eval run --corpus tatoeba-spa-eus-dev --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1574,
   "id": "dan-cmn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2311,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00063225,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1575,
   "id": "dan-cmn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "dan>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-dan-cmn-dev",
   "corpus_file": "datasets/curated/dan-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 116,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2311,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00063225,
   "run_command": "mt-eval run --corpus tatoeba-dan-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1576,
   "id": "spa-kor-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1474,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00063104,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1577,
   "id": "spa-kor-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-spa-kor-dev",
   "corpus_file": "datasets/curated/spa-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 74,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1474,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3183,
   "predicted_effective": 0.1592,
   "expected_mesh_gain": 9.302e-05,
   "ecv_per_usd": 0.00063104,
   "run_command": "mt-eval run --corpus tatoeba-spa-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1578,
   "id": "eng-uzb-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1465,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00063033,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 1579,
   "id": "fra-cat-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.0469,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00062994,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes"
  },
  {
   "priority": 1580,
   "id": "eng-pan-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1302,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00062934,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model openai/gpt-5.5 --target-lang \"Panjabi\" --yes"
  },
  {
   "priority": 1581,
   "id": "eng-zsm-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1548,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00062863,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Standard Malay\" --yes"
  },
  {
   "priority": 1582,
   "id": "cmn-kor-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2325,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00062844,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1583,
   "id": "cmn-kor-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2325,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00062844,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1584,
   "id": "fra-tgl-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1454,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00062686,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model anthropic/claude-fable-5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1585,
   "id": "eng-amh-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1454,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00062686,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Amharic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1586,
   "id": "fra-tgl-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-fra-tgl-dev",
   "corpus_file": "datasets/curated/fra-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1454,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00062686,
   "run_command": "mt-eval run --corpus tatoeba-fra-tgl-dev --model anthropic/claude-fable-5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1587,
   "id": "dan-fao-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2304,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00062656,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Faroese\" --yes"
  },
  {
   "priority": 1588,
   "id": "eng-pam-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0516,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00062523,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Kapampangan\" --yes"
  },
  {
   "priority": 1589,
   "id": "rus-uzb-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0562,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00062501,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1590,
   "id": "deu-ceb-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1434,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00062256,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model anthropic/claude-fable-5 --target-lang \"Cebuano\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1591,
   "id": "nld-cat-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1434,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00062256,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1592,
   "id": "deu-ceb-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-deu-ceb-dev",
   "corpus_file": "datasets/curated/deu-ceb-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1434,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00062256,
   "run_command": "mt-eval run --corpus tatoeba-deu-ceb-dev --model anthropic/claude-fable-5 --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1593,
   "id": "nld-cat-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "nld>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-nld-cat-dev",
   "corpus_file": "datasets/curated/nld-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 72,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1434,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3055,
   "predicted_effective": 0.1527,
   "expected_mesh_gain": 8.928e-05,
   "ecv_per_usd": 0.00062256,
   "run_command": "mt-eval run --corpus tatoeba-nld-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1594,
   "id": "dan-fao-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.2319,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00062251,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Faroese\" --yes"
  },
  {
   "priority": 1595,
   "id": "deu-vie-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2351,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00062149,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1596,
   "id": "deu-vie-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-deu-vie-dev",
   "corpus_file": "datasets/curated/deu-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2351,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00062149,
   "run_command": "mt-eval run --corpus tatoeba-deu-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1597,
   "id": "eng-tuk-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2351,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00062149,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Turkmen\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1598,
   "id": "por-glg-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1599,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00062137,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Galician\" --yes"
  },
  {
   "priority": 1599,
   "id": "nld-fry-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0744,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00061941,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Western Frisian\" --yes"
  },
  {
   "priority": 1600,
   "id": "eng-tel-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>tel",
   "target_language": "Telugu",
   "corpus_id": "tatoeba-eng-tel-dev",
   "corpus_file": "datasets/curated/eng-tel-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 71,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1415,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2991,
   "predicted_effective": 0.1495,
   "expected_mesh_gain": 8.741e-05,
   "ecv_per_usd": 0.0006177,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tel-dev-v1.json && mt-eval run --corpus eng-tel-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Telugu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1601,
   "id": "eng-ibo-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0491,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.000616,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Igbo\" --yes"
  },
  {
   "priority": 1602,
   "id": "eng-kaz-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1485,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00061594,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1603,
   "id": "spa-que-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1653,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00061557,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Quechua\" --yes"
  },
  {
   "priority": 1604,
   "id": "eng-sna-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0705,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.0006143,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Shona\" --yes"
  },
  {
   "priority": 1605,
   "id": "eng-sme-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0969,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00061401,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Northern Sámi\" --yes"
  },
  {
   "priority": 1606,
   "id": "eng-ltz-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1395,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00061336,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model anthropic/claude-fable-5 --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1607,
   "id": "por-tgl-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1395,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00061336,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model anthropic/claude-fable-5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1608,
   "id": "eng-ltz-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-eng-ltz-dev",
   "corpus_file": "datasets/curated/eng-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1395,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00061336,
   "run_command": "mt-eval run --corpus tatoeba-eng-ltz-dev --model anthropic/claude-fable-5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1609,
   "id": "por-tgl-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "por>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-por-tgl-dev",
   "corpus_file": "datasets/curated/por-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 70,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1395,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2928,
   "predicted_effective": 0.1464,
   "expected_mesh_gain": 8.556e-05,
   "ecv_per_usd": 0.00061336,
   "run_command": "mt-eval run --corpus tatoeba-por-tgl-dev --model anthropic/claude-fable-5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1610,
   "id": "eng-mal-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0775,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.0006116,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Malayalam\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1611,
   "id": "eng-pam-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0529,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00060986,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Kapampangan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1612,
   "id": "por-glg-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00060955,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model openai/gpt-5.5 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1613,
   "id": "spa-vie-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1375,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.00060911,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1614,
   "id": "spa-vie-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-spa-vie-dev",
   "corpus_file": "datasets/curated/spa-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1375,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2866,
   "predicted_effective": 0.1433,
   "expected_mesh_gain": 8.375e-05,
   "ecv_per_usd": 0.00060911,
   "run_command": "mt-eval run --corpus tatoeba-spa-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1615,
   "id": "eng-war-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1545,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00060904,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Waray\" --yes"
  },
  {
   "priority": 1616,
   "id": "eng-deu-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2413,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00060553,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model openai/gpt-5.5 --target-lang \"German\" --yes"
  },
  {
   "priority": 1617,
   "id": "fra-deu-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2413,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00060553,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model openai/gpt-5.5 --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1618,
   "id": "fra-deu-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2413,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00060553,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model openai/gpt-5.5 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1619,
   "id": "nld-fry-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0762,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00060478,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1620,
   "id": "por-cat-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1355,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00060473,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1621,
   "id": "eng-lao-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1355,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00060473,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Lao\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1622,
   "id": "eng-pan-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1355,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00060473,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Panjabi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1623,
   "id": "por-cat-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "por>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-por-cat-dev",
   "corpus_file": "datasets/curated/por-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1355,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00060473,
   "run_command": "mt-eval run --corpus tatoeba-por-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1624,
   "id": "eng-ita-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2417,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00060452,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model google/gemini-3.1-pro-preview --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1625,
   "id": "eng-ita-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2417,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00060452,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model google/gemini-3.1-pro-preview --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1626,
   "id": "jpn-eus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2431,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00060104,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1627,
   "id": "rus-eus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2431,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00060104,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1628,
   "id": "jpn-eus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "jpn>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-jpn-eus-dev",
   "corpus_file": "datasets/curated/jpn-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2431,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00060104,
   "run_command": "mt-eval run --corpus tatoeba-jpn-eus-dev --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1629,
   "id": "rus-eus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "rus>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-rus-eus-dev",
   "corpus_file": "datasets/curated/rus-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 122,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2431,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00060104,
   "run_command": "mt-eval run --corpus tatoeba-rus-eus-dev --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1630,
   "id": "eng-pan-dev-v1__anthropic_claude-opus-4.8__naive",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-opus-4.8",
   "condition": "naive",
   "est_cost_usd": 0.1367,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00059942,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model anthropic/claude-opus-4.8 --target-lang \"Panjabi\" --yes"
  },
  {
   "priority": 1631,
   "id": "eng-rus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2444,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00059785,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1632,
   "id": "eng-rus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2444,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00059785,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model google/gemini-3.1-pro-preview --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1633,
   "id": "eng-zsm-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1628,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00059774,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Standard Malay\" --yes"
  },
  {
   "priority": 1634,
   "id": "eng-zsm-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1631,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00059664,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Standard Malay\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1635,
   "id": "deu-ita-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.245,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00059638,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model anthropic/claude-fable-5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1636,
   "id": "deu-ita-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-deu-ita-dev",
   "corpus_file": "datasets/curated/deu-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 123,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.245,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00059638,
   "run_command": "mt-eval run --corpus tatoeba-deu-ita-dev --model anthropic/claude-fable-5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1637,
   "id": "eng-yor-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1086,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00059441,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model openai/gpt-5.5 --target-lang \"Yoruba\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1638,
   "id": "rus-cmn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.246,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00059396,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1639,
   "id": "rus-cmn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.246,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00059396,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1640,
   "id": "eng-ceb-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1455,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00059249,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1641,
   "id": "eng-lao-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1385,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00059163,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Lao\" --yes"
  },
  {
   "priority": 1642,
   "id": "eng-kor-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.247,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00059155,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes"
  },
  {
   "priority": 1643,
   "id": "por-cmn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1295,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00059122,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1644,
   "id": "por-cmn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "por>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-por-cmn-dev",
   "corpus_file": "datasets/curated/por-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 65,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1295,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.262,
   "predicted_effective": 0.131,
   "expected_mesh_gain": 7.656e-05,
   "ecv_per_usd": 0.00059122,
   "run_command": "mt-eval run --corpus tatoeba-por-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1645,
   "id": "eng-lao-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1387,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00059077,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Lao\" --yes"
  },
  {
   "priority": 1646,
   "id": "fra-jpn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2476,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00059012,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1647,
   "id": "fra-jpn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2476,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00059012,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1648,
   "id": "eng-cym-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0518,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00058897,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Welsh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1649,
   "id": "eng-guj-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2488,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00058727,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Gujarati\" --yes"
  },
  {
   "priority": 1650,
   "id": "eng-tam-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1667,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00058726,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Tamil\" --yes"
  },
  {
   "priority": 1651,
   "id": "deu-jpn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.249,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0005868,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1652,
   "id": "ita-jpn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.249,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0005868,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1653,
   "id": "por-rus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.249,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0005868,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1654,
   "id": "deu-jpn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-deu-jpn-dev",
   "corpus_file": "datasets/curated/deu-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.249,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0005868,
   "run_command": "mt-eval run --corpus tatoeba-deu-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1655,
   "id": "ita-jpn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "ita>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-ita-jpn-dev",
   "corpus_file": "datasets/curated/ita-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.249,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0005868,
   "run_command": "mt-eval run --corpus tatoeba-ita-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1656,
   "id": "por-rus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "por>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-por-rus-dev",
   "corpus_file": "datasets/curated/por-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 125,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.249,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0005868,
   "run_command": "mt-eval run --corpus tatoeba-por-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1657,
   "id": "eng-mya-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0975,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00058325,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model openai/gpt-5.5 --target-lang \"Burmese\" --yes"
  },
  {
   "priority": 1658,
   "id": "ita-rus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2508,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00058259,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1659,
   "id": "ita-rus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2508,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00058259,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1660,
   "id": "ita-cat-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1255,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00058213,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1661,
   "id": "ita-cat-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "ita>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-ita-cat-dev",
   "corpus_file": "datasets/curated/ita-cat-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 63,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1255,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.25,
   "predicted_effective": 0.125,
   "expected_mesh_gain": 7.306e-05,
   "ecv_per_usd": 0.00058213,
   "run_command": "mt-eval run --corpus tatoeba-ita-cat-dev --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1662,
   "id": "eng-lug-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1584,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00058113,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Ganda\" --yes"
  },
  {
   "priority": 1663,
   "id": "eng-tam-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1686,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00058064,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Tamil\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1664,
   "id": "eng-sin-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1102,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00058048,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model openai/gpt-5.5 --target-lang \"Sinhala\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1665,
   "id": "por-glg-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1713,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00058002,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model openai/gpt-5.5 --target-lang \"Galician\" --yes"
  },
  {
   "priority": 1666,
   "id": "fra-hau-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1581,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00057854,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 1667,
   "id": "eng-fao-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1235,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.00057759,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model anthropic/claude-fable-5 --target-lang \"Faroese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1668,
   "id": "eng-fao-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-eng-fao-dev",
   "corpus_file": "datasets/curated/eng-fao-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 62,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1235,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2441,
   "predicted_effective": 0.1221,
   "expected_mesh_gain": 7.133e-05,
   "ecv_per_usd": 0.00057759,
   "run_command": "mt-eval run --corpus tatoeba-eng-fao-dev --model anthropic/claude-fable-5 --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1669,
   "id": "ita-cmn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.253,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00057752,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1670,
   "id": "ita-cmn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "ita>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-ita-cmn-dev",
   "corpus_file": "datasets/curated/ita-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 127,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.253,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00057752,
   "run_command": "mt-eval run --corpus tatoeba-ita-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1671,
   "id": "eng-sna-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0751,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00057667,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model openai/gpt-5.5 --target-lang \"Shona\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1672,
   "id": "rus-uzb-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0612,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00057395,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 1673,
   "id": "eng-spa-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2549,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00057322,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Spanish\" --yes"
  },
  {
   "priority": 1674,
   "id": "spa-tgl-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1215,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00057291,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model anthropic/claude-fable-5 --target-lang \"Filipino\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1675,
   "id": "spa-tgl-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-spa-tgl-dev",
   "corpus_file": "datasets/curated/spa-tgl-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1215,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2382,
   "predicted_effective": 0.1191,
   "expected_mesh_gain": 6.961e-05,
   "ecv_per_usd": 0.00057291,
   "run_command": "mt-eval run --corpus tatoeba-spa-tgl-dev --model anthropic/claude-fable-5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1676,
   "id": "eng-urd-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1552,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.0005724,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model openai/gpt-5.5 --target-lang \"Urdu\" --yes"
  },
  {
   "priority": 1677,
   "id": "deu-eus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2556,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00057165,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model openai/gpt-5.5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1678,
   "id": "eng-nld-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2556,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00057165,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes"
  },
  {
   "priority": 1679,
   "id": "deu-eus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2556,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00057165,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model openai/gpt-5.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1680,
   "id": "spa-que-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1788,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00056909,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model openai/gpt-5.5 --target-lang \"Quechua\" --yes"
  },
  {
   "priority": 1681,
   "id": "eng-mlt-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1645,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00056847,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 1682,
   "id": "eng-pag-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00056831,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Pangasinan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1683,
   "id": "eng-mon-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1471,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00056816,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model openai/gpt-5.5 --target-lang \"Mongolian\" --yes"
  },
  {
   "priority": 1684,
   "id": "rus-vie-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2572,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00056809,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1685,
   "id": "rus-vie-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2572,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00056809,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1686,
   "id": "eng-kan-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1215,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00056714,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Kannada\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1687,
   "id": "eng-deu-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2585,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00056524,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"German\" --yes"
  },
  {
   "priority": 1688,
   "id": "fra-eus-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0587,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00056404,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Basque\" --yes"
  },
  {
   "priority": 1689,
   "id": "eng-mya-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1012,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00056193,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Burmese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1690,
   "id": "rus-kor-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2601,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00056176,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1691,
   "id": "rus-kor-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2601,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00056176,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1692,
   "id": "fra-rus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2604,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00056111,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1693,
   "id": "fra-rus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2604,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00056111,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1694,
   "id": "eng-uzb-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1649,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00056,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model openai/gpt-5.5 --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 1695,
   "id": "spa-fra-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.261,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055982,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model anthropic/claude-fable-5 --target-lang \"French\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1696,
   "id": "spa-fra-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-spa-fra-dev",
   "corpus_file": "datasets/curated/spa-fra-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.261,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055982,
   "run_command": "mt-eval run --corpus tatoeba-spa-fra-dev --model anthropic/claude-fable-5 --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1697,
   "id": "eng-tir-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0863,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00055906,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model openai/gpt-5.5 --target-lang \"Tigrinya\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1698,
   "id": "eng-por-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2619,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0005579,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Portuguese (Brazilian)\" --yes"
  },
  {
   "priority": 1699,
   "id": "rus-uzb-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0631,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00055667,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 1700,
   "id": "jpn-vie-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1701,
   "id": "rus-kaz-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model google/gemini-3.1-pro-preview --target-lang \"Kazakh\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1702,
   "id": "eng-arb-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Standard Arabic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1703,
   "id": "eng-cmn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1704,
   "id": "eng-deu-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1705,
   "id": "eng-fra-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1706,
   "id": "eng-jpn-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1707,
   "id": "eng-kor-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1708,
   "id": "eng-nld-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1709,
   "id": "eng-por-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1710,
   "id": "eng-spa-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Spanish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1711,
   "id": "eng-tgl-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1712,
   "id": "eng-tha-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Thai\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1713,
   "id": "eng-vie-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1714,
   "id": "jpn-vie-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model google/gemini-3.1-pro-preview --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1715,
   "id": "rus-kaz-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055599,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model google/gemini-3.1-pro-preview --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1716,
   "id": "eng-dan-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2636,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0005543,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1717,
   "id": "eng-dan-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2636,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0005543,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model openai/gpt-5.5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1718,
   "id": "eng-guj-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2636,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.0005543,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model openai/gpt-5.5 --target-lang \"Gujarati\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1719,
   "id": "eng-mlt-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1695,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.0005517,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1720,
   "id": "spa-nld-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.265,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055137,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1721,
   "id": "spa-nld-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-spa-nld-dev",
   "corpus_file": "datasets/curated/spa-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 133,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.265,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055137,
   "run_command": "mt-eval run --corpus tatoeba-spa-nld-dev --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1722,
   "id": "eng-tgl-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2651,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00055116,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Filipino\" --yes"
  },
  {
   "priority": 1723,
   "id": "eng-mon-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1521,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00054949,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Mongolian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1724,
   "id": "nld-ltz-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1116,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00054858,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model anthropic/claude-fable-5 --target-lang \"Luxembourgish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1725,
   "id": "eng-hil-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1116,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00054858,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Hiligaynon\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1726,
   "id": "nld-ltz-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "nld>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-nld-ltz-dev",
   "corpus_file": "datasets/curated/nld-ltz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1116,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.00054858,
   "run_command": "mt-eval run --corpus tatoeba-nld-ltz-dev --model anthropic/claude-fable-5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1727,
   "id": "eng-ibo-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.0553,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00054693,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Igbo\" --yes"
  },
  {
   "priority": 1728,
   "id": "eng-war-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1721,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00054676,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Waray\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1729,
   "id": "eng-bos-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0841,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00054623,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Bosnian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1730,
   "id": "rus-jpn-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2684,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00054439,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1731,
   "id": "rus-jpn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2684,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00054439,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model openai/gpt-5.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1732,
   "id": "spa-deu-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.269,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00054317,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model anthropic/claude-fable-5 --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1733,
   "id": "spa-deu-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-spa-deu-dev",
   "corpus_file": "datasets/curated/spa-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 135,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.269,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00054317,
   "run_command": "mt-eval run --corpus tatoeba-spa-deu-dev --model anthropic/claude-fable-5 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1734,
   "id": "eng-haw-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.2119,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00054198,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Hawaiian\" --yes"
  },
  {
   "priority": 1735,
   "id": "eng-ibo-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0559,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00054106,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model openai/gpt-5.5 --target-lang \"Igbo\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1736,
   "id": "eng-arb-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2712,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00053877,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Standard Arabic\" --yes"
  },
  {
   "priority": 1737,
   "id": "dan-fao-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2684,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00053785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model openai/gpt-5.5 --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1738,
   "id": "spa-que-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1893,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00053752,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Quechua\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1739,
   "id": "eng-haw-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2138,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00053716,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Hawaiian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1740,
   "id": "eng-hau-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1839,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.0005371,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1741,
   "id": "fra-nld-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2729,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00053541,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1742,
   "id": "fra-nld-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-fra-nld-dev",
   "corpus_file": "datasets/curated/fra-nld-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 137,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2729,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00053541,
   "run_command": "mt-eval run --corpus tatoeba-fra-nld-dev --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1743,
   "id": "eng-ceb-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1619,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00053247,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model openai/gpt-5.5 --target-lang \"Cebuano\" --yes"
  },
  {
   "priority": 1744,
   "id": "fra-ita-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2749,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00053151,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model anthropic/claude-fable-5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1745,
   "id": "fra-ita-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-fra-ita-dev",
   "corpus_file": "datasets/curated/fra-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2749,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00053151,
   "run_command": "mt-eval run --corpus tatoeba-fra-ita-dev --model anthropic/claude-fable-5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1746,
   "id": "eng-sme-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.112,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00053123,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Northern Sámi\" --yes"
  },
  {
   "priority": 1747,
   "id": "eng-ilo-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1678,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00053116,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model openai/gpt-5.5 --target-lang \"Ilocano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1748,
   "id": "eng-mya-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1076,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00052851,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Burmese\" --yes"
  },
  {
   "priority": 1749,
   "id": "spa-rus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2769,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00052768,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1750,
   "id": "spa-rus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-spa-rus-dev",
   "corpus_file": "datasets/curated/spa-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 139,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2769,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00052768,
   "run_command": "mt-eval run --corpus tatoeba-spa-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1751,
   "id": "eng-spa-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2784,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00052483,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Spanish\" --yes"
  },
  {
   "priority": 1752,
   "id": "rus-uzb-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.067,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00052426,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1753,
   "id": "eng-sin-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1222,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00052347,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Sinhala\" --yes"
  },
  {
   "priority": 1754,
   "id": "eng-arb-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2802,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00052146,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Standard Arabic\" --yes"
  },
  {
   "priority": 1755,
   "id": "rus-dan-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2809,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00052016,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1756,
   "id": "rus-dan-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "rus>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-rus-dan-dev",
   "corpus_file": "datasets/curated/rus-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 141,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2809,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00052016,
   "run_command": "mt-eval run --corpus tatoeba-rus-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1757,
   "id": "cmn-kor-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2828,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051667,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1758,
   "id": "cmn-kor-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2828,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051667,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1759,
   "id": "fra-cmn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2829,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051648,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1760,
   "id": "spa-jpn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2829,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051648,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1761,
   "id": "fra-cmn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-fra-cmn-dev",
   "corpus_file": "datasets/curated/fra-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2829,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051648,
   "run_command": "mt-eval run --corpus tatoeba-fra-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1762,
   "id": "spa-jpn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-spa-jpn-dev",
   "corpus_file": "datasets/curated/spa-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 142,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2829,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051648,
   "run_command": "mt-eval run --corpus tatoeba-spa-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1763,
   "id": "eng-sme-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1156,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00051468,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Northern Sámi\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1764,
   "id": "eng-tgl-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2848,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051304,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Filipino\" --yes"
  },
  {
   "priority": 1765,
   "id": "fra-por-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2849,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051286,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model anthropic/claude-fable-5 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1766,
   "id": "spa-por-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2849,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051286,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model anthropic/claude-fable-5 --target-lang \"Portuguese (Brazilian)\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1767,
   "id": "fra-por-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-fra-por-dev",
   "corpus_file": "datasets/curated/fra-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2849,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051286,
   "run_command": "mt-eval run --corpus tatoeba-fra-por-dev --model anthropic/claude-fable-5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1768,
   "id": "spa-por-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "spa>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-spa-por-dev",
   "corpus_file": "datasets/curated/spa-por-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 143,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2849,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00051286,
   "run_command": "mt-eval run --corpus tatoeba-spa-por-dev --model anthropic/claude-fable-5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1769,
   "id": "eng-mon-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.163,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00051274,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Mongolian\" --yes"
  },
  {
   "priority": 1770,
   "id": "eng-uzb-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1806,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00051132,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 1771,
   "id": "eng-pam-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0631,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00051128,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Kapampangan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1772,
   "id": "fra-cat-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.0578,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00051114,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes"
  },
  {
   "priority": 1773,
   "id": "fra-eus-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.065,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00050937,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1774,
   "id": "eng-kaz-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1805,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00050674,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model openai/gpt-5.5 --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1775,
   "id": "eng-kor-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2886,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00050628,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes"
  },
  {
   "priority": 1776,
   "id": "eng-lug-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1824,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00050467,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Ganda\" --yes"
  },
  {
   "priority": 1777,
   "id": "eng-mal-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0943,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00050264,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model openai/gpt-5.5 --target-lang \"Malayalam\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1778,
   "id": "eng-uzb-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1841,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.0005016,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1779,
   "id": "eng-fra-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2914,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00050142,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"French\" --yes"
  },
  {
   "priority": 1780,
   "id": "eng-zsm-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1944,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00050057,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Standard Malay\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1781,
   "id": "eng-tir-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0965,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00049997,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Tigrinya\" --yes"
  },
  {
   "priority": 1782,
   "id": "eng-urd-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.1783,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00049824,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Urdu\" --yes"
  },
  {
   "priority": 1783,
   "id": "eng-ceb-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1734,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00049716,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1784,
   "id": "nld-fry-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0927,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00049713,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model openai/gpt-5.5 --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1785,
   "id": "eng-ita-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.294,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00049698,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model openai/gpt-5.5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1786,
   "id": "eng-ita-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.294,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00049698,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model openai/gpt-5.5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1787,
   "id": "nld-fry-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0932,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00049447,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model openai/gpt-5.5 --target-lang \"Western Frisian\" --yes"
  },
  {
   "priority": 1788,
   "id": "eng-cym-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0617,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00049446,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Welsh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1789,
   "id": "eng-hau-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1998,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00049436,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model openai/gpt-5.5 --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 1790,
   "id": "fra-hau-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1852,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00049388,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1791,
   "id": "eng-fra-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2963,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00049313,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"French\" --yes"
  },
  {
   "priority": 1792,
   "id": "eng-rus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2972,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00049163,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1793,
   "id": "eng-rus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2972,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00049163,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model openai/gpt-5.5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1794,
   "id": "eng-cym-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.0621,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00049128,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Welsh\" --yes"
  },
  {
   "priority": 1795,
   "id": "eng-jpn-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2982,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00048998,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Japanese\" --yes"
  },
  {
   "priority": 1796,
   "id": "por-glg-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "por>glg",
   "target_language": "Galician",
   "corpus_id": "tatoeba-por-glg-dev",
   "corpus_file": "datasets/curated/por-glg-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 102,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2032,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.34,
   "predicted_effective": 0.17,
   "expected_mesh_gain": 9.936e-05,
   "ecv_per_usd": 0.00048896,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/por-glg-dev-v1.json && mt-eval run --corpus por-glg-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Galician\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1797,
   "id": "eng-ibo-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.062,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00048783,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Igbo\" --yes"
  },
  {
   "priority": 1798,
   "id": "eng-tam-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.201,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00048704,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Tamil\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1799,
   "id": "fra-deu-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3008,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00048575,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model anthropic/claude-fable-5 --target-lang \"German\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1800,
   "id": "fra-deu-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-fra-deu-dev",
   "corpus_file": "datasets/curated/fra-deu-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 151,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3008,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00048575,
   "run_command": "mt-eval run --corpus tatoeba-fra-deu-dev --model anthropic/claude-fable-5 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1801,
   "id": "eng-tha-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3013,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00048494,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Thai\" --yes"
  },
  {
   "priority": 1802,
   "id": "eng-mlt-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1952,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00047906,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model openai/gpt-5.5 --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 1803,
   "id": "eng-ibo-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0632,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00047857,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model openai/gpt-5.5 --target-lang \"Igbo\" --yes"
  },
  {
   "priority": 1804,
   "id": "eng-yor-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1355,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00047641,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Yoruba\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1805,
   "id": "rus-cmn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3068,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00047625,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1806,
   "id": "rus-cmn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "rus>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-rus-cmn-dev",
   "corpus_file": "datasets/curated/rus-cmn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 154,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3068,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00047625,
   "run_command": "mt-eval run --corpus tatoeba-rus-cmn-dev --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1807,
   "id": "eng-xho-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>xho",
   "target_language": "Xhosa",
   "corpus_id": "tatoeba-eng-xho-dev",
   "corpus_file": "datasets/curated/eng-xho-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 75,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1996,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3248,
   "predicted_effective": 0.1624,
   "expected_mesh_gain": 9.492e-05,
   "ecv_per_usd": 0.00047553,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-xho-dev-v1.json && mt-eval run --corpus eng-xho-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Xhosa\" --yes"
  },
  {
   "priority": 1808,
   "id": "fra-jpn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3088,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00047317,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1809,
   "id": "fra-jpn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-fra-jpn-dev",
   "corpus_file": "datasets/curated/fra-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 155,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3088,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00047317,
   "run_command": "mt-eval run --corpus tatoeba-fra-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1810,
   "id": "eng-tam-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2076,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00047156,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Tamil\" --yes"
  },
  {
   "priority": 1811,
   "id": "fra-cat-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.0628,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00047045,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1812,
   "id": "ita-rus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3128,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00046711,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1813,
   "id": "ita-rus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "ita>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-ita-rus-dev",
   "corpus_file": "datasets/curated/ita-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 157,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3128,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00046711,
   "run_command": "mt-eval run --corpus tatoeba-ita-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1814,
   "id": "eng-sin-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>sin",
   "target_language": "Sinhala",
   "corpus_id": "tatoeba-eng-sin-dev",
   "corpus_file": "datasets/curated/eng-sin-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 69,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1375,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2189,
   "predicted_effective": 0.1095,
   "expected_mesh_gain": 6.397e-05,
   "ecv_per_usd": 0.00046523,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sin-dev-v1.json && mt-eval run --corpus eng-sin-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Sinhala\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1815,
   "id": "eng-sna-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.0936,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00046269,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Shona\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1816,
   "id": "eng-mya-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.123,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00046234,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model openai/gpt-5.5 --target-lang \"Burmese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1817,
   "id": "rus-kor-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.3163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00046195,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1818,
   "id": "rus-kor-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3163,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00046195,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model openai/gpt-5.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1819,
   "id": "eng-mon-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.1813,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00046099,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Mongolian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1820,
   "id": "eng-zul-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>zul",
   "target_language": "Zulu",
   "corpus_id": "tatoeba-eng-zul-dev",
   "corpus_file": "datasets/curated/eng-zul-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 112,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3171,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00046078,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zul-dev-v1.json && mt-eval run --corpus eng-zul-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Zulu\" --yes"
  },
  {
   "priority": 1821,
   "id": "deu-eus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3188,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045832,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1822,
   "id": "deu-eus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-deu-eus-dev",
   "corpus_file": "datasets/curated/deu-eus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 160,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3188,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045832,
   "run_command": "mt-eval run --corpus tatoeba-deu-eus-dev --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1823,
   "id": "jpn-vie-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1824,
   "id": "rus-kaz-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model openai/gpt-5.5 --target-lang \"Kazakh\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1825,
   "id": "eng-arb-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model openai/gpt-5.5 --target-lang \"Standard Arabic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1826,
   "id": "eng-cmn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model openai/gpt-5.5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1827,
   "id": "eng-deu-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model openai/gpt-5.5 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1828,
   "id": "eng-fra-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model openai/gpt-5.5 --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1829,
   "id": "eng-jpn-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model openai/gpt-5.5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1830,
   "id": "eng-kor-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model openai/gpt-5.5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1831,
   "id": "eng-nld-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model openai/gpt-5.5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1832,
   "id": "eng-por-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model openai/gpt-5.5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1833,
   "id": "eng-spa-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model openai/gpt-5.5 --target-lang \"Spanish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1834,
   "id": "eng-tgl-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model openai/gpt-5.5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1835,
   "id": "eng-tha-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model openai/gpt-5.5 --target-lang \"Thai\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1836,
   "id": "eng-vie-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1837,
   "id": "jpn-vie-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model openai/gpt-5.5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1838,
   "id": "rus-kaz-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3195,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045732,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model openai/gpt-5.5 --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1839,
   "id": "eng-lug-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.2017,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00045638,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Ganda\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1840,
   "id": "rus-vie-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3208,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045547,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1841,
   "id": "rus-vie-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "rus>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-rus-vie-dev",
   "corpus_file": "datasets/curated/rus-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 161,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3208,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045547,
   "run_command": "mt-eval run --corpus tatoeba-rus-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1842,
   "id": "fra-ltz-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1909,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00045464,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 1843,
   "id": "eng-mlt-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2061,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00045372,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model openai/gpt-5.5 --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1844,
   "id": "ita-mlt-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1984,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00045071,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1845,
   "id": "eng-haw-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2549,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00045055,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Hawaiian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1846,
   "id": "fra-rus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3247,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1847,
   "id": "fra-rus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-fra-rus-dev",
   "corpus_file": "datasets/curated/fra-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 163,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3247,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00045,
   "run_command": "mt-eval run --corpus tatoeba-fra-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1848,
   "id": "eng-war-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2093,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00044958,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model openai/gpt-5.5 --target-lang \"Waray\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1849,
   "id": "ita-mlt-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.199,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00044935,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 1850,
   "id": "eng-mon-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.186,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00044934,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Mongolian\" --yes"
  },
  {
   "priority": 1851,
   "id": "eng-bos-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.1023,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.00044905,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model openai/gpt-5.5 --target-lang \"Bosnian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1852,
   "id": "eng-tir-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1076,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00044839,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Tigrinya\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1853,
   "id": "fra-eus-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.074,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00044742,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes"
  },
  {
   "priority": 1854,
   "id": "fra-eus-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0741,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00044682,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model openai/gpt-5.5 --target-lang \"Basque\" --yes"
  },
  {
   "priority": 1855,
   "id": "eng-deu-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3275,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00044615,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"German\" --yes"
  },
  {
   "priority": 1856,
   "id": "eng-urd-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1995,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.0004453,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Urdu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1857,
   "id": "eng-dan-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3287,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00044452,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1858,
   "id": "eng-dan-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>dan",
   "target_language": "Danish",
   "corpus_id": "tatoeba-eng-dan-dev",
   "corpus_file": "datasets/curated/eng-dan-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3287,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00044452,
   "run_command": "mt-eval run --corpus tatoeba-eng-dan-dev --model anthropic/claude-fable-5 --target-lang \"Danish\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1859,
   "id": "eng-guj-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>guj",
   "target_language": "Gujarati",
   "corpus_id": "tatoeba-eng-guj-dev",
   "corpus_file": "datasets/curated/eng-guj-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 165,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3287,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00044452,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-guj-dev-v1.json && mt-eval run --corpus eng-guj-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Gujarati\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1860,
   "id": "eng-vie-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3291,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00044398,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes"
  },
  {
   "priority": 1861,
   "id": "eng-hau-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2237,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00044154,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model openai/gpt-5.5 --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1862,
   "id": "eng-haw-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2604,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00044103,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Hawaiian\" --yes"
  },
  {
   "priority": 1863,
   "id": "rus-jpn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3347,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00043655,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1864,
   "id": "rus-jpn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "rus>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-rus-jpn-dev",
   "corpus_file": "datasets/curated/rus-jpn-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3347,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00043655,
   "run_command": "mt-eval run --corpus tatoeba-rus-jpn-dev --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1865,
   "id": "eng-yor-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>yor",
   "target_language": "Yoruba",
   "corpus_id": "tatoeba-eng-yor-dev",
   "corpus_file": "datasets/curated/eng-yor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1483,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2209,
   "predicted_effective": 0.1105,
   "expected_mesh_gain": 6.455e-05,
   "ecv_per_usd": 0.00043529,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-yor-dev-v1.json && mt-eval run --corpus eng-yor-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Yoruba\" --yes"
  },
  {
   "priority": 1866,
   "id": "eng-ibo-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>ibo",
   "target_language": "Igbo",
   "corpus_id": "tatoeba-eng-ibo-dev",
   "corpus_file": "datasets/curated/eng-ibo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 35,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.0697,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1035,
   "predicted_effective": 0.0517,
   "expected_mesh_gain": 3.025e-05,
   "ecv_per_usd": 0.00043394,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ibo-dev-v1.json && mt-eval run --corpus eng-ibo-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Igbo\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1867,
   "id": "deu-ltz-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.1929,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00043327,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 1868,
   "id": "eng-cym-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0705,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00043274,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model openai/gpt-5.5 --target-lang \"Welsh\" --yes"
  },
  {
   "priority": 1869,
   "id": "nld-fry-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1067,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.0004319,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Western Frisian\" --yes"
  },
  {
   "priority": 1870,
   "id": "dan-fao-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3347,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00043131,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Faroese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1871,
   "id": "rus-uzb-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0815,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00043099,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model openai/gpt-5.5 --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1872,
   "id": "rus-uzb-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0822,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00042732,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model openai/gpt-5.5 --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 1873,
   "id": "fra-eus-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0775,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00042722,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1874,
   "id": "fra-hau-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2142,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00042702,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 1875,
   "id": "eng-kaz-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2143,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00042682,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Kazakh\" --yes"
  },
  {
   "priority": 1876,
   "id": "eng-ilo-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2092,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00042605,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Ilocano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1877,
   "id": "eng-haw-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2698,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00042567,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model openai/gpt-5.5 --target-lang \"Hawaiian\" --yes"
  },
  {
   "priority": 1878,
   "id": "rus-uzb-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.0829,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00042371,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Uzbek\" --yes"
  },
  {
   "priority": 1879,
   "id": "deu-ltz-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.1973,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.0004236,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1880,
   "id": "eng-urd-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2105,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00042203,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Urdu\" --yes"
  },
  {
   "priority": 1881,
   "id": "eng-uzb-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2194,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00042089,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1882,
   "id": "eng-pam-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0767,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00042062,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model openai/gpt-5.5 --target-lang \"Kapampangan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1883,
   "id": "fra-hau-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2207,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00041444,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1884,
   "id": "cmn-kor-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3526,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00041439,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1885,
   "id": "cmn-kor-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "cmn>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-cmn-kor-dev",
   "corpus_file": "datasets/curated/cmn-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 177,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3526,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00041439,
   "run_command": "mt-eval run --corpus tatoeba-cmn-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1886,
   "id": "eng-jpn-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3535,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00041333,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes"
  },
  {
   "priority": 1887,
   "id": "eng-zsm-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2365,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00041147,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model openai/gpt-5.5 --target-lang \"Standard Malay\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1888,
   "id": "eng-ceb-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2109,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00040876,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model openai/gpt-5.5 --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1889,
   "id": "eng-kaz-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-eng-kaz-dev",
   "corpus_file": "datasets/curated/eng-kaz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 113,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2251,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00040634,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kaz-dev-v1.json && mt-eval run --corpus eng-kaz-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1890,
   "id": "eng-cym-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0751,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00040624,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model openai/gpt-5.5 --target-lang \"Welsh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1891,
   "id": "eng-mal-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1175,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.0004034,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Malayalam\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1892,
   "id": "eng-hil-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>hil",
   "target_language": "Hiligaynon",
   "corpus_id": "tatoeba-eng-hil-dev",
   "corpus_file": "datasets/curated/eng-hil-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 56,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1518,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2095,
   "predicted_effective": 0.1047,
   "expected_mesh_gain": 6.122e-05,
   "ecv_per_usd": 0.0004033,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hil-dev-v1.json && mt-eval run --corpus eng-hil-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Hiligaynon\" --yes"
  },
  {
   "priority": 1893,
   "id": "fra-ltz-dev-v1__google_gemini-3.5-flash__coached",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "google/gemini-3.5-flash",
   "condition": "coached",
   "est_cost_usd": 0.216,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00040181,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1894,
   "id": "eng-war-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2344,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00040144,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model openai/gpt-5.5 --target-lang \"Waray\" --yes"
  },
  {
   "priority": 1895,
   "id": "eng-tam-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2444,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00040056,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model openai/gpt-5.5 --target-lang \"Tamil\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1896,
   "id": "eng-ilo-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.223,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00039968,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model openai/gpt-5.5 --target-lang \"Ilocano\" --yes"
  },
  {
   "priority": 1897,
   "id": "eng-kan-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>kan",
   "target_language": "Kannada",
   "corpus_id": "tatoeba-eng-kan-dev",
   "corpus_file": "datasets/curated/eng-kan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 61,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1728,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2358,
   "predicted_effective": 0.1179,
   "expected_mesh_gain": 6.891e-05,
   "ecv_per_usd": 0.00039877,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kan-dev-v1.json && mt-eval run --corpus eng-kan-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Kannada\" --yes"
  },
  {
   "priority": 1898,
   "id": "nld-fry-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "nld>fry",
   "target_language": "Western Frisian",
   "corpus_id": "tatoeba-nld-fry-dev",
   "corpus_file": "datasets/curated/nld-fry-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1156,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1577,
   "predicted_effective": 0.0789,
   "expected_mesh_gain": 4.608e-05,
   "ecv_per_usd": 0.00039865,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/nld-fry-dev-v1.json && mt-eval run --corpus nld-fry-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Western Frisian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1899,
   "id": "eng-ita-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3666,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00039856,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model anthropic/claude-fable-5 --target-lang \"Italian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1900,
   "id": "eng-ita-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>ita",
   "target_language": "Italian",
   "corpus_id": "tatoeba-eng-ita-dev",
   "corpus_file": "datasets/curated/eng-ita-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 184,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3666,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00039856,
   "run_command": "mt-eval run --corpus tatoeba-eng-ita-dev --model anthropic/claude-fable-5 --target-lang \"Italian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1901,
   "id": "eng-sna-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>sna",
   "target_language": "Shona",
   "corpus_id": "tatoeba-eng-sna-dev",
   "corpus_file": "datasets/curated/eng-sna-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1091,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1482,
   "predicted_effective": 0.0741,
   "expected_mesh_gain": 4.331e-05,
   "ecv_per_usd": 0.00039696,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sna-dev-v1.json && mt-eval run --corpus eng-sna-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Shona\" --yes"
  },
  {
   "priority": 1902,
   "id": "fra-cat-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.0749,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00039445,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1903,
   "id": "eng-rus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3706,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00039426,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1904,
   "id": "eng-rus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>rus",
   "target_language": "Russian",
   "corpus_id": "tatoeba-eng-rus-dev",
   "corpus_file": "datasets/curated/eng-rus-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 186,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3706,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00039426,
   "run_command": "mt-eval run --corpus tatoeba-eng-rus-dev --model anthropic/claude-fable-5 --target-lang \"Russian\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1905,
   "id": "eng-amh-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>amh",
   "target_language": "Amharic",
   "corpus_id": "tatoeba-eng-amh-dev",
   "corpus_file": "datasets/curated/eng-amh-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 73,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.235,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3119,
   "predicted_effective": 0.156,
   "expected_mesh_gain": 9.115e-05,
   "ecv_per_usd": 0.00038785,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-amh-dev-v1.json && mt-eval run --corpus eng-amh-dev-v1.json --model openai/gpt-5.5 --target-lang \"Amharic\" --yes"
  },
  {
   "priority": 1906,
   "id": "eng-lug-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2404,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00038291,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Ganda\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1907,
   "id": "eng-mon-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2205,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00037903,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model openai/gpt-5.5 --target-lang \"Mongolian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1908,
   "id": "ita-mlt-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2365,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.0003781,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1909,
   "id": "eng-urd-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2378,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00037358,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Urdu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1910,
   "id": "eng-mya-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>mya",
   "target_language": "Burmese",
   "corpus_id": "tatoeba-eng-mya-dev",
   "corpus_file": "datasets/curated/eng-mya-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 77,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1534,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1946,
   "predicted_effective": 0.0973,
   "expected_mesh_gain": 5.687e-05,
   "ecv_per_usd": 0.00037071,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mya-dev-v1.json && mt-eval run --corpus eng-mya-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Burmese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1911,
   "id": "eng-haw-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.31,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00037047,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model openai/gpt-5.5 --target-lang \"Hawaiian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1912,
   "id": "rus-kor-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3945,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00037038,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1913,
   "id": "rus-kor-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "rus>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-rus-kor-dev",
   "corpus_file": "datasets/curated/rus-kor-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 198,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3945,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00037038,
   "run_command": "mt-eval run --corpus tatoeba-rus-kor-dev --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1914,
   "id": "jpn-vie-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1915,
   "id": "rus-kaz-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model anthropic/claude-fable-5 --target-lang \"Kazakh\" --yes",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1916,
   "id": "eng-arb-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>arb",
   "target_language": "Standard Arabic",
   "corpus_id": "tatoeba-eng-arb-dev",
   "corpus_file": "datasets/curated/eng-arb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-arb-dev-v1.json && mt-eval run --corpus eng-arb-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Standard Arabic\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1917,
   "id": "eng-cmn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>cmn",
   "target_language": "Mandarin Chinese",
   "corpus_id": "tatoeba-eng-cmn-dev",
   "corpus_file": "datasets/curated/eng-cmn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cmn-dev-v1.json && mt-eval run --corpus eng-cmn-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Mandarin Chinese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1918,
   "id": "eng-deu-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>deu",
   "target_language": "German",
   "corpus_id": "tatoeba-eng-deu-dev",
   "corpus_file": "datasets/curated/eng-deu-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-deu-dev-v1.json && mt-eval run --corpus eng-deu-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"German\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1919,
   "id": "eng-fra-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>fra",
   "target_language": "French",
   "corpus_id": "tatoeba-eng-fra-dev",
   "corpus_file": "datasets/curated/eng-fra-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-fra-dev-v1.json && mt-eval run --corpus eng-fra-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"French\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1920,
   "id": "eng-jpn-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>jpn",
   "target_language": "Japanese",
   "corpus_id": "tatoeba-eng-jpn-dev",
   "corpus_file": "datasets/curated/eng-jpn-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-jpn-dev-v1.json && mt-eval run --corpus eng-jpn-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Japanese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1921,
   "id": "eng-kor-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>kor",
   "target_language": "Korean",
   "corpus_id": "tatoeba-eng-kor-dev",
   "corpus_file": "datasets/curated/eng-kor-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-kor-dev-v1.json && mt-eval run --corpus eng-kor-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Korean\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1922,
   "id": "eng-nld-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>nld",
   "target_language": "Dutch",
   "corpus_id": "tatoeba-eng-nld-dev",
   "corpus_file": "datasets/curated/eng-nld-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-nld-dev-v1.json && mt-eval run --corpus eng-nld-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Dutch\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1923,
   "id": "eng-por-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>por",
   "target_language": "Portuguese (Brazilian)",
   "corpus_id": "tatoeba-eng-por-dev",
   "corpus_file": "datasets/curated/eng-por-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-por-dev-v1.json && mt-eval run --corpus eng-por-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Portuguese (Brazilian)\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1924,
   "id": "eng-spa-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>spa",
   "target_language": "Spanish",
   "corpus_id": "tatoeba-eng-spa-dev",
   "corpus_file": "datasets/curated/eng-spa-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-spa-dev-v1.json && mt-eval run --corpus eng-spa-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Spanish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1925,
   "id": "eng-tgl-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>tgl",
   "target_language": "Filipino",
   "corpus_id": "tatoeba-eng-tgl-dev",
   "corpus_file": "datasets/curated/eng-tgl-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tgl-dev-v1.json && mt-eval run --corpus eng-tgl-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Filipino\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1926,
   "id": "eng-tha-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>tha",
   "target_language": "Thai",
   "corpus_id": "tatoeba-eng-tha-dev",
   "corpus_file": "datasets/curated/eng-tha-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tha-dev-v1.json && mt-eval run --corpus eng-tha-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Thai\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1927,
   "id": "eng-vie-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-eng-vie-dev",
   "corpus_file": "datasets/curated/eng-vie-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-vie-dev-v1.json && mt-eval run --corpus eng-vie-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1928,
   "id": "jpn-vie-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "jpn>vie",
   "target_language": "Vietnamese",
   "corpus_id": "tatoeba-jpn-vie-dev",
   "corpus_file": "datasets/curated/jpn-vie-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "mt-eval run --corpus tatoeba-jpn-vie-dev --model anthropic/claude-fable-5 --target-lang \"Vietnamese\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1929,
   "id": "rus-kaz-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "rus>kaz",
   "target_language": "Kazakh",
   "corpus_id": "tatoeba-rus-kaz-dev",
   "corpus_file": "datasets/curated/rus-kaz-dev-v1.json",
   "corpus_url": null,
   "corpus_license": "CC-BY-2.0",
   "entry_count": 200,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3985,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00036666,
   "run_command": "mt-eval run --corpus tatoeba-rus-kaz-dev --model anthropic/claude-fable-5 --target-lang \"Kazakh\" --yes --coaching-file YOUR_COACHING.txt",
   "corpus_fetch": "fetch-from-source: corpus is not hosted in the mirror; mt-eval builds it locally from the pinned Tatoeba Challenge export and verifies the registry sha256 (run from an arena checkout)",
   "source_export_url": "https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test-v2023-09-26.tar"
  },
  {
   "priority": 1930,
   "id": "fra-cat-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.0806,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00036655,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model openai/gpt-5.5 --target-lang \"Catalan\" --yes"
  },
  {
   "priority": 1931,
   "id": "eng-mlt-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.257,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00036386,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1932,
   "id": "eng-war-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.261,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00036052,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Waray\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1933,
   "id": "eng-bos-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>bos",
   "target_language": "Bosnian",
   "corpus_id": "tatoeba-eng-bos-dev",
   "corpus_file": "datasets/curated/eng-bos-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 64,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1275,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1572,
   "predicted_effective": 0.0786,
   "expected_mesh_gain": 4.594e-05,
   "ecv_per_usd": 0.0003603,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-bos-dev-v1.json && mt-eval run --corpus eng-bos-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Bosnian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1934,
   "id": "eng-pag-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1885,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00036028,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Pangasinan\" --yes"
  },
  {
   "priority": 1935,
   "id": "deu-ltz-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2336,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00035778,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 1936,
   "id": "eng-ilo-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>ilo",
   "target_language": "Ilocano",
   "corpus_id": "tatoeba-eng-ilo-dev",
   "corpus_file": "datasets/curated/eng-ilo-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 105,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2497,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.305,
   "predicted_effective": 0.1525,
   "expected_mesh_gain": 8.913e-05,
   "ecv_per_usd": 0.00035694,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ilo-dev-v1.json && mt-eval run --corpus eng-ilo-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Ilocano\" --yes"
  },
  {
   "priority": 1937,
   "id": "eng-sme-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.167,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00035627,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Northern Sámi\" --yes"
  },
  {
   "priority": 1938,
   "id": "deu-ltz-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2352,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00035534,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1939,
   "id": "eng-hau-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2789,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00035415,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1940,
   "id": "eng-mal-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>mal",
   "target_language": "Malayalam",
   "corpus_id": "tatoeba-eng-mal-dev",
   "corpus_file": "datasets/curated/eng-mal-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.135,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1622,
   "predicted_effective": 0.0811,
   "expected_mesh_gain": 4.74e-05,
   "ecv_per_usd": 0.00035111,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mal-dev-v1.json && mt-eval run --corpus eng-mal-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Malayalam\" --yes"
  },
  {
   "priority": 1941,
   "id": "fra-eus-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0943,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00035111,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model openai/gpt-5.5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1942,
   "id": "spa-que-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "spa>que",
   "target_language": "Quechua",
   "corpus_id": "tatoeba-spa-que-dev",
   "corpus_file": "datasets/curated/spa-que-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 95,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2932,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.3482,
   "predicted_effective": 0.1741,
   "expected_mesh_gain": 0.00010175,
   "ecv_per_usd": 0.00034704,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/spa-que-dev-v1.json && mt-eval run --corpus spa-que-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Quechua\" --yes"
  },
  {
   "priority": 1943,
   "id": "eng-uzb-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2668,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00034612,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model openai/gpt-5.5 --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1944,
   "id": "fra-ltz-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2508,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00034606,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 1945,
   "id": "rus-uzb-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "rus>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-rus-uzb-dev",
   "corpus_file": "datasets/curated/rus-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 51,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1016,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1202,
   "predicted_effective": 0.0601,
   "expected_mesh_gain": 3.513e-05,
   "ecv_per_usd": 0.00034572,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/rus-uzb-dev-v1.json && mt-eval run --corpus rus-uzb-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1946,
   "id": "eng-hau-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-eng-hau-dev",
   "corpus_file": "datasets/curated/eng-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 140,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2864,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.338,
   "predicted_effective": 0.169,
   "expected_mesh_gain": 9.877e-05,
   "ecv_per_usd": 0.00034488,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-hau-dev-v1.json && mt-eval run --corpus eng-hau-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 1947,
   "id": "eng-tuk-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>tuk",
   "target_language": "Turkmen",
   "corpus_id": "tatoeba-eng-tuk-dev",
   "corpus_file": "datasets/curated/eng-tuk-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 118,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.4238,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.5,
   "predicted_effective": 0.25,
   "expected_mesh_gain": 0.00014611,
   "ecv_per_usd": 0.00034477,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tuk-dev-v1.json && mt-eval run --corpus eng-tuk-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Turkmen\" --yes"
  },
  {
   "priority": 1948,
   "id": "eng-mlt-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2713,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00034468,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 1949,
   "id": "dan-fao-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.4203,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00034347,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model openai/gpt-5.5 --target-lang \"Faroese\" --yes"
  },
  {
   "priority": 1950,
   "id": "fra-hau-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2684,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00034079,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model openai/gpt-5.5 --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1951,
   "id": "eng-pam-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.0956,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00033747,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Kapampangan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1952,
   "id": "fra-ltz-dev-v1__google_gemini-3.1-pro-preview__coached",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "coached",
   "est_cost_usd": 0.2575,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00033705,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1953,
   "id": "eng-lug-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2734,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00033669,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model openai/gpt-5.5 --target-lang \"Ganda\" --yes"
  },
  {
   "priority": 1954,
   "id": "fra-hau-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2732,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.0003348,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model openai/gpt-5.5 --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 1955,
   "id": "eng-zsm-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>zsm",
   "target_language": "Standard Malay",
   "corpus_id": "tatoeba-eng-zsm-dev",
   "corpus_file": "datasets/curated/eng-zsm-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 148,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2949,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.333,
   "predicted_effective": 0.1665,
   "expected_mesh_gain": 9.731e-05,
   "ecv_per_usd": 0.00032998,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-zsm-dev-v1.json && mt-eval run --corpus eng-zsm-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Standard Malay\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1956,
   "id": "eng-ceb-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.263,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00032778,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Cebuano\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1957,
   "id": "eng-cym-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.0936,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00032595,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Welsh\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1958,
   "id": "ita-mlt-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.2757,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00032434,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 1959,
   "id": "fra-cat-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.0911,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.0003243,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model openai/gpt-5.5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1960,
   "id": "fra-eus-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.103,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00032145,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes"
  },
  {
   "priority": 1961,
   "id": "eng-tam-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>tam",
   "target_language": "Tamil",
   "corpus_id": "tatoeba-eng-tam-dev",
   "corpus_file": "datasets/curated/eng-tam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 153,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3048,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.335,
   "predicted_effective": 0.1675,
   "expected_mesh_gain": 9.79e-05,
   "ecv_per_usd": 0.00032118,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tam-dev-v1.json && mt-eval run --corpus eng-tam-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Tamil\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1962,
   "id": "eng-ceb-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>ceb",
   "target_language": "Cebuano",
   "corpus_id": "tatoeba-eng-ceb-dev",
   "corpus_file": "datasets/curated/eng-ceb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 132,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2689,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.295,
   "predicted_effective": 0.1475,
   "expected_mesh_gain": 8.621e-05,
   "ecv_per_usd": 0.00032059,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-ceb-dev-v1.json && mt-eval run --corpus eng-ceb-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Cebuano\" --yes"
  },
  {
   "priority": 1963,
   "id": "eng-lug-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2924,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00031481,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model openai/gpt-5.5 --target-lang \"Ganda\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1964,
   "id": "dan-fao-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "dan>fao",
   "target_language": "Faroese",
   "corpus_id": "tatoeba-dan-fao-dev",
   "corpus_file": "datasets/curated/dan-fao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.4601,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.494,
   "predicted_effective": 0.247,
   "expected_mesh_gain": 0.00014436,
   "ecv_per_usd": 0.00031376,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/dan-fao-dev-v1.json && mt-eval run --corpus dan-fao-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Faroese\" --yes"
  },
  {
   "priority": 1965,
   "id": "fra-cat-dev-v1__google_gemini-3.1-pro-preview__naive",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "google/gemini-3.1-pro-preview",
   "condition": "naive",
   "est_cost_usd": 0.095,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00031099,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model google/gemini-3.1-pro-preview --target-lang \"Catalan\" --yes"
  },
  {
   "priority": 1966,
   "id": "ita-mlt-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2876,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00031092,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model openai/gpt-5.5 --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1967,
   "id": "fra-hau-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2947,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00031037,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Hausa\" --yes"
  },
  {
   "priority": 1968,
   "id": "eng-urd-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.2892,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00030718,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model openai/gpt-5.5 --target-lang \"Urdu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1969,
   "id": "eng-mon-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>mon",
   "target_language": "Mongolian",
   "corpus_id": "tatoeba-eng-mon-dev",
   "corpus_file": "datasets/curated/eng-mon-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 138,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.2749,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00030403,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mon-dev-v1.json && mt-eval run --corpus eng-mon-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Mongolian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1970,
   "id": "eng-war-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>war",
   "target_language": "Waray",
   "corpus_id": "tatoeba-eng-war-dev",
   "corpus_file": "datasets/curated/eng-war-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 131,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3107,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.322,
   "predicted_effective": 0.161,
   "expected_mesh_gain": 9.41e-05,
   "ecv_per_usd": 0.00030285,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-war-dev-v1.json && mt-eval run --corpus eng-war-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Waray\" --yes"
  },
  {
   "priority": 1971,
   "id": "eng-pag-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>pag",
   "target_language": "Pangasinan",
   "corpus_id": "tatoeba-eng-pag-dev",
   "corpus_file": "datasets/curated/eng-pag-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 60,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2273,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2324,
   "predicted_effective": 0.1162,
   "expected_mesh_gain": 6.791e-05,
   "ecv_per_usd": 0.00029878,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pag-dev-v1.json && mt-eval run --corpus eng-pag-dev-v1.json --model openai/gpt-5.5 --target-lang \"Pangasinan\" --yes"
  },
  {
   "priority": 1972,
   "id": "eng-haw-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3865,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00029714,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Hawaiian\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1973,
   "id": "eng-pam-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1087,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.0002968,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Kapampangan\" --yes"
  },
  {
   "priority": 1974,
   "id": "eng-pan-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>pan",
   "target_language": "Panjabi",
   "corpus_id": "tatoeba-eng-pan-dev",
   "corpus_file": "datasets/curated/eng-pan-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.2797,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00029296,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pan-dev-v1.json && mt-eval run --corpus eng-pan-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Panjabi\" --yes"
  },
  {
   "priority": 1975,
   "id": "deu-ltz-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.286,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00029223,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model openai/gpt-5.5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1976,
   "id": "eng-tir-dev-v1__anthropic_claude-haiku-4.5__naive",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "anthropic/claude-haiku-4.5",
   "condition": "naive",
   "est_cost_usd": 0.1683,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00028667,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model anthropic/claude-haiku-4.5 --target-lang \"Tigrinya\" --yes"
  },
  {
   "priority": 1977,
   "id": "eng-sme-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>sme",
   "target_language": "Northern Sámi",
   "corpus_id": "tatoeba-eng-sme-dev",
   "corpus_file": "datasets/curated/eng-sme-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 58,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.2104,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2036,
   "predicted_effective": 0.1018,
   "expected_mesh_gain": 5.95e-05,
   "ecv_per_usd": 0.00028278,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-sme-dev-v1.json && mt-eval run --corpus eng-sme-dev-v1.json --model openai/gpt-5.5 --target-lang \"Northern Sámi\" --yes"
  },
  {
   "priority": 1978,
   "id": "fra-eus-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>eus",
   "target_language": "Basque",
   "corpus_id": "tatoeba-fra-eus-dev",
   "corpus_file": "datasets/curated/fra-eus-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 59,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1175,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1133,
   "predicted_effective": 0.0566,
   "expected_mesh_gain": 3.311e-05,
   "ecv_per_usd": 0.00028178,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-eus-dev-v1.json && mt-eval run --corpus fra-eus-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Basque\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1979,
   "id": "eng-uzb-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>uzb",
   "target_language": "Uzbek",
   "corpus_id": "tatoeba-eng-uzb-dev",
   "corpus_file": "datasets/curated/eng-uzb-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 167,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3327,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.316,
   "predicted_effective": 0.158,
   "expected_mesh_gain": 9.234e-05,
   "ecv_per_usd": 0.00027756,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-uzb-dev-v1.json && mt-eval run --corpus eng-uzb-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Uzbek\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1980,
   "id": "fra-ltz-dev-v1__openai_gpt-5.5__coached",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "openai/gpt-5.5",
   "condition": "coached",
   "est_cost_usd": 0.3132,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00027711,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model openai/gpt-5.5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1981,
   "id": "fra-hau-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>hau",
   "target_language": "Hausa",
   "corpus_id": "tatoeba-fra-hau-dev",
   "corpus_file": "datasets/curated/fra-hau-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 168,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3347,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.313,
   "predicted_effective": 0.1565,
   "expected_mesh_gain": 9.147e-05,
   "ecv_per_usd": 0.00027328,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-hau-dev-v1.json && mt-eval run --corpus fra-hau-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Hausa\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1982,
   "id": "eng-tir-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.177,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00027258,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model openai/gpt-5.5 --target-lang \"Tigrinya\" --yes"
  },
  {
   "priority": 1983,
   "id": "eng-cym-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>cym",
   "target_language": "Welsh",
   "corpus_id": "tatoeba-eng-cym-dev",
   "corpus_file": "datasets/curated/eng-cym-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 47,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1133,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1044,
   "predicted_effective": 0.0522,
   "expected_mesh_gain": 3.051e-05,
   "ecv_per_usd": 0.00026927,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-cym-dev-v1.json && mt-eval run --corpus eng-cym-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Welsh\" --yes"
  },
  {
   "priority": 1984,
   "id": "ita-mlt-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3336,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00026805,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 1985,
   "id": "eng-tir-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>tir",
   "target_language": "Tigrinya",
   "corpus_id": "tatoeba-eng-tir-dev",
   "corpus_file": "datasets/curated/eng-tir-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 54,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.1834,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1651,
   "predicted_effective": 0.0825,
   "expected_mesh_gain": 4.825e-05,
   "ecv_per_usd": 0.00026307,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-tir-dev-v1.json && mt-eval run --corpus eng-tir-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Tigrinya\" --yes"
  },
  {
   "priority": 1986,
   "id": "deu-ltz-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3201,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.0002611,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 1987,
   "id": "eng-lao-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3144,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00026062,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Lao\" --yes"
  },
  {
   "priority": 1988,
   "id": "fra-cat-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>cat",
   "target_language": "Catalan",
   "corpus_id": "tatoeba-fra-cat-dev",
   "corpus_file": "datasets/curated/fra-cat-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 57,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.1136,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1011,
   "predicted_effective": 0.0505,
   "expected_mesh_gain": 2.954e-05,
   "ecv_per_usd": 0.00026007,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-cat-dev-v1.json && mt-eval run --corpus fra-cat-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Catalan\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1989,
   "id": "fra-ltz-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.334,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00025985,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model openai/gpt-5.5 --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 1990,
   "id": "eng-lug-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3576,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00025741,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Ganda\" --yes"
  },
  {
   "priority": 1991,
   "id": "eng-lug-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>lug",
   "target_language": "Ganda",
   "corpus_id": "tatoeba-eng-lug-dev",
   "corpus_file": "datasets/curated/eng-lug-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 183,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3646,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.315,
   "predicted_effective": 0.1575,
   "expected_mesh_gain": 9.205e-05,
   "ecv_per_usd": 0.00025247,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lug-dev-v1.json && mt-eval run --corpus eng-lug-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Ganda\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1992,
   "id": "ita-mlt-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3586,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00024936,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Maltese\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1993,
   "id": "eng-urd-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "eng>urd",
   "target_language": "Urdu",
   "corpus_id": "tatoeba-eng-urd-dev",
   "corpus_file": "datasets/curated/eng-urd-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 181,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3606,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.304,
   "predicted_effective": 0.152,
   "expected_mesh_gain": 8.884e-05,
   "ecv_per_usd": 0.00024636,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-urd-dev-v1.json && mt-eval run --corpus eng-urd-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Urdu\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1994,
   "id": "deu-ltz-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.3414,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00024481,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model openai/gpt-5.5 --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 1995,
   "id": "eng-mlt-dev-v1__google_gemini-3.5-flash__naive",
   "language_pair": "eng>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-eng-mlt-dev",
   "corpus_file": "datasets/curated/eng-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 129,
   "model": "google/gemini-3.5-flash",
   "condition": "naive",
   "est_cost_usd": 0.3864,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.32,
   "predicted_effective": 0.16,
   "expected_mesh_gain": 9.351e-05,
   "ecv_per_usd": 0.00024201,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-mlt-dev-v1.json && mt-eval run --corpus eng-mlt-dev-v1.json --model google/gemini-3.5-flash --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 1996,
   "id": "deu-ltz-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "deu>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-deu-ltz-dev",
   "corpus_file": "datasets/curated/deu-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 179,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3566,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.286,
   "predicted_effective": 0.143,
   "expected_mesh_gain": 8.358e-05,
   "ecv_per_usd": 0.00023437,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/deu-ltz-dev-v1.json && mt-eval run --corpus deu-ltz-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 1997,
   "id": "fra-ltz-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.3773,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00023003,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Luxembourgish\" --yes"
  },
  {
   "priority": 1998,
   "id": "eng-haw-dev-v1__anthropic_claude-fable-5__naive",
   "language_pair": "eng>haw",
   "target_language": "Hawaiian",
   "corpus_id": "tatoeba-eng-haw-dev",
   "corpus_file": "datasets/curated/eng-haw-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 194,
   "model": "anthropic/claude-fable-5",
   "condition": "naive",
   "est_cost_usd": 0.5128,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.393,
   "predicted_effective": 0.1965,
   "expected_mesh_gain": 0.00011485,
   "ecv_per_usd": 0.00022396,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-haw-dev-v1.json && mt-eval run --corpus eng-haw-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Hawaiian\" --yes"
  },
  {
   "priority": 1999,
   "id": "fra-ltz-dev-v1__anthropic_claude-fable-5__coached",
   "language_pair": "fra>ltz",
   "target_language": "Luxembourgish",
   "corpus_id": "tatoeba-fra-ltz-dev",
   "corpus_file": "datasets/curated/fra-ltz-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 196,
   "model": "anthropic/claude-fable-5",
   "condition": "coached",
   "est_cost_usd": 0.3905,
   "est_basis": "extrapolated: sweep avg cost/entry for this model x corpus entry count (naive condition; coached runs add system-prompt tokens, expect slightly more)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.297,
   "predicted_effective": 0.1485,
   "expected_mesh_gain": 8.679e-05,
   "ecv_per_usd": 0.00022226,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/fra-ltz-dev-v1.json && mt-eval run --corpus fra-ltz-dev-v1.json --model anthropic/claude-fable-5 --target-lang \"Luxembourgish\" --yes --coaching-file YOUR_COACHING.txt"
  },
  {
   "priority": 2000,
   "id": "ita-mlt-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "ita>mlt",
   "target_language": "Maltese",
   "corpus_id": "tatoeba-ita-mlt-dev",
   "corpus_file": "datasets/curated/ita-mlt-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 180,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.4036,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.306,
   "predicted_effective": 0.153,
   "expected_mesh_gain": 8.942e-05,
   "ecv_per_usd": 0.00022156,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/ita-mlt-dev-v1.json && mt-eval run --corpus ita-mlt-dev-v1.json --model openai/gpt-5.5 --target-lang \"Maltese\" --yes"
  },
  {
   "priority": 2001,
   "id": "eng-pam-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>pam",
   "target_language": "Kapampangan",
   "corpus_id": "tatoeba-eng-pam-dev",
   "corpus_file": "datasets/curated/eng-pam-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 48,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.1549,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.1104,
   "predicted_effective": 0.0552,
   "expected_mesh_gain": 3.226e-05,
   "ecv_per_usd": 0.00020828,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-pam-dev-v1.json && mt-eval run --corpus eng-pam-dev-v1.json --model openai/gpt-5.5 --target-lang \"Kapampangan\" --yes"
  },
  {
   "priority": 2002,
   "id": "eng-lao-dev-v1__openai_gpt-5.5__naive",
   "language_pair": "eng>lao",
   "target_language": "Lao",
   "corpus_id": "tatoeba-eng-lao-dev",
   "corpus_file": "datasets/curated/eng-lao-dev-v1.json",
   "corpus_url": "https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json",
   "corpus_license": "CC-BY-2.0",
   "entry_count": 68,
   "model": "openai/gpt-5.5",
   "condition": "naive",
   "est_cost_usd": 0.4216,
   "est_basis": "observed (baseline sweep manifest)",
   "pair_covered_on_leaderboard": false,
   "chaining_gain": 0.000584,
   "edge_quality": 0.0,
   "edge_reliability": 0.0,
   "edge_tier": "registered",
   "effective_strength": 0.0,
   "pair_prior": 0.5,
   "prior_basis": "default",
   "model_offset": 0.0,
   "condition_offset": 0.0,
   "exploration_bonus": 0.0,
   "predicted_strength": 0.5,
   "post_run_reliability": 0.2804,
   "predicted_effective": 0.1402,
   "expected_mesh_gain": 8.194e-05,
   "ecv_per_usd": 0.00019436,
   "run_command": "curl -fsSLO https://raw.githubusercontent.com/gamedaysuits/gds-mt-eval-harness/main/datasets/curated/eng-lao-dev-v1.json && mt-eval run --corpus eng-lao-dev-v1.json --model openai/gpt-5.5 --target-lang \"Lao\" --yes"
  }
 ]
}
