schedule: "Weekly at 08:00 UTC"
blog_channel: "—"
function: "benchmarks"
"LMSYS Chatbot Arena"
"Open LLM Leaderboard (Hugging Face)"
"Papers With Code benchmarks"
"HELM (Stanford)"
"arXiv benchmark papers"
"Benchmark maintainer blogs"
"new AI benchmark 2026"
"LLM evaluation methodology"
"benchmark contamination"
"AI evaluation framework"
mutation_target: "knowledge/benchmark-state.md"
iteration_budget: 5
time_budget: "15 min"
metric: "benchmark_coverage_rate"
target: ≥ 0.8
→ Fraction of known benchmarks whose data is current (less than 30 days old)
keep: mutations that improve KPI toward target
discard: mutations that degrade KPI or time out