Siirrä kipina-codebench projektin päätasolle

2026-04-14 09:44:14 +03:00
parent b93ae2fd1b
commit 7b27800390
24 changed files with 0 additions and 0 deletions
--- a/kipina-codebench/Dockerfile.pytest
+++ b/kipina-codebench/Dockerfile.pytest
@@ -0,0 +1,5 @@
+# Test-runner image: Python 3.14 (slim) plus the uv package manager.
+FROM python:3.14-slim
+# Take the uv binary from the upstream uv image instead of installing it.
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+WORKDIR /work
+ENV PYTHONPATH=/work
+# On every container start, in /work: create a uv project, add the test
+# dependencies, copy the *.py files from the /src mount, delete stale SQLite
+# files and run pytest on test_main.py. stderr of `uv init` and `uv add` is
+# discarded, and the steps are chained with &&, so if one of them fails the
+# chain stops before pytest and the error text is not shown.
+ENTRYPOINT ["sh", "-c", "uv init --no-readme --python '>=3.14' 2>/dev/null && rm -f hello.py main.py && uv add fastapi 'uvicorn[standard]' sqlalchemy pytest httpx 2>/dev/null && cp /src/*.py . && rm -f app.db test.db && uv run pytest test_main.py -v --tb=short 2>&1"]
--- a/kipina-codebench/README.md
+++ b/kipina-codebench/README.md
@@ -0,0 +1,95 @@
+# Kipinä CodeBench
+
+LLM-koodingenerointibenchmark. Testaa Ollama-mallien kykyä generoida toimivia FastAPI+SQLAlchemy-projekteja ja ajaa testit Docker-kontissa.
+
+## Pikastart
+
+```bash
+# 1. Rakenna Docker-testikontti
+docker build -t kipina-pytest -f Dockerfile.pytest .
+
+# 2. Aja benchmark
+node benchmark.mjs --ollama http://localhost:11434 --scenarios all
+
+# 3. Avaa raportti
+open /tmp/kipina-benchmark/report.html
+```
+
+## Pipeline
+
+```
+1. LLM → vaatimusmäärittely (prompts/client.md)
+2. LLM → JSON-speksi (prompts/spec.md)
+3. LLM → 4 Python-tiedostoa (prompts/code.md + golden-examples/)
+4. Staattinen validointi + LLM-korjaus (prompts/fix.md)
+5. Docker: uv init + uv add + pytest
+```
+
+## CLI-argumentit
+
+| Argumentti | Oletus | Kuvaus |
+|-----------|--------|--------|
+| `--ollama` | `http://localhost:11434` | Ollama-palvelimen URL |
+| `--hub` | - | Hub-reitti (vaihtoehto Ollamalle) |
+| `--models` | kaikki | Pilkuilla erotettu mallilista |
+| `--scenarios` | `default` (todo) | `all` = todo, users, blog |
+| `--output` | `/tmp/kipina-benchmark` | Tuloshakemisto |
+
+## Hakemistorakenne
+
+```
+kipina-codebench/
+├── benchmark.mjs            ← runner
+├── Dockerfile.pytest        ← Python 3.14 + uv testikontti
+├── report-template.html     ← HTML-raporttipohja
+├── package.json
+├── prompts/                 ← muokattavat promptit
+│   ├── client.md            ← vaatimusmäärittely
+│   ├── spec.md              ← JSON-speksi
+│   ├── code.md              ← koodigenerointi
+│   └── fix.md               ← korjaus
+├── golden-examples/         ← referenssitoteutukset
+│   ├── todo/                ← taso 1: perus-CRUD (6 testiä)
+│   ├── blog/                ← taso 2: relaatiot (13 testiä)
+│   └── DOCUMENTATION.md     ← zensical-dokumentointiohjeet
+└── results/                 ← tallennetut tulokset
+```
+
+## Promptien muokkaus
+
+Promptit ovat `prompts/`-kansiossa Markdown-tiedostoina. Muokkaa suoraan — benchmark lataa ne käynnistyksessä.
+
+Esimerkki: lisää sääntö `prompts/code.md`:hen:
+```
+- Tests: PUT/update test data MUST include ALL required fields
+```
+
+## Kultaiset esimerkit
+
+`golden-examples/todo/` syötetään LLM:lle referenssinä. Malli näkee tarkalleen millaista koodia odotetaan:
+- SQLAlchemy 2.0 (DeclarativeBase, Mapped, mapped_column)
+- Pydantic v2 (ConfigDict)
+- Python 3.14 -syntaksi (str | None)
+- Uniikki testidata per testi
+
+Lisää uusia esimerkkejä luomalla hakemisto (esim. `golden-examples/shop/`).
+
+## Pisteytys
+
+| Komponentti | Pisteet | Peruste |
+|---|---|---|
+| Speksi OK | 10p | JSON-speksi onnistui |
+| Koodi generoitu | 10p | Kaikki 4 tiedostoa syntyneet |
+| Testit | 0–60p | passed/total × 60 |
+| Korjaukset | 0–20p | 0 kierrosta = 20p, 1 = 10p, 2+ = 0p |
+
+Tähdet: ★★★★★ (90+), ★★★★☆ (70+), ★★★☆☆ (50+), ★★☆☆☆ (25+), ★☆☆☆☆ (1+)
+
+## Käyttö git-submodulena
+
+```bash
+git submodule add <repo-url> tools/codebench
+cd tools/codebench
+docker build -t kipina-pytest -f Dockerfile.pytest .
+node benchmark.mjs --ollama http://localhost:11434 --scenarios all
+```
--- a/kipina-codebench/benchmark.mjs
+++ b/kipina-codebench/benchmark.mjs
@@ -0,0 +1,490 @@
+#!/usr/bin/env node
+/**
+ * Kipinä CodeBench — LLM-koodingenerointibenchmark
+ *
+ * Generoi FastAPI-projekteja Ollama-malleilla ja testaa pytest:llä Docker-kontissa.
+ *
+ * Käyttö:
+ *   node benchmark.mjs                                    # kaikki mallit, oletusskenaario
+ *   node benchmark.mjs --models qwen3-coder:30b           # yksi malli
+ *   node benchmark.mjs --ollama http://host:11434          # eri Ollama
+ *   node benchmark.mjs --scenarios all                    # kaikki skenaariot
+ *   node benchmark.mjs --output ./results/run-001         # custom output-hakemisto
+ */
+
+import { execSync } from 'child_process';
+import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync, readdirSync } from 'fs';
+import { dirname, join } from 'path';
+import { fileURLToPath } from 'url';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+
+// === CLI-argumentit ===
+const args = process.argv.slice(2);
+function arg(name, fallback) {
+    const i = args.indexOf(`--${name}`);
+    return i >= 0 && args[i + 1] ? args[i + 1] : fallback;
+}
+const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434');
+const HUB_URL = arg('hub', '');
+const FILTER_MODELS = arg('models', '');
+const SCENARIO_FILTER = arg('scenarios', 'default');
+const OUTPUT_DIR = arg('output', '/tmp/kipina-benchmark');
+const MAX_FIX_ROUNDS = 2;
+
+// === Promptien lataus tiedostoista ===
+function loadPrompt(name) {
+    const path = join(__dirname, 'prompts', `${name}.md`);
+    if (!existsSync(path)) throw new Error(`Prompti puuttuu: ${path}`);
+    return readFileSync(path, 'utf-8').trim();
+}
+const CLIENT_SYSTEM = loadPrompt('client');
+const SPEC_SYSTEM = loadPrompt('spec');
+const CODE_SYSTEM = loadPrompt('code');
+const FIX_SYSTEM = loadPrompt('fix');
+
+// === Kultaisten esimerkkien lataus ===
+const GOLDEN_DIR = join(__dirname, 'golden-examples');
+const GOLDEN_PY_FILES = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];
+
+function loadGoldenExample() {
+    const todoDir = join(GOLDEN_DIR, 'todo');
+    if (!existsSync(todoDir)) return '';
+    let example = '\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n';
+    for (const f of GOLDEN_PY_FILES) {
+        const path = join(todoDir, f);
+        if (existsSync(path)) example += `=== ${f} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`;
+    }
+    return example;
+}
+const GOLDEN_EXAMPLE = loadGoldenExample();
+
+// === Ajattelutagien siivous (gemma4, qwen3/3.5 ym.) ===
+function stripThinking(text) {
+    return text
+        .replace(/<\|channel>thought[\s\S]*?<channel\|>/g, '')  // gemma4
+        .replace(/<think>[\s\S]*?<\/think>/g, '')                // qwen3, qwen3.5
+        .trim();
+}
+
+// === Ollama / Hub -client ===
+async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
+    const start = Date.now();
+
+    if (HUB_URL) {
+        const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
+        const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
+        });
+        if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
+        const data = await resp.json();
+        const elapsed = Date.now() - start;
+        return {
+            text: stripThinking((data.response || '').trim()),
+            tokens: data.tokens_generated || 0,
+            durationMs: elapsed,
+            tokPerSec: data.tokens_per_sec || (data.tokens_generated || 0) / (elapsed / 1000),
+        };
+    }
+
+    // Suora Ollama-reitti
+    const messages = [];
+    if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
+    messages.push({ role: 'user', content: prompt });
+
+    const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+            model,
+            messages,
+            stream: false,
+            think: false,
+            options: { num_predict: maxTokens, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
+        }),
+    });
+    if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
+    const data = await resp.json();
+    const elapsed = Date.now() - start;
+    const rawContent = (data.message?.content || '').trim();
+    const thinking = (data.message?.thinking || '').trim();
+    const text = stripThinking(rawContent || thinking);
+    const evalCount = data.eval_count || 0;
+    if (!rawContent && thinking) console.log(`      ⚠ thinking-malli: ${thinking.length} merkkiä ajattelua, content tyhjä`);
+    const evalDurationNs = data.eval_duration || 1;
+    const tokPerSec = evalCount / (evalDurationNs / 1e9);
+    return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
+}
+
+async function ollamaListModels() {
+    const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
+    const resp = await fetch(url);
+    if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
+    const data = await resp.json();
+    return (data.models || []).map(m => m.name);
+}
+
+// === Tiedostoparseri LLM-vastauksesta ===
+function parseGeneratedFiles(text) {
+    const files = {};
+    const sections = text.split(/===\s*(\S+\.(?:py|toml))\s*===/);
+    for (let i = 1; i < sections.length - 1; i += 2) {
+        const name = sections[i];
+        let content = sections[i + 1].trim();
+        content = content.replace(/^```(?:python|toml)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim();
+        if (content) files[name] = content + '\n';
+    }
+    return files;
+}
+
+// === Validaattori ===
+function validateProjectCode(files) {
+    const issues = [];
+    for (const [fname, code] of Object.entries(files)) {
+        if (!fname.endsWith('.py')) continue;
+        const lines = code.split('\n');
+        for (const line of lines) {
+            if (/^from\s+\.(\w*)\s+import/.test(line)) issues.push(`ISSUE: ${fname}: relatiivinen import`);
+        }
+        for (const line of lines) {
+            const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
+            if (!m) continue;
+            const srcCode = files[m[1] + '.py'];
+            if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
+            const names = m[2].split(',').map(n => n.trim().split(/\s+as\s+/)[0].trim());
+            for (const name of names) {
+                if (name && !srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
+            }
+        }
+        if (fname === 'schemas.py') {
+            if (/:\s*date\b/.test(code) && !/from datetime import/.test(code))
+                issues.push('ISSUE: schemas.py: date-import puuttuu');
+            if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code))
+                issues.push('ISSUE: schemas.py: datetime-import puuttuu');
+        }
+        for (let i = 0; i < lines.length; i++) {
+            const line = lines[i];
+            if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue;
+            if (/(?<!["\w])false(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "false" → "False"`);
+            if (/(?<!["\w])true(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "true" → "True"`);
+        }
+    }
+    return issues;
+}
+
+function extractJson(text) {
+    const m = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
+    if (m) text = m[1].trim();
+    let depth = 0, start = null;
+    for (let i = 0; i < text.length; i++) {
+        if (text[i] === '{') { if (depth === 0) start = i; depth++; }
+        else if (text[i] === '}') { depth--; if (depth === 0 && start !== null) { try { return JSON.parse(text.slice(start, i+1)); } catch(e) { continue; } } }
+    }
+    return null;
+}
+
+// === Test scenarios ===
+// `prompt` is the project brief sent to the model in the requirements step;
+// `id` names the result directory and the report column. The first entry is
+// the one used when not all scenarios are requested.
+const SCENARIOS = [
+    { id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
+    { id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
+    { id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
+];
+
+// === Pisteytys (0–100) ja tähtiluokitus ===
+function scoreResult(r) {
+    if (r.error && r.testsTotal === 0) return 0;
+    let score = 0;
+    if (r.specOk) score += 10;
+    if (!r.error || r.testsTotal > 0) score += 10;
+    if (r.testsTotal > 0) score += Math.round((r.testsPassed / r.testsTotal) * 60);
+    score += Math.max(0, 20 - r.fixRounds * 10);
+    return Math.min(100, score);
+}
+function starsForScore(score) {
+    if (score >= 90) return '★★★★★';
+    if (score >= 70) return '★★★★☆';
+    if (score >= 50) return '★★★☆☆';
+    if (score >= 25) return '★★☆☆☆';
+    if (score > 0)   return '★☆☆☆☆';
+    return '☆☆☆☆☆';
+}
+
+// === Pipeline: yhdelle mallille ja skenaariolle ===
+async function runPipeline(model, scenario) {
+    const result = {
+        model, scenario: scenario.id,
+        reqOk: false, specOk: false, specEntities: 0,
+        validationIssues: 0, fixRounds: 0,
+        testsTotal: 0, testsPassed: 0, testsFailed: 0,
+        totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
+        promptChars: 0, promptTokensEst: 0,
+        score: 0, stars: '',
+        error: null,
+    };
+    const timings = [];
+    const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
+    mkdirSync(dir, { recursive: true });
+
+    try {
+        // 1. Vaatimukset
+        console.log(`    [1/5] Vaatimukset...`);
+        const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 2048);
+        timings.push(req);
+        if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
+        result.reqOk = true;
+        writeFileSync(`${dir}/_requirements.txt`, req.text);
+
+        // 2. JSON-speksi
+        console.log(`    [2/5] JSON-speksi...`);
+        const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 4096);
+        timings.push(specResp);
+        const spec = extractJson(specResp.text);
+        if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'JSON-speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; }
+        result.specOk = true;
+        result.specEntities = spec.entities.length;
+        writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));
+
+        // 3. LLM-koodigenerointi
+        console.log(`    [3/5] Koodigenerointi (LLM)...`);
+        const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all 4 files. Follow the reference implementation patterns exactly.`;
+        result.promptChars = CODE_SYSTEM.length + codePrompt.length;
+        result.promptTokensEst = Math.round(result.promptChars / 4);
+        const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, 8192);
+        timings.push(codeResp);
+        writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
+        const files = parseGeneratedFiles(codeResp.text);
+        const required = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];
+        const missing = required.filter(f => !files[f]);
+        if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; }
+
+        // 4. Validointi + korjaussilmukka
+        let issues = validateProjectCode(files);
+        let fixRound = 0;
+        while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
+            fixRound++;
+            console.log(`    [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
+            const issuesByFile = {};
+            for (const issue of issues) {
+                const m = issue.match(/^ISSUE:\s*(\S+?):/);
+                const fname = m ? m[1] : 'unknown';
+                if (!issuesByFile[fname]) issuesByFile[fname] = [];
+                issuesByFile[fname].push(issue);
+            }
+            for (const [fname, fIssues] of Object.entries(issuesByFile)) {
+                if (!files[fname]) continue;
+                const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
+                const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
+                timings.push(fixResp);
+                if (fixResp.text) {
+                    files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
+                }
+            }
+            issues = validateProjectCode(files);
+        }
+        result.validationIssues = issues.length;
+        result.fixRounds = fixRound;
+
+        // Kirjoita LLM:n generoimat Python-tiedostot
+        for (const [fn, content] of Object.entries(files)) {
+            if (fn.endsWith('.py')) writeFileSync(`${dir}/${fn}`, content);
+        }
+
+        // 5. Pytest Docker-kontissa (kipina-pytest image)
+        console.log(`    [5/5] Pytest (Docker)...`);
+        try {
+            const pytestOut = execSync(
+                `docker run --rm -v "${dir}:/src:ro" kipina-pytest 2>&1`,
+                { timeout: 120000, encoding: 'utf-8' }
+            );
+            writeFileSync(`${dir}/_pytest.txt`, pytestOut);
+            const passedMatch = pytestOut.match(/(\d+) passed/);
+            const failedMatch = pytestOut.match(/(\d+) failed/);
+            result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0;
+            result.testsFailed = failedMatch ? parseInt(failedMatch[1]) : 0;
+            result.testsTotal = result.testsPassed + result.testsFailed;
+        } catch (e) {
+            const output = e.stdout || e.stderr || e.message || '';
+            writeFileSync(`${dir}/_pytest.txt`, output);
+            const passedMatch = output.match(/(\d+) passed/);
+            const failedMatch = output.match(/(\d+) failed/);
+            const errorMatch = output.match(/(\d+) error/);
+            result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0;
+            result.testsFailed = (failedMatch ? parseInt(failedMatch[1]) : 0) + (errorMatch ? parseInt(errorMatch[1]) : 0);
+            result.testsTotal = result.testsPassed + result.testsFailed;
+            if (result.testsTotal === 0) result.error = 'Pytest kaatui';
+        }
+    } catch (e) {
+        result.error = e.message;
+    }
+
+    // Yhteenveto
+    result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
+    result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
+    result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
+    result.score = scoreResult(result);
+    result.stars = starsForScore(result.score);
+
+    return result;
+}
+
+// === Main ===
+async function main() {
+    console.log('╔══════════════════════════════════════════════╗');
+    console.log('║       Kipinä CodeBench                      ║');
+    console.log('╚══════════════════════════════════════════════╝');
+    console.log(`Ollama: ${OLLAMA_URL}`);
+
+    // Haetaan mallit
+    let models;
+    try {
+        models = await ollamaListModels();
+    } catch (e) {
+        console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`);
+        process.exit(1);
+    }
+
+    if (FILTER_MODELS) {
+        const filter = FILTER_MODELS.split(',').map(s => s.trim());
+        models = models.filter(m => filter.some(f => m.includes(f)));
+    }
+
+    console.log(`Mallit (${models.length}): ${models.join(', ')}`);
+
+    const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS : [SCENARIOS[0]];
+    console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
+    console.log(`Tulokset: ${OUTPUT_DIR}/`);
+    console.log('');
+
+    // Puhdista output
+    rmSync(OUTPUT_DIR, { recursive: true, force: true });
+    mkdirSync(OUTPUT_DIR, { recursive: true });
+
+    const results = [];
+
+    for (const model of models) {
+        for (const scenario of scenarios) {
+            console.log(`\n━━━ ${model} × ${scenario.id} ━━━`);
+            const r = await runPipeline(model, scenario);
+            results.push(r);
+
+            const status = r.error ? `✗ ${r.error}` :
+                r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` :
+                `◐ ${r.testsPassed}/${r.testsTotal}`;
+            const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
+            console.log(`    → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
+        }
+    }
+
+    // === Tulostaulu ===
+    console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
+    console.log('║                                    TULOKSET                                                     ║');
+    console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣');
+
+    const header = [
+        'Malli'.padEnd(40),
+        'Skenaario'.padEnd(10),
+        'Speksi'.padEnd(8),
+        'Testit'.padEnd(10),
+        'Korjaus'.padEnd(8),
+        'Ctx'.padEnd(7),
+        'Aika'.padEnd(8),
+        'tok/s'.padEnd(8),
+        'Pisteet',
+    ].join(' │ ');
+    console.log(`║ ${header} ║`);
+    console.log('╠' + '═'.repeat(header.length + 2) + '╣');
+
+    for (const r of results) {
+        const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗';
+        const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
+        const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
+        const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
+        const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
+        const speed = `${r.avgTokPerSec.toFixed(0)}`;
+        const row = [
+            r.model.padEnd(40),
+            r.scenario.padEnd(10),
+            specStatus.padEnd(8),
+            testStatus.padEnd(10),
+            fixStatus.padEnd(8),
+            ctx.padEnd(7),
+            time.padEnd(8),
+            speed.padEnd(8),
+            `${r.stars} ${r.score}`,
+        ].join(' │ ');
+        console.log(`║ ${row} ║`);
+    }
+    console.log('╚' + '═'.repeat(header.length + 2) + '╝');
+
+    // === Mallikohtainen yhteenveto ===
+    const modelNames = [...new Set(results.map(r => r.model))];
+    const scenarioIds = scenarios.map(s => s.id);
+
+    console.log('\n');
+    const mHeader = [
+        'Malli'.padEnd(35),
+        ...scenarioIds.map(s => s.padEnd(22)),
+        'Yht.'.padEnd(8),
+        'Out'.padEnd(7),
+        'Aika'.padEnd(8),
+        'tok/s'.padEnd(7),
+        'Pisteet',
+    ].join(' │ ');
+    console.log(mHeader);
+    console.log('─'.repeat(mHeader.length));
+
+    for (const model of modelNames) {
+        const mrs = results.filter(r => r.model === model);
+        const cols = scenarioIds.map(sid => {
+            const r = mrs.find(r => r.scenario === sid);
+            if (!r) return '-'.padEnd(22);
+            const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
+            const s = `${(r.totalDurationMs/1000).toFixed(0)}s`;
+            const tok = r.totalTokens > 1000 ? `${(r.totalTokens/1000).toFixed(1)}K` : `${r.totalTokens}`;
+            return `${t} ${s} ${tok}`.padEnd(22);
+        });
+        const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0);
+        const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0);
+        const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0);
+        const totalTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0);
+        const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0;
+        const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0;
+        const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0;
+        const tokStr = totalTokens > 1000 ? `${(totalTokens/1000).toFixed(1)}K` : `${totalTokens}`;
+        const row = [
+            model.padEnd(35),
+            ...cols,
+            `${totalPassed}/${totalTests}`.padEnd(8),
+            tokStr.padEnd(7),
+            `${(totalTime/1000).toFixed(0)}s`.padEnd(8),
+            `${avgSpeed}`.padEnd(7),
+            `${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`,
+        ].join(' │ ');
+        console.log(row);
+    }
+
+    // Tallenna JSON + HTML-raportti
+    writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
+    const templatePath = join(__dirname, 'report-template.html');
+    if (existsSync(templatePath)) {
+        const html = readFileSync(templatePath, 'utf-8').replace(
+            '/*DATA_PLACEHOLDER*/[]',
+            JSON.stringify(results)
+        );
+        writeFileSync(`${OUTPUT_DIR}/report.html`, html);
+        console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`);
+    }
+    console.log(`JSON: ${OUTPUT_DIR}/results.json`);
+
+    // Yhteenveto
+    const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
+    const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
+    const failed = results.filter(r => r.error || r.testsTotal === 0);
+    const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
+    const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0);
+    console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`);
+}
+
+main().catch(e => { console.error(e); process.exit(1); });
--- a/kipina-codebench/golden-examples/DOCUMENTATION.md
+++ b/kipina-codebench/golden-examples/DOCUMENTATION.md
@@ -0,0 +1,84 @@
+# Dokumentointiohjeet — Zensical
+
+Hyvä dokumentointi kertoo **mitä asia ON**, ei mitä se tekee. Se on kuin zen-koan: lyhyt, tarkka, riittävä.
+
+## Periaatteet
+
+1. **Yksi rivi riittää.** Jos tarvitset kappaleen, koodi on liian monimutkainen.
+2. **Kerro mitä, älä miten.** `"""Tietokantamallit — SQLAlchemy 2.0, SQLite."""` ei `"""This module creates database models using SQLAlchemy..."""`
+3. **Älä toista koodia.** Jos funktio on `create_todo`, docstring ei ole "Creates a todo".
+4. **Suomi tai englanti, ei molempia.** Valitse yksi kieli per projekti.
+5. **Ei täytesanoja.** "This module provides functionality for" → poista.
+
+## Mitä dokumentoidaan
+
+| Kohde | Dokumentointi | Esimerkki |
+|-------|--------------|-----------|
+| **Moduuli** (.py) | Aina. Yksi rivi: mitä tiedosto sisältää. | `"""Pydantic v2 -skeemat — Create ja Response."""` |
+| **Luokka** | Aina. Mitä entiteetti edustaa. | `"""Tehtävä — otsikko, deadline, prioriteetti."""` |
+| **Funktio** | Vain jos nimi ei kerro kaikkea. | `get_db` → `"""Tietokantasessio per pyyntö."""` |
+| **CRUD-endpoint** | Ei. Nimi + HTTP-metodi riittää. | `create_todo`, `list_todos` — itsedokumentoivia |
+| **Testi** | Ei. Testin nimi on dokumentaatio. | `test_get_todo_not_found` — selvä |
+| **Konfiguraatio** | Kommentti vain jos arvo yllättää. | `check_same_thread: False  # SQLite + FastAPI` |
+
+## Mitä EI dokumentoida
+
+- Importteja
+- Ilmeisiä parametreja (`item_id: int`)
+- Tyyppivihjeitä jotka kertovat saman asian
+- Geneerisiä "boilerplate"-docstringejä
+
+## Esimerkkejä
+
+### Hyvä (zensical)
+
+```python
+"""Tietokantamallit — SQLAlchemy 2.0, Mapped-tyypitys, SQLite."""
+
+class Todo(Base):
+    """Tehtävä — otsikko, kuvaus, deadline, prioriteetti ja status."""
+    ...
+
+def get_db():
+    """Tietokantasessio per pyyntö."""
+    ...
+```
+
+### Huono (verbose)
+
+```python
+"""
+This module defines the database models for the Todo application.
+It uses SQLAlchemy ORM to create the database tables and provides
+the session factory for database connections.
+"""
+
+class Todo(Base):
+    """
+    Represents a todo item in the database.
+
+    Attributes:
+        id: The unique identifier for the todo item.
+        title: The title of the todo item.
+        ...
+    """
+    ...
+```
+
+### Huono (tyhjä)
+
+```python
+# Ei docstringejä ollenkaan — lukija ei tiedä mikä tiedoston rooli on
+class Todo(Base):
+    __tablename__ = "todos"
+    ...
+```
+
+## Tarkistuslista
+
+Generoitu koodi on hyvin dokumentoitu kun:
+- [ ] Jokainen .py-tiedosto alkaa yksirivisellä docstringillä
+- [ ] Jokainen luokka kertoo mitä entiteetti edustaa
+- [ ] Docstringit ovat samalla kielellä kuin muu koodi
+- [ ] CRUD-endpointeilla ei ole turhia docstringejä
+- [ ] Kommentteja on vain siellä missä koodi yllättää
--- a/kipina-codebench/golden-examples/README.md
+++ b/kipina-codebench/golden-examples/README.md
@@ -0,0 +1,123 @@
+# Golden Examples — referenssitoteutukset
+
+Kultaiset esimerkit ovat **täydellisiä, testattuja** FastAPI-projekteja joita LLM käyttää mallina koodigeneroinnissa. Malli näkee esimerkin ja tuottaa vastaavan rakenteen uudelle projektille.
+
+## Uuden esimerkin luominen
+
+### 1. Luo hakemisto
+
+```bash
+mkdir golden-examples/shop
+```
+
+Nimeä hakemisto skenaarion mukaan (todo, blog, shop, booking...).
+
+### 2. Luo 4 tiedostoa
+
+| Tiedosto | Sisältö |
+|----------|---------|
+| `models.py` | SQLAlchemy 2.0 -mallit (DeclarativeBase, Mapped, mapped_column) |
+| `schemas.py` | Pydantic v2 -skeemat (ConfigDict, `str \| None` -syntaksi) |
+| `main.py` | FastAPI CRUD -endpointit (POST 201, GET, GET/:id 404, PUT, DELETE 204) |
+| `test_main.py` | Pytest + TestClient, erillinen test.db, uniikki data per testi |
+
+### 3. Noudata konventioita
+
+**Python-versio:** >=3.14
+
+**SQLAlchemy 2.0** (ei legacy):
+```python
+# Oikein
+class Base(DeclarativeBase):
+    pass
+
+class Todo(Base):
+    id: Mapped[int] = mapped_column(primary_key=True, index=True)
+    title: Mapped[str] = mapped_column(String(255))
+    status: Mapped[str] = mapped_column(String(20), default="pending")
+
+# Väärin
+Base = declarative_base()
+id = Column(Integer, primary_key=True)
+```
+
+**Pydantic v2** (ei v1):
+```python
+# Oikein
+class TodoResponse(TodoCreate):
+    id: int
+    model_config = ConfigDict(from_attributes=True)
+
+# Väärin
+class Config:
+    orm_mode = True
+```
+
+**Tyypitys:**
+```python
+# Oikein
+description: Mapped[str | None] = mapped_column(Text, default=None)
+
+# Väärin
+description: Mapped[Optional[str]]
+```
+
+**Dokumentointi (zensical):**
+```python
+"""Tietokantamallit — SQLAlchemy 2.0, Mapped-tyypitys, SQLite."""
+
+class Todo(Base):
+    """Tehtävä — otsikko, kuvaus, deadline, prioriteetti ja status."""
+```
+
+Yksi rivi riittää. Kerro mitä asia ON, älä mitä se tekee. Katso [DOCUMENTATION.md](DOCUMENTATION.md).
+
+**Testidata — uniikki ja kuvaava:**
+```python
+# Oikein
+def test_create_todo():
+    response = client.post("/todos/", json={"title": "Osta maitoa", "priority": 2})
+
+def test_update_todo():
+    created = client.post("/todos/", json={"title": "Vanha otsikko"}).json()
+
+# Väärin — geneerinen data
+def test_create_todo():
+    response = client.post("/todos/", json={"title": "test", "priority": 1})
+```
+
+### 4. Testaa Docker-kontissa
+
+```bash
+rm -rf /tmp/golden-test && mkdir /tmp/golden-test
+cp golden-examples/shop/*.py /tmp/golden-test/
+docker run --rm -v /tmp/golden-test:/src:ro kipina-pytest
+```
+
+**Kaikkien testien pitää mennä läpi.** Ei varoituksia, ei deprecation-viestejä.
+
+### 5. Vaikeustasot
+
+| Taso | Esimerkit | Haaste |
+|------|-----------|--------|
+| 1 — Perus-CRUD | `todo/`, `users/`, `notes/` | Yksi entiteetti |
+| 2 — Relaatiot | `blog/`, `library/`, `school/` | Foreign key, 2–3 entiteettiä |
+| 3 — Liiketoimintalogiikka | `shop/`, `booking/` | Custom endpointit, validointi |
+
+Aloita tasosta 1 ja etene. Tason 1 esimerkkien pitää olla yksinkertaisia — ne opettavat mallille perusrakenteen.
+
+## Miten esimerkit vaikuttavat
+
+Benchmark lataa `todo/`-esimerkin ja syöttää sen LLM:lle osana koodingenerointipromptia:
+
+```
+REFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):
+
+=== models.py ===
+<todo/models.py sisältö>
+
+=== schemas.py ===
+...
+```
+
+Malli näkee tarkan esimerkin ja tuottaa vastaavan rakenteen uudelle projektille. Mitä parempi esimerkki, sitä parempi tulos.
--- a/kipina-codebench/golden-examples/blog/main.py
+++ b/kipina-codebench/golden-examples/blog/main.py
@@ -0,0 +1,110 @@
+"""FastAPI CRUD — kaksi endpoint-settiä, Author ja Post."""
+
+from fastapi import FastAPI, Depends, HTTPException
+from sqlalchemy.orm import Session
+
+from models import SessionLocal, Author, Post
+from schemas import AuthorCreate, AuthorResponse, PostCreate, PostResponse
+
+app = FastAPI()
+
+
+def get_db():
+    """Tietokantasessio per pyyntö."""
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+# --- Author ---
+
+
+@app.post("/authors/", response_model=AuthorResponse, status_code=201)
+def create_author(item: AuthorCreate, db: Session = Depends(get_db)):
+    db_item = Author(**item.model_dump())
+    db.add(db_item)
+    db.commit()
+    db.refresh(db_item)
+    return db_item
+
+
+@app.get("/authors/", response_model=list[AuthorResponse])
+def list_authors(db: Session = Depends(get_db)):
+    return db.query(Author).all()
+
+
+@app.get("/authors/{item_id}", response_model=AuthorResponse)
+def get_author(item_id: int, db: Session = Depends(get_db)):
+    item = db.query(Author).filter(Author.id == item_id).first()
+    if not item:
+        raise HTTPException(status_code=404, detail="Author not found")
+    return item
+
+
+@app.put("/authors/{item_id}", response_model=AuthorResponse)
+def update_author(item_id: int, item: AuthorCreate, db: Session = Depends(get_db)):
+    db_item = db.query(Author).filter(Author.id == item_id).first()
+    if not db_item:
+        raise HTTPException(status_code=404, detail="Author not found")
+    for key, value in item.model_dump().items():
+        setattr(db_item, key, value)
+    db.commit()
+    db.refresh(db_item)
+    return db_item
+
+
+@app.delete("/authors/{item_id}", status_code=204)
+def delete_author(item_id: int, db: Session = Depends(get_db)):
+    db_item = db.query(Author).filter(Author.id == item_id).first()
+    if not db_item:
+        raise HTTPException(status_code=404, detail="Author not found")
+    db.delete(db_item)
+    db.commit()
+
+
+# --- Post ---
+
+
+@app.post("/posts/", response_model=PostResponse, status_code=201)
+def create_post(item: PostCreate, db: Session = Depends(get_db)):
+    db_item = Post(**item.model_dump())
+    db.add(db_item)
+    db.commit()
+    db.refresh(db_item)
+    return db_item
+
+
+@app.get("/posts/", response_model=list[PostResponse])
+def list_posts(db: Session = Depends(get_db)):
+    return db.query(Post).all()
+
+
+@app.get("/posts/{item_id}", response_model=PostResponse)
+def get_post(item_id: int, db: Session = Depends(get_db)):
+    item = db.query(Post).filter(Post.id == item_id).first()
+    if not item:
+        raise HTTPException(status_code=404, detail="Post not found")
+    return item
+
+
+@app.put("/posts/{item_id}", response_model=PostResponse)
+def update_post(item_id: int, item: PostCreate, db: Session = Depends(get_db)):
+    db_item = db.query(Post).filter(Post.id == item_id).first()
+    if not db_item:
+        raise HTTPException(status_code=404, detail="Post not found")
+    for key, value in item.model_dump().items():
+        setattr(db_item, key, value)
+    db.commit()
+    db.refresh(db_item)
+    return db_item
+
+
+@app.delete("/posts/{item_id}", status_code=204)
+def delete_post(item_id: int, db: Session = Depends(get_db)):
+    db_item = db.query(Post).filter(Post.id == item_id).first()
+    if not db_item:
+        raise HTTPException(status_code=404, detail="Post not found")
+    db.delete(db_item)
+    db.commit()
--- a/kipina-codebench/golden-examples/blog/models.py
+++ b/kipina-codebench/golden-examples/blog/models.py
@@ -0,0 +1,45 @@
+"""Tietokantamallit — SQLAlchemy 2.0, Mapped-tyypitys, ForeignKey-relaatiot."""
+
+from datetime import datetime
+
+from sqlalchemy import String, Text, DateTime, ForeignKey, create_engine
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship, sessionmaker
+
+DATABASE_URL = "sqlite:///./app.db"
+engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+
+class Base(DeclarativeBase):
+    pass
+
+
+class Author(Base):
+    """Kirjoittaja — nimi, sähköposti ja bio."""
+
+    __tablename__ = "authors"
+
+    id: Mapped[int] = mapped_column(primary_key=True, index=True)
+    name: Mapped[str] = mapped_column(String(255))
+    email: Mapped[str] = mapped_column(String(255), unique=True)
+    bio: Mapped[str | None] = mapped_column(Text, default=None)
+
+    posts: Mapped[list["Post"]] = relationship(back_populates="author")
+
+
+class Post(Base):
+    """Blogipostaus — otsikko, sisältö, kirjoittaja, julkaisuaika ja tila."""
+
+    __tablename__ = "posts"
+
+    id: Mapped[int] = mapped_column(primary_key=True, index=True)
+    title: Mapped[str] = mapped_column(String(255))
+    content: Mapped[str] = mapped_column(Text)
+    author_id: Mapped[int] = mapped_column(ForeignKey("authors.id"))
+    published_at: Mapped[datetime | None] = mapped_column(DateTime, default=None)
+    status: Mapped[str] = mapped_column(String(20), default="draft")
+
+    author: Mapped["Author"] = relationship(back_populates="posts")
+
+
+Base.metadata.create_all(bind=engine)
--- a/kipina-codebench/golden-examples/blog/schemas.py
+++ b/kipina-codebench/golden-examples/blog/schemas.py
@@ -0,0 +1,37 @@
+"""Pydantic v2 -skeemat — Create sisääntulolle, Response vastaukselle."""
+
+from datetime import datetime
+
+from pydantic import BaseModel, ConfigDict
+
+
+class AuthorCreate(BaseModel):
+    """Uuden kirjoittajan luonti. Pakolliset: name, email."""
+
+    name: str
+    email: str
+    bio: str | None = None
+
+
+class AuthorResponse(AuthorCreate):
+    """Palautettava kirjoittaja — sisältää id:n."""
+
+    id: int
+    model_config = ConfigDict(from_attributes=True)
+
+
+class PostCreate(BaseModel):
+    """Uuden postauksen luonti. Pakolliset: title, content, author_id."""
+
+    title: str
+    content: str
+    author_id: int
+    published_at: datetime | None = None
+    status: str = "draft"
+
+
+class PostResponse(PostCreate):
+    """Palautettava postaus — sisältää id:n."""
+
+    id: int
+    model_config = ConfigDict(from_attributes=True)
--- a/kipina-codebench/golden-examples/blog/test_main.py
+++ b/kipina-codebench/golden-examples/blog/test_main.py
@@ -0,0 +1,164 @@
+"""Pytest — TestClient, erillinen test.db, uniikki data per testi."""
+
+from fastapi.testclient import TestClient
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from main import app, get_db
+from models import Base
+
+test_engine = create_engine(
+    "sqlite:///./test.db", connect_args={"check_same_thread": False}
+)
+TestSession = sessionmaker(autocommit=False, autoflush=False, bind=test_engine)
+Base.metadata.create_all(bind=test_engine)
+
+
+def override_get_db():
+    db = TestSession()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+app.dependency_overrides[get_db] = override_get_db
+client = TestClient(app)
+
+
+def _create_author(name="Eino Leino", email=None):
+    """Apufunktio kirjoittajan luomiseen testeissä."""
+    if email is None:
+        email = f"{name.lower().replace(' ', '.')}@example.com"
+    return client.post(
+        "/authors/", json={"name": name, "email": email}
+    ).json()
+
+
+# --- Author-testit ---
+
+
+def test_create_author():
+    response = client.post(
+        "/authors/",
+        json={"name": "Aleksis Kivi", "email": "aleksis@example.com", "bio": "Suomen kansalliskirjailija"},
+    )
+    assert response.status_code == 201
+    assert response.json()["name"] == "Aleksis Kivi"
+    assert response.json()["bio"] == "Suomen kansalliskirjailija"
+    assert "id" in response.json()
+
+
+def test_list_authors():
+    _create_author("Minna Canth", "minna.canth@example.com")
+    response = client.get("/authors/")
+    assert response.status_code == 200
+    assert len(response.json()) >= 1
+
+
+def test_get_author_by_id():
+    created = _create_author("Väinö Linna", "vaino.linna@example.com")
+    response = client.get(f"/authors/{created['id']}")
+    assert response.status_code == 200
+    assert response.json()["id"] == created["id"]
+
+
+def test_get_author_not_found():
+    response = client.get("/authors/99999")
+    assert response.status_code == 404
+
+
+def test_update_author():
+    created = _create_author("Vanha Nimi", "vanha.nimi@example.com")
+    response = client.put(
+        f"/authors/{created['id']}",
+        json={"name": "Uusi Nimi", "email": "uusi.nimi@example.com"},
+    )
+    assert response.status_code == 200
+    assert response.json()["name"] == "Uusi Nimi"
+
+
+def test_delete_author():
+    created = _create_author("Poistettava Kirjailija", "poistettava@example.com")
+    response = client.delete(f"/authors/{created['id']}")
+    assert response.status_code == 204
+    response = client.get(f"/authors/{created['id']}")
+    assert response.status_code == 404
+
+
+# --- Post-testit ---
+
+
+def test_create_post():
+    author = _create_author("Tove Jansson", "tove.jansson@example.com")
+    response = client.post(
+        "/posts/",
+        json={"title": "Muumipeikko ja pyrstötähti", "content": "Eräänä aamuna...", "author_id": author["id"]},
+    )
+    assert response.status_code == 201
+    assert response.json()["title"] == "Muumipeikko ja pyrstötähti"
+    assert response.json()["author_id"] == author["id"]
+    assert response.json()["status"] == "draft"
+
+
+def test_list_posts():
+    author = _create_author("Juhani Aho", "juhani.aho@example.com")
+    client.post(
+        "/posts/",
+        json={"title": "Rautatie", "content": "Junasta kertova novelli.", "author_id": author["id"]},
+    )
+    response = client.get("/posts/")
+    assert response.status_code == 200
+    assert len(response.json()) >= 1
+
+
+def test_get_post_by_id():
+    author = _create_author("Elias Lönnrot", "elias.lonnrot@example.com")
+    created = client.post(
+        "/posts/",
+        json={"title": "Kalevala", "content": "Vaka vanha Väinämöinen.", "author_id": author["id"]},
+    ).json()
+    response = client.get(f"/posts/{created['id']}")
+    assert response.status_code == 200
+    assert response.json()["id"] == created["id"]
+
+
+def test_get_post_not_found():
+    response = client.get("/posts/99999")
+    assert response.status_code == 404
+
+
+def test_update_post():
+    author = _create_author("Joel Lehtonen", "joel.lehtonen@example.com")
+    created = client.post(
+        "/posts/",
+        json={"title": "Vanha otsikko", "content": "Alkuperäinen teksti.", "author_id": author["id"]},
+    ).json()
+    response = client.put(
+        f"/posts/{created['id']}",
+        json={"title": "Päivitetty otsikko", "content": "Muokattu teksti.", "author_id": author["id"], "status": "published"},
+    )
+    assert response.status_code == 200
+    assert response.json()["title"] == "Päivitetty otsikko"
+    assert response.json()["status"] == "published"
+
+
+def test_delete_post():
+    author = _create_author("Aino Kallas", "aino.kallas@example.com")
+    created = client.post(
+        "/posts/",
+        json={"title": "Poistettava postaus", "content": "Tämä poistetaan.", "author_id": author["id"]},
+    ).json()
+    response = client.delete(f"/posts/{created['id']}")
+    assert response.status_code == 204
+    response = client.get(f"/posts/{created['id']}")
+    assert response.status_code == 404
+
+
+def test_post_belongs_to_author():
+    author = _create_author("Sofi Oksanen", "sofi.oksanen@example.com")
+    post = client.post(
+        "/posts/",
+        json={"title": "Puhdistus", "content": "Romaani Virosta.", "author_id": author["id"]},
+    ).json()
+    assert post["author_id"] == author["id"]
--- a/kipina-codebench/golden-examples/todo/main.py
+++ b/kipina-codebench/golden-examples/todo/main.py
@@ -0,0 +1,61 @@
+"""FastAPI CRUD — yksi endpoint-setti per entiteetti."""
+
+from fastapi import FastAPI, Depends, HTTPException
+from sqlalchemy.orm import Session
+
+from models import SessionLocal, Todo
+from schemas import TodoCreate, TodoResponse
+
+app = FastAPI()
+
+
+def get_db():
+    """Tietokantasessio per pyyntö."""
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+@app.post("/todos/", response_model=TodoResponse, status_code=201)
+def create_todo(item: TodoCreate, db: Session = Depends(get_db)):
+    db_item = Todo(**item.model_dump())
+    db.add(db_item)
+    db.commit()
+    db.refresh(db_item)
+    return db_item
+
+
+@app.get("/todos/", response_model=list[TodoResponse])
+def list_todos(db: Session = Depends(get_db)):
+    return db.query(Todo).all()
+
+
+@app.get("/todos/{item_id}", response_model=TodoResponse)
+def get_todo(item_id: int, db: Session = Depends(get_db)):
+    item = db.query(Todo).filter(Todo.id == item_id).first()
+    if not item:
+        raise HTTPException(status_code=404, detail="Todo not found")
+    return item
+
+
+@app.put("/todos/{item_id}", response_model=TodoResponse)
+def update_todo(item_id: int, item: TodoCreate, db: Session = Depends(get_db)):
+    db_item = db.query(Todo).filter(Todo.id == item_id).first()
+    if not db_item:
+        raise HTTPException(status_code=404, detail="Todo not found")
+    for key, value in item.model_dump().items():
+        setattr(db_item, key, value)
+    db.commit()
+    db.refresh(db_item)
+    return db_item
+
+
+@app.delete("/todos/{item_id}", status_code=204)
+def delete_todo(item_id: int, db: Session = Depends(get_db)):
+    db_item = db.query(Todo).filter(Todo.id == item_id).first()
+    if not db_item:
+        raise HTTPException(status_code=404, detail="Todo not found")
+    db.delete(db_item)
+    db.commit()
--- a/kipina-codebench/golden-examples/todo/models.py
+++ b/kipina-codebench/golden-examples/todo/models.py
@@ -0,0 +1,30 @@
+"""Tietokantamallit — SQLAlchemy 2.0, Mapped-tyypitys, SQLite."""
+
+from datetime import date
+
+from sqlalchemy import String, Text, Date, create_engine
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, sessionmaker
+
+DATABASE_URL = "sqlite:///./app.db"
+engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+
+class Base(DeclarativeBase):
+    pass
+
+
+class Todo(Base):
+    """Tehtävä — otsikko, kuvaus, deadline, prioriteetti ja status."""
+
+    __tablename__ = "todos"
+
+    id: Mapped[int] = mapped_column(primary_key=True, index=True)
+    title: Mapped[str] = mapped_column(String(255))
+    description: Mapped[str | None] = mapped_column(Text, default=None)
+    due_date: Mapped[date | None] = mapped_column(Date, default=None)
+    priority: Mapped[int] = mapped_column(default=1)
+    status: Mapped[str] = mapped_column(String(20), default="pending")
+
+
+Base.metadata.create_all(bind=engine)
--- a/kipina-codebench/golden-examples/todo/pyproject.toml
+++ b/kipina-codebench/golden-examples/todo/pyproject.toml
@@ -0,0 +1,11 @@
+[project]
+name = "todo-app"
+version = "0.1.0"
+requires-python = ">=3.14"
+dependencies = [
+    "fastapi",
+    "uvicorn[standard]",
+    "sqlalchemy",
+    "pytest",
+    "httpx",
+]
--- a/kipina-codebench/golden-examples/todo/schemas.py
+++ b/kipina-codebench/golden-examples/todo/schemas.py
@@ -0,0 +1,22 @@
+"""Pydantic v2 -skeemat — Create sisääntulolle, Response vastaukselle."""
+
+from datetime import date
+
+from pydantic import BaseModel, ConfigDict
+
+
+class TodoCreate(BaseModel):
+    """Uuden tehtävän luonti. Pakolliset: title."""
+
+    title: str
+    description: str | None = None
+    due_date: date | None = None
+    priority: int = 1
+    status: str = "pending"
+
+
+class TodoResponse(TodoCreate):
+    """Palautettava tehtävä — sisältää id:n."""
+
+    id: int
+    model_config = ConfigDict(from_attributes=True)
--- a/kipina-codebench/golden-examples/todo/test_main.py
+++ b/kipina-codebench/golden-examples/todo/test_main.py
@@ -0,0 +1,69 @@
+"""Pytest — TestClient, erillinen test.db, uniikki data per testi."""
+
+from fastapi.testclient import TestClient
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from main import app, get_db
+from models import Base
+
+test_engine = create_engine(
+    "sqlite:///./test.db", connect_args={"check_same_thread": False}
+)
+TestSession = sessionmaker(autocommit=False, autoflush=False, bind=test_engine)
+Base.metadata.create_all(bind=test_engine)
+
+
+def override_get_db():
+    db = TestSession()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+app.dependency_overrides[get_db] = override_get_db
+client = TestClient(app)
+
+
+def test_create_todo():
+    response = client.post("/todos/", json={"title": "Osta maitoa", "priority": 2})
+    assert response.status_code == 201
+    assert response.json()["title"] == "Osta maitoa"
+    assert "id" in response.json()
+
+
+def test_list_todos():
+    client.post("/todos/", json={"title": "Listattava tehtävä"})
+    response = client.get("/todos/")
+    assert response.status_code == 200
+    assert len(response.json()) >= 1
+
+
+def test_get_todo_by_id():
+    created = client.post("/todos/", json={"title": "Haettava tehtävä"}).json()
+    response = client.get(f"/todos/{created['id']}")
+    assert response.status_code == 200
+    assert response.json()["id"] == created["id"]
+
+
+def test_get_todo_not_found():
+    response = client.get("/todos/99999")
+    assert response.status_code == 404
+
+
+def test_update_todo():
+    created = client.post("/todos/", json={"title": "Vanha otsikko"}).json()
+    response = client.put(
+        f"/todos/{created['id']}", json={"title": "Uusi otsikko"}
+    )
+    assert response.status_code == 200
+    assert response.json()["title"] == "Uusi otsikko"
+
+
+def test_delete_todo():
+    created = client.post("/todos/", json={"title": "Poistettava"}).json()
+    response = client.delete(f"/todos/{created['id']}")
+    assert response.status_code == 204
+    response = client.get(f"/todos/{created['id']}")
+    assert response.status_code == 404
--- a/kipina-codebench/package.json
+++ b/kipina-codebench/package.json
@@ -0,0 +1,13 @@
+{
+  "name": "kipina-codebench",
+  "version": "0.1.0",
+  "description": "LLM-koodingenerointibenchmark — testaa Ollama-mallien kykyä generoida toimivia FastAPI-projekteja",
+  "type": "module",
+  "bin": {
+    "codebench": "./benchmark.mjs"
+  },
+  "scripts": {
+    "bench": "node benchmark.mjs --scenarios all",
+    "docker:build": "docker build -t kipina-pytest -f Dockerfile.pytest ."
+  }
+}
--- a/kipina-codebench/prompts/client.md
+++ b/kipina-codebench/prompts/client.md
@@ -0,0 +1,15 @@
+You are a product owner who turns vague ideas into clear, actionable software requirements.
+
+GIVEN a short project description from the user, produce a structured brief:
+
+1. PROJECT NAME: a short, descriptive name
+2. GOAL: one sentence explaining what the software does and who it's for
+3. CORE FEATURES: numbered list of 3-8 concrete features (not vague wishes)
+4. DATA MODEL: list the main entities and their key fields (include field types)
+5. API ENDPOINTS: list the REST endpoints (method + path + purpose)
+6. CONSTRAINTS: any technical constraints (e.g. "must use SQLite", "no auth needed")
+
+RULES:
+- Be specific: "User can filter todos by status" not "todo management"
+- Use plain English, no code
+- Maximum 400 words total
--- a/kipina-codebench/prompts/code.md
+++ b/kipina-codebench/prompts/code.md
@@ -0,0 +1,36 @@
+You are a Python backend developer. Generate a FastAPI project with SQLAlchemy and SQLite.
+
+Given the project requirements, JSON specification, and a REFERENCE IMPLEMENTATION, generate these 4 files:
+
+1. models.py — SQLAlchemy 2.0: DeclarativeBase, Mapped, mapped_column (NOT legacy declarative_base)
+2. schemas.py — Pydantic v2: ConfigDict(from_attributes=True) (NOT class Config)
+3. main.py — FastAPI CRUD endpoints for each entity
+4. test_main.py — Pytest with TestClient, separate test.db, unique test data per test
+
+Do NOT generate pyproject.toml — it is created separately with uv.
+
+OUTPUT FORMAT — use these exact markers to separate files:
+
+=== models.py ===
+<python code>
+
+=== schemas.py ===
+<python code>
+
+=== main.py ===
+<python code>
+
+=== test_main.py ===
+<python code>
+
+DOCUMENTATION — every file must have a one-line module docstring. Classes get a one-line docstring. Keep it zensical: say what it IS, not what it does. No filler.
+
+RULES:
+- Follow the REFERENCE IMPLEMENTATION patterns exactly
+- SQLAlchemy 2.0: DeclarativeBase + Mapped + mapped_column (not Column())
+- Python type unions: str | None (not Optional[str])
+- Tests: unique descriptive data per test, NOT generic "test_title" strings
+- Tests: PUT/update test data MUST include ALL required (non-nullable) fields, not just the field being updated
+- Absolute imports only (from models import ..., from schemas import ...)
+- NO markdown fences inside file content — just raw code
+- Only test endpoints that exist in main.py — no extra tests
--- a/kipina-codebench/prompts/fix.md
+++ b/kipina-codebench/prompts/fix.md
@@ -0,0 +1 @@
+You are a Python code fixer. Return ONLY the corrected Python file. No markdown fences, no explanations — just valid Python code.
--- a/kipina-codebench/prompts/spec.md
+++ b/kipina-codebench/prompts/spec.md
@@ -0,0 +1,31 @@
+You are a software architect who designs database schemas for Python web applications.
+
+THINK STEP BY STEP before outputting JSON:
+1. What are the main ENTITIES (nouns) in this project?
+2. What FIELDS does each entity need? (name, type, required?)
+3. Which entities REFERENCE each other? (e.g. "a Book belongs to an Author" → Book has author_id)
+4. Are there Date/DateTime fields? → add extra_imports
+
+Then output ONLY valid JSON (no explanations before or after).
+
+SCHEMA:
+{"project_name":"short-name","description":"One sentence","entities":[{"name":"EntityName","table_name":"entity_names","fields":[{"name":"field_name","sa_type":"String(255)","py_type":"str","nullable":false,"default":null}]}],"relationships":[{"from":"ChildEntity","field":"parent_id","to":"ParentEntity","type":"many-to-one"}],"extra_imports":[]}
+
+FIELD RULES:
+- sa_type: String(N), Text, Integer, Date, DateTime, Boolean, Float
+- py_type: str, int, float, bool, date, datetime — append " | None" if nullable
+- Status fields: use String(20) with default value, NEVER Enum
+- Every entity gets "id" automatically — do NOT add id or redundant ID fields
+- Use snake_case for field names
+
+RELATIONSHIP RULES:
+- If entity A "belongs to" entity B → A has b_id field (Integer, nullable=false) + relationship entry
+- EVERY _id field MUST have a matching relationship entry
+- Parent entities must appear BEFORE children in the entities array
+- If no relationships, set "relationships": []
+
+AVOID: redundant ID fields, generic names, more than 7 fields or 3 entities, non-English entity/field names (ALWAYS English even if description is Finnish)
+
+EXAMPLES (adapt, don't copy):
+Todo app → Todo: title(str), description(Text|None), due_date(Date|None), status(String20="pending")
+Blog → Author: name,email,bio(Text|None) / Post: title, content(Text), author_id→Author, published_at(DateTime|None), status(String20="draft")
--- a/kipina-codebench/report-template.html
+++ b/kipina-codebench/report-template.html
@@ -0,0 +1,183 @@
+<!DOCTYPE html>
+<html lang="fi">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Kipina Model Benchmark</title>
+<style>
+  :root { --bg: #0d1117; --card: #161b22; --border: #30363d; --text: #e6edf3; --dim: #8b949e; --green: #3fb950; --yellow: #d29922; --red: #f85149; --blue: #58a6ff; }
+  * { box-sizing: border-box; margin: 0; padding: 0; }
+  body { font-family: -apple-system, 'Segoe UI', Helvetica, Arial, sans-serif; background: var(--bg); color: var(--text); padding: 2rem; max-width: 1400px; margin: 0 auto; }
+  h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
+  .meta { color: var(--dim); font-size: 0.85rem; margin-bottom: 2rem; }
+  .cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
+  .card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 1rem; }
+  .card .label { color: var(--dim); font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
+  .card .value { font-size: 1.8rem; font-weight: 600; margin-top: 0.25rem; }
+  .card .sub { color: var(--dim); font-size: 0.8rem; margin-top: 0.25rem; }
+  table { width: 100%; border-collapse: collapse; background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; margin-bottom: 2rem; }
+  th { background: #1c2128; text-align: left; padding: 0.6rem 0.8rem; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--dim); cursor: pointer; user-select: none; white-space: nowrap; }
+  th:hover { color: var(--text); }
+  th.sorted-asc::after { content: ' ▲'; }
+  th.sorted-desc::after { content: ' ▼'; }
+  td { padding: 0.5rem 0.8rem; border-top: 1px solid var(--border); font-size: 0.85rem; white-space: nowrap; }
+  tr:hover td { background: #1c2128; }
+  .pass { color: var(--green); }
+  .partial { color: var(--yellow); }
+  .fail { color: var(--red); }
+  .stars { letter-spacing: 1px; }
+  .bar { display: inline-block; height: 8px; border-radius: 4px; vertical-align: middle; }
+  .bar-bg { background: var(--border); }
+  .bar-fill { background: var(--green); }
+  .bar-partial { background: var(--yellow); }
+  .model-name { font-weight: 600; }
+  h2 { font-size: 1.1rem; margin-bottom: 1rem; color: var(--dim); }
+  .summary-table th:first-child, .summary-table td:first-child { min-width: 200px; }
+</style>
+</head>
+<body>
+
+<h1>Kipina Model Benchmark</h1>
+<div class="meta" id="meta"></div>
+
+<div class="cards" id="cards"></div>
+
+<h2>Mallikohtainen yhteenveto</h2>
+<table class="summary-table" id="summary-table"><thead></thead><tbody></tbody></table>
+
+<h2>Kaikki tulokset</h2>
+<table id="results-table"><thead></thead><tbody></tbody></table>
+
+<script>
+const RAW = /*DATA_PLACEHOLDER*/[];
+
+const starsFor = s => s >= 90 ? '★★★★★' : s >= 70 ? '★★★★☆' : s >= 50 ? '★★★☆☆' : s >= 25 ? '★★☆☆☆' : s > 0 ? '★☆☆☆☆' : '☆☆☆☆☆';
+function calcScore(r) {
+  // Score: spec parsed 10 + run produced results 10 + pass ratio up to 60 + fix-round bonus up to 20, capped at 100.
+  if (r.error && r.testsTotal === 0) return 0;
+  const ranTests = r.testsTotal > 0;
+  const specPts = r.specOk ? 10 : 0;
+  const runPts = (!r.error || ranTests) ? 10 : 0;
+  const passPts = ranTests ? Math.round((r.testsPassed / r.testsTotal) * 60) : 0;
+  return Math.min(100, specPts + runPts + passPts + Math.max(0, 20 - (r.fixRounds || 0) * 10));
+}
+// Laske pisteet jos puuttuvat
+const DATA = RAW.map(r => {
+  if (r.score == null) r.score = calcScore(r);
+  if (!r.stars) r.stars = starsFor(r.score);
+  if (!r.promptTokensEst) r.promptTokensEst = r.promptChars ? Math.round(r.promptChars / 4) : 0;
+  return r;
+});
+const cls = r => (!r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0) ? 'pass' : (r.testsTotal > 0 && r.testsPassed > 0) ? 'partial' : 'fail';
+const pctBar = (passed, total, w=80) => {
+  if (total === 0) return '-';
+  const pct = passed/total*100;
+  const c = pct === 100 ? 'bar-fill' : 'bar-partial';
+  return `<span class="bar bar-bg" style="width:${w}px"><span class="bar ${c}" style="width:${Math.round(pct/100*w)}px"></span></span> ${passed}/${total}`;
+};
+
+// Header line: current date at render time (fi-FI format), number of runs, and summed run duration in minutes.
+const totalTime = DATA.reduce((s,r) => s + r.totalDurationMs, 0);
+document.getElementById('meta').textContent = `${new Date().toLocaleDateString('fi-FI')} — ${DATA.length} ajoa — ${(totalTime/1000/60).toFixed(1)} min`;
+
+// Cards
+const models = [...new Set(DATA.map(r => r.model))];
+const scenarios = [...new Set(DATA.map(r => r.scenario))];
+const avgScore = DATA.length ? Math.round(DATA.reduce((s,r) => s + r.score, 0) / DATA.length) : 0;
+const totalPassed = DATA.reduce((s,r) => s + r.testsPassed, 0);
+const totalTests = DATA.reduce((s,r) => s + r.testsTotal, 0);
+const passRate = totalTests ? Math.round(totalPassed/totalTests*100) : 0;
+const bestModel = models.map(m => {
+  const mrs = DATA.filter(r => r.model === m);
+  return { model: m, avg: Math.round(mrs.reduce((s,r) => s + r.score, 0) / mrs.length) };
+}).sort((a,b) => b.avg - a.avg)[0];
+const fastestModel = models.map(m => {
+  const mrs = DATA.filter(r => r.model === m);
+  return { model: m, speed: Math.round(mrs.reduce((s,r) => s + r.avgTokPerSec, 0) / mrs.length) };
+}).sort((a,b) => b.speed - a.speed)[0];
+
+document.getElementById('cards').innerHTML = `
+  <div class="card"><div class="label">Keskiarvo</div><div class="value">${starsFor(avgScore)}</div><div class="sub">${avgScore} pistetta</div></div>
+  <div class="card"><div class="label">Testien läpäisy</div><div class="value">${passRate}%</div><div class="sub">${totalPassed}/${totalTests} testiä</div></div>
+  <div class="card"><div class="label">Paras malli</div><div class="value" style="font-size:1.2rem">${bestModel?.model || '-'}</div><div class="sub">${bestModel?.avg || 0}p</div></div>
+  <div class="card"><div class="label">Nopein</div><div class="value" style="font-size:1.2rem">${fastestModel?.model || '-'}</div><div class="sub">${fastestModel?.speed || 0} tok/s</div></div>
+  <div class="card"><div class="label">Malleja</div><div class="value">${models.length}</div><div class="sub">${scenarios.length} skenaariota</div></div>
+  <div class="card"><div class="label">Kokonaisaika</div><div class="value">${(totalTime/1000/60).toFixed(1)}</div><div class="sub">minuuttia</div></div>
+`;
+
+// Summary table: one row per model with a pass bar + duration per scenario, then totals, average tok/s and average score.
+const sumHead = document.querySelector('#summary-table thead');
+const sumBody = document.querySelector('#summary-table tbody');
+sumHead.innerHTML = '<tr><th>Malli</th>' + scenarios.map(s => `<th>${s}</th>`).join('') + '<th>Yht.</th><th>Out tok</th><th>Aika</th><th>tok/s</th><th>Pisteet</th></tr>';
+
+const modelRows = models.map(m => {
+  const mrs = DATA.filter(r => r.model === m);
+  const tp = mrs.reduce((s,r) => s + r.testsPassed, 0);
+  const tt = mrs.reduce((s,r) => s + r.testsTotal, 0);
+  const tok = mrs.reduce((s,r) => s + r.totalTokens, 0);
+  const time = mrs.reduce((s,r) => s + r.totalDurationMs, 0);
+  const speed = Math.round(mrs.reduce((s,r) => s + r.avgTokPerSec, 0) / mrs.length);
+  const avg = Math.round(mrs.reduce((s,r) => s + r.score, 0) / mrs.length);
+  const scenCols = scenarios.map(s => {
+    const r = mrs.find(r => r.scenario === s);
+    if (!r) return '<td>-</td>'; // this model has no run for the scenario
+    return `<td class="${cls(r)}">${pctBar(r.testsPassed, r.testsTotal, 60)} <span style="color:var(--dim)">${(r.totalDurationMs/1000).toFixed(0)}s</span></td>`;
+  }).join('');
+  return { avg, html: `<tr><td class="model-name">${m}</td>${scenCols}<td>${pctBar(tp, tt)}</td><td>${(tok/1000).toFixed(1)}K</td><td>${(time/1000).toFixed(0)}s</td><td>${speed}</td><td><span class="stars">${starsFor(avg)}</span> ${avg}p</td></tr>` };
+}).sort((a,b) => b.avg - a.avg); // highest average score first
+sumBody.innerHTML = modelRows.map(r => r.html).join('');
+
+// Results table: one row per run; each header cell carries data-col so a click can select the sort column.
+const resHead = document.querySelector('#results-table thead');
+const resBody = document.querySelector('#results-table tbody');
+const resCols = ['Malli','Skenaario','Speksi','Testit','Korjaus','Ctx','Out tok','Aika','tok/s','Pisteet'];
+resHead.innerHTML = '<tr>' + resCols.map((c,i) => `<th data-col="${i}">${c}</th>`).join('') + '</tr>';
+
+let sortCol = 9, sortAsc = false; // default sort: column 9 ('Pisteet'), descending
+function renderResults() {
+  const sorted = [...DATA].sort((a,b) => {
+    const vals = [
+      [a.model, b.model],
+      [a.scenario, b.scenario],
+      [a.specEntities, b.specEntities],
+      [a.testsPassed/Math.max(a.testsTotal,1), b.testsPassed/Math.max(b.testsTotal,1)],
+      [a.fixRounds, b.fixRounds],
+      [a.promptTokensEst, b.promptTokensEst],
+      [a.totalTokens, b.totalTokens],
+      [a.totalDurationMs, b.totalDurationMs],
+      [a.avgTokPerSec, b.avgTokPerSec],
+      [a.score, b.score],
+    ][sortCol];
+    const cmp = typeof vals[0] === 'string' ? vals[0].localeCompare(vals[1]) : vals[0] - vals[1];
+    return sortAsc ? cmp : -cmp;
+  });
+  resBody.innerHTML = sorted.map(r => {
+    const c = cls(r);
+    return `<tr>
+      <td class="model-name">${r.model}</td>
+      <td>${r.scenario}</td>
+      <td>${r.specOk ? `✓ ${r.specEntities}e` : '<span class="fail">✗</span>'}</td>
+      <td class="${c}">${pctBar(r.testsPassed, r.testsTotal)}</td>
+      <td>${r.fixRounds > 0 ? r.fixRounds + '×' : '-'}</td>
+      <td>${r.promptTokensEst > 0 ? '~'+(r.promptTokensEst/1000).toFixed(1)+'K' : '-'}</td>
+      <td>${r.totalTokens > 0 ? (r.totalTokens/1000).toFixed(1)+'K' : '-'}</td>
+      <td>${(r.totalDurationMs/1000).toFixed(0)}s</td>
+      <td>${r.avgTokPerSec.toFixed(0)}</td>
+      <td><span class="stars">${r.stars}</span> ${r.score}p</td>
+    </tr>`;
+  }).join('');
+  document.querySelectorAll('#results-table th').forEach((th,i) => {
+    th.className = i === sortCol ? (sortAsc ? 'sorted-asc' : 'sorted-desc') : '';
+  });
+}
+document.querySelector('#results-table thead').addEventListener('click', e => {
+  const col = parseInt(e.target.dataset.col);
+  if (isNaN(col)) return;
+  if (sortCol === col) sortAsc = !sortAsc;
+  else { sortCol = col; sortAsc = false; }
+  renderResults();
+});
+renderResults();
+</script>
+</body>
+</html>
--- a/kipina-codebench/results/2026-04-14_mistral.html
+++ b/kipina-codebench/results/2026-04-14_mistral.html
@@ -0,0 +1,183 @@
+<!DOCTYPE html>
+<html lang="fi">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Kipina Model Benchmark</title>
+<style>
+  :root { --bg: #0d1117; --card: #161b22; --border: #30363d; --text: #e6edf3; --dim: #8b949e; --green: #3fb950; --yellow: #d29922; --red: #f85149; --blue: #58a6ff; }
+  * { box-sizing: border-box; margin: 0; padding: 0; }
+  body { font-family: -apple-system, 'Segoe UI', Helvetica, Arial, sans-serif; background: var(--bg); color: var(--text); padding: 2rem; max-width: 1400px; margin: 0 auto; }
+  h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
+  .meta { color: var(--dim); font-size: 0.85rem; margin-bottom: 2rem; }
+  .cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
+  .card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 1rem; }
+  .card .label { color: var(--dim); font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
+  .card .value { font-size: 1.8rem; font-weight: 600; margin-top: 0.25rem; }
+  .card .sub { color: var(--dim); font-size: 0.8rem; margin-top: 0.25rem; }
+  table { width: 100%; border-collapse: collapse; background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; margin-bottom: 2rem; }
+  th { background: #1c2128; text-align: left; padding: 0.6rem 0.8rem; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--dim); cursor: pointer; user-select: none; white-space: nowrap; }
+  th:hover { color: var(--text); }
+  th.sorted-asc::after { content: ' ▲'; }
+  th.sorted-desc::after { content: ' ▼'; }
+  td { padding: 0.5rem 0.8rem; border-top: 1px solid var(--border); font-size: 0.85rem; white-space: nowrap; }
+  tr:hover td { background: #1c2128; }
+  .pass { color: var(--green); }
+  .partial { color: var(--yellow); }
+  .fail { color: var(--red); }
+  .stars { letter-spacing: 1px; }
+  .bar { display: inline-block; height: 8px; border-radius: 4px; vertical-align: middle; }
+  .bar-bg { background: var(--border); }
+  .bar-fill { background: var(--green); }
+  .bar-partial { background: var(--yellow); }
+  .model-name { font-weight: 600; }
+  h2 { font-size: 1.1rem; margin-bottom: 1rem; color: var(--dim); }
+  .summary-table th:first-child, .summary-table td:first-child { min-width: 200px; }
+</style>
+</head>
+<body>
+
+<h1>Kipina Model Benchmark</h1>
+<div class="meta" id="meta"></div>
+
+<div class="cards" id="cards"></div>
+
+<h2>Mallikohtainen yhteenveto</h2>
+<table class="summary-table" id="summary-table"><thead></thead><tbody></tbody></table>
+
+<h2>Kaikki tulokset</h2>
+<table id="results-table"><thead></thead><tbody></tbody></table>
+
+<script>
+const RAW = [{"model":"codestral:22b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":6,"testsPassed":6,"testsFailed":0,"totalDurationMs":63028,"totalTokens":2390,"avgTokPerSec":44.09843659433429,"promptChars":9567,"promptTokensEst":2392,"score":100,"stars":"★★★★★","error":null},{"model":"codestral:22b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":4,"testsPassed":4,"testsFailed":0,"totalDurationMs":58359,"totalTokens":2313,"avgTokPerSec":44.04431775388366,"promptChars":9641,"promptTokensEst":2410,"score":100,"stars":"★★★★★","error":null},{"model":"codestral:22b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":52020,"totalTokens":2073,"avgTokPerSec":44.03716103774298,"promptChars":10007,"promptTokensEst":2502,"score":40,"stars":"★★☆☆☆","error":null},{"model":"mistral-small3.1:24b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":7,"testsPassed":6,"testsFailed":1,"totalDurationMs":76602,"totalTokens":2820,"avgTokPerSec":41.65340751865168,"promptChars":10816,"promptTokensEst":2704,"score":91,"stars":"★★★★★","error":null},{"model":"mistral-small3.1:24b","scenario":"users","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":0,"testsPassed":0,"testsFailed":0,"totalDurationMs":0,"totalTokens":0,"avgTokPerSec":0,"promptChars":11004,"promptTokensEst":2751,"score":0,"stars":"","error":"Puuttuvat: test_main.py"},{"model":"mistral-small3.1:24b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":0,"testsPassed":0,"testsFailed":0,"totalDurationMs":0,"totalTokens":0,"avgTokPerSec":0,"promptChars":10573,"promptTokensEst":2643,"score":0,"stars":"","error":"Puuttuvat: 
test_main.py"},{"model":"devstral:24b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":54454,"totalTokens":1952,"avgTokPerSec":42.767057828688735,"promptChars":9829,"promptTokensEst":2457,"score":40,"stars":"★★☆☆☆","error":null},{"model":"devstral:24b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":5,"testsPassed":1,"testsFailed":4,"totalDurationMs":50447,"totalTokens":1954,"avgTokPerSec":42.79877112859477,"promptChars":9678,"promptTokensEst":2420,"score":52,"stars":"★★★☆☆","error":null},{"model":"devstral:24b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":83061,"totalTokens":3251,"avgTokPerSec":42.647732012717476,"promptChars":10561,"promptTokensEst":2640,"score":40,"stars":"★★☆☆☆","error":null}];
+
+const starsFor = s => { const filled = (s > 0 ? 1 : 0) + [25, 50, 70, 90].filter(min => s >= min).length; return '★'.repeat(filled) + '☆'.repeat(5 - filled); }; // score → five-character star string
+// Score one run, capped at 100: spec 10 + (no error or tests present) 10 + pass ratio × 60 + fix bonus 20.
+function calcScore(r) {
+  if (r.error && r.testsTotal === 0) return 0; // errored with no tests at all
+  const specPts = r.specOk ? 10 : 0;
+  const ranPts = (!r.error || r.testsTotal > 0) ? 10 : 0;
+  const testPts = r.testsTotal > 0 ? Math.round((r.testsPassed / r.testsTotal) * 60) : 0;
+  const fixPts = Math.max(0, 20 - (r.fixRounds || 0) * 10); // each fix round costs 10, floor at 0
+  return Math.min(100, specPts + ranPts + testPts + fixPts);
+}
+// Compute score, stars and the prompt-token estimate for records that lack them.
+// Each record is copied, so the embedded RAW objects are never modified.
+const DATA = RAW.map(r => {
+  const score = r.score ?? calcScore(r);
+  const promptTokensEst = r.promptTokensEst || (r.promptChars ? Math.round(r.promptChars / 4) : 0); // fallback: chars / 4
+  return { ...r, score, stars: r.stars || starsFor(score), promptTokensEst };
+});
+const cls = r => { if (!(r.testsTotal > 0)) return 'fail'; if (!r.error && r.testsPassed === r.testsTotal) return 'pass'; return r.testsPassed > 0 ? 'partial' : 'fail'; }; // CSS class for a run's test outcome
+const pctBar = (passed, total, w=80) => { // w-pixel progress bar followed by "passed/total"; '-' when total is 0
+  if (total === 0) return '-';
+  const pct = passed/total*100;
+  const inner = `<span class="bar ${pct === 100 ? 'bar-fill' : 'bar-partial'}" style="width:${Math.round(pct/100*w)}px"></span>`;
+  return `<span class="bar bar-bg" style="width:${w}px">${inner}</span> ${passed}/${total}`;
+};
+
+// Meta line: the date the page is opened (new Date()), the number of runs, and the summed run time in minutes
+const totalTime = DATA.map(run => run.totalDurationMs).reduce((sum, ms) => sum + ms, 0);
+document.getElementById('meta').textContent = [new Date().toLocaleDateString('fi-FI'), `${DATA.length} ajoa`, `${(totalTime/1000/60).toFixed(1)} min`].join(' — ');
+
+// Cards: aggregate figures for the summary tiles
+const models = Array.from(new Set(DATA.map(run => run.model)));
+const scenarios = Array.from(new Set(DATA.map(run => run.scenario)));
+const avgScore = DATA.length === 0 ? 0 : Math.round(DATA.reduce((acc, run) => acc + run.score, 0) / DATA.length);
+const totalPassed = DATA.reduce((acc, run) => acc + run.testsPassed, 0);
+const totalTests = DATA.reduce((acc, run) => acc + run.testsTotal, 0);
+const passRate = totalTests ? Math.round(totalPassed/totalTests*100) : 0;
+// Rounded mean of one numeric field over all runs of model m
+const modelMean = (m, field) => { const runs = DATA.filter(run => run.model === m); return Math.round(runs.reduce((acc, run) => acc + run[field], 0) / runs.length); };
+const bestModel = models
+  .map(m => ({ model: m, avg: modelMean(m, 'score') }))
+  .sort((x, y) => y.avg - x.avg)[0]; // highest mean score; undefined when there are no models
+const fastestModel = models
+  .map(m => ({ model: m, speed: modelMean(m, 'avgTokPerSec') }))
+  .sort((x, y) => y.speed - x.speed)[0]; // highest mean tok/s; undefined when there are no models
+
+document.getElementById('cards').innerHTML = `
+  <div class="card"><div class="label">Keskiarvo</div><div class="value">${starsFor(avgScore)}</div><div class="sub">${avgScore} pistettä</div></div>
+  <div class="card"><div class="label">Testien läpäisy</div><div class="value">${passRate}%</div><div class="sub">${totalPassed}/${totalTests} testiä</div></div>
+  <div class="card"><div class="label">Paras malli</div><div class="value" style="font-size:1.2rem">${bestModel?.model || '-'}</div><div class="sub">${bestModel?.avg || 0}p</div></div>
+  <div class="card"><div class="label">Nopein</div><div class="value" style="font-size:1.2rem">${fastestModel?.model || '-'}</div><div class="sub">${fastestModel?.speed || 0} tok/s</div></div>
+  <div class="card"><div class="label">Malleja</div><div class="value">${models.length}</div><div class="sub">${scenarios.length} skenaariota</div></div>
+  <div class="card"><div class="label">Kokonaisaika</div><div class="value">${(totalTime/1000/60).toFixed(1)}</div><div class="sub">minuuttia</div></div>
+`; // six summary tiles; bestModel/fastestModel are undefined when DATA is empty, hence the ?. fallbacks
+
+// Summary table: one row per model, one column per scenario, rows ordered by mean score (highest first)
+const sumHead = document.querySelector('#summary-table thead');
+const sumBody = document.querySelector('#summary-table tbody');
+sumHead.innerHTML = `<tr><th>Malli</th>${scenarios.map(s => `<th>${s}</th>`).join('')}<th>Yht.</th><th>Out tok</th><th>Aika</th><th>tok/s</th><th>Pisteet</th></tr>`;
+
+const modelRows = models.map(m => {
+  const runs = DATA.filter(run => run.model === m);
+  const total = field => runs.reduce((acc, run) => acc + run[field], 0); // sum of one numeric field over this model's runs
+  const passed = total('testsPassed');
+  const tests = total('testsTotal');
+  const outTok = total('totalTokens');
+  const elapsedMs = total('totalDurationMs');
+  const speed = Math.round(total('avgTokPerSec') / runs.length);
+  const avg = Math.round(total('score') / runs.length);
+  const scenCols = scenarios.map(s => {
+    const run = runs.find(x => x.scenario === s);
+    return run ? `<td class="${cls(run)}">${pctBar(run.testsPassed, run.testsTotal, 60)} <span style="color:var(--dim)">${(run.totalDurationMs/1000).toFixed(0)}s</span></td>` : '<td>-</td>';
+  }).join('');
+  return { avg, html: `<tr><td class="model-name">${m}</td>${scenCols}<td>${pctBar(passed, tests)}</td><td>${(outTok/1000).toFixed(1)}K</td><td>${(elapsedMs/1000).toFixed(0)}s</td><td>${speed}</td><td><span class="stars">${starsFor(avg)}</span> ${avg}p</td></tr>` };
+}).sort((x, y) => y.avg - x.avg);
+sumBody.innerHTML = modelRows.map(row => row.html).join('');
+
+// Results table: each header cell stores its column index in data-col, which the click handler reads
+const resHead = document.querySelector('#results-table thead');
+const resBody = document.querySelector('#results-table tbody');
+const resCols = ['Malli','Skenaario','Speksi','Testit','Korjaus','Ctx','Out tok','Aika','tok/s','Pisteet'];
+resHead.innerHTML = `<tr>${resCols.map((label, idx) => `<th data-col="${idx}">${label}</th>`).join('')}</tr>`;
+let sortCol = 9; // start sorted by the last column (Pisteet)
+let sortAsc = false; // descending
+// Re-sort DATA by the active column and direction, repaint the results tbody, flag the sorted header.
+function renderResults() {
+  const keyFns = [ // sort-key extractor per column index
+    r => r.model,
+    r => r.scenario,
+    r => r.specEntities,
+    r => r.testsPassed/Math.max(r.testsTotal,1), // max(…,1): no division by zero when no tests ran
+    r => r.fixRounds,
+    r => r.promptTokensEst,
+    r => r.totalTokens,
+    r => r.totalDurationMs,
+    r => r.avgTokPerSec,
+    r => r.score,
+  ];
+  const sorted = [...DATA].sort((a,b) => {
+    const ka = keyFns[sortCol](a), kb = keyFns[sortCol](b);
+    const cmp = typeof ka === 'string' ? ka.localeCompare(kb) : ka - kb;
+    return sortAsc ? cmp : -cmp;
+  });
+  resBody.innerHTML = sorted.map(r => `<tr>
+      <td class="model-name">${r.model}</td>
+      <td>${r.scenario}</td>
+      <td>${r.specOk ? `✓ ${r.specEntities}e` : '<span class="fail">✗</span>'}</td>
+      <td class="${cls(r)}">${pctBar(r.testsPassed, r.testsTotal)}</td>
+      <td>${r.fixRounds > 0 ? `${r.fixRounds}×` : '-'}</td>
+      <td>${r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-'}</td>
+      <td>${r.totalTokens > 0 ? `${(r.totalTokens/1000).toFixed(1)}K` : '-'}</td>
+      <td>${(r.totalDurationMs/1000).toFixed(0)}s</td>
+      <td>${r.avgTokPerSec.toFixed(0)}</td>
+      <td><span class="stars">${r.stars}</span> ${r.score}p</td>
+    </tr>`).join('');
+  const sortedCls = sortAsc ? 'sorted-asc' : 'sorted-desc';
+  document.querySelectorAll('#results-table th').forEach((th, idx) => {
+    th.className = idx === sortCol ? sortedCls : '';
+  });
+}
+document.querySelector('#results-table thead').addEventListener('click', e => { // same column toggles direction; a new column starts descending
+  const col = Number.parseInt(e.target.closest('th')?.dataset.col, 10); // explicit radix; NaN when the click did not land in a header cell
+  if (Number.isNaN(col)) return;
+  if (sortCol === col) sortAsc = !sortAsc;
+  else { sortCol = col; sortAsc = false; }
+  renderResults();
+});
+renderResults(); // initial paint with the default sort
+</script>
+</body>
+</html>
--- a/kipina-codebench/results/2026-04-14_mistral.json
+++ b/kipina-codebench/results/2026-04-14_mistral.json
@@ -0,0 +1,182 @@
+[
+  {
+    "model": "codestral:22b",
+    "scenario": "todo",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 1,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 6,
+    "testsPassed": 6,
+    "testsFailed": 0,
+    "totalDurationMs": 63028,
+    "totalTokens": 2390,
+    "avgTokPerSec": 44.09843659433429,
+    "promptChars": 9567,
+    "promptTokensEst": 2392,
+    "score": 100,
+    "stars": "★★★★★",
+    "error": null
+  },
+  {
+    "model": "codestral:22b",
+    "scenario": "users",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 1,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 4,
+    "testsPassed": 4,
+    "testsFailed": 0,
+    "totalDurationMs": 58359,
+    "totalTokens": 2313,
+    "avgTokPerSec": 44.04431775388366,
+    "promptChars": 9641,
+    "promptTokensEst": 2410,
+    "score": 100,
+    "stars": "★★★★★",
+    "error": null
+  },
+  {
+    "model": "codestral:22b",
+    "scenario": "blog",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 2,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 1,
+    "testsPassed": 0,
+    "testsFailed": 1,
+    "totalDurationMs": 52020,
+    "totalTokens": 2073,
+    "avgTokPerSec": 44.03716103774298,
+    "promptChars": 10007,
+    "promptTokensEst": 2502,
+    "score": 40,
+    "stars": "★★☆☆☆",
+    "error": null
+  },
+  {
+    "model": "mistral-small3.1:24b",
+    "scenario": "todo",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 1,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 7,
+    "testsPassed": 6,
+    "testsFailed": 1,
+    "totalDurationMs": 76602,
+    "totalTokens": 2820,
+    "avgTokPerSec": 41.65340751865168,
+    "promptChars": 10816,
+    "promptTokensEst": 2704,
+    "score": 91,
+    "stars": "★★★★★",
+    "error": null
+  },
+  {
+    "model": "mistral-small3.1:24b",
+    "scenario": "users",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 2,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 0,
+    "testsPassed": 0,
+    "testsFailed": 0,
+    "totalDurationMs": 0,
+    "totalTokens": 0,
+    "avgTokPerSec": 0,
+    "promptChars": 11004,
+    "promptTokensEst": 2751,
+    "score": 0,
+    "stars": "",
+    "error": "Puuttuvat: test_main.py"
+  },
+  {
+    "model": "mistral-small3.1:24b",
+    "scenario": "blog",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 2,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 0,
+    "testsPassed": 0,
+    "testsFailed": 0,
+    "totalDurationMs": 0,
+    "totalTokens": 0,
+    "avgTokPerSec": 0,
+    "promptChars": 10573,
+    "promptTokensEst": 2643,
+    "score": 0,
+    "stars": "",
+    "error": "Puuttuvat: test_main.py"
+  },
+  {
+    "model": "devstral:24b",
+    "scenario": "todo",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 1,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 1,
+    "testsPassed": 0,
+    "testsFailed": 1,
+    "totalDurationMs": 54454,
+    "totalTokens": 1952,
+    "avgTokPerSec": 42.767057828688735,
+    "promptChars": 9829,
+    "promptTokensEst": 2457,
+    "score": 40,
+    "stars": "★★☆☆☆",
+    "error": null
+  },
+  {
+    "model": "devstral:24b",
+    "scenario": "users",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 1,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 5,
+    "testsPassed": 1,
+    "testsFailed": 4,
+    "totalDurationMs": 50447,
+    "totalTokens": 1954,
+    "avgTokPerSec": 42.79877112859477,
+    "promptChars": 9678,
+    "promptTokensEst": 2420,
+    "score": 52,
+    "stars": "★★★☆☆",
+    "error": null
+  },
+  {
+    "model": "devstral:24b",
+    "scenario": "blog",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 2,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 1,
+    "testsPassed": 0,
+    "testsFailed": 1,
+    "totalDurationMs": 83061,
+    "totalTokens": 3251,
+    "avgTokPerSec": 42.647732012717476,
+    "promptChars": 10561,
+    "promptTokensEst": 2640,
+    "score": 40,
+    "stars": "★★☆☆☆",
+    "error": null
+  }
+]
--- a/kipina-codebench/results/2026-04-14_top3.html
+++ b/kipina-codebench/results/2026-04-14_top3.html
@@ -0,0 +1,183 @@
+<!DOCTYPE html>
+<html lang="fi">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Kipina Model Benchmark</title>
+<style>
+  :root { --bg: #0d1117; --card: #161b22; --border: #30363d; --text: #e6edf3; --dim: #8b949e; --green: #3fb950; --yellow: #d29922; --red: #f85149; --blue: #58a6ff; }
+  * { box-sizing: border-box; margin: 0; padding: 0; }
+  body { font-family: -apple-system, 'Segoe UI', Helvetica, Arial, sans-serif; background: var(--bg); color: var(--text); padding: 2rem; max-width: 1400px; margin: 0 auto; }
+  h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
+  .meta { color: var(--dim); font-size: 0.85rem; margin-bottom: 2rem; }
+  .cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
+  .card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 1rem; }
+  .card .label { color: var(--dim); font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
+  .card .value { font-size: 1.8rem; font-weight: 600; margin-top: 0.25rem; }
+  .card .sub { color: var(--dim); font-size: 0.8rem; margin-top: 0.25rem; }
+  table { width: 100%; border-collapse: collapse; background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; margin-bottom: 2rem; }
+  th { background: #1c2128; text-align: left; padding: 0.6rem 0.8rem; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--dim); cursor: pointer; user-select: none; white-space: nowrap; }
+  th:hover { color: var(--text); }
+  th.sorted-asc::after { content: ' ▲'; }
+  th.sorted-desc::after { content: ' ▼'; }
+  td { padding: 0.5rem 0.8rem; border-top: 1px solid var(--border); font-size: 0.85rem; white-space: nowrap; }
+  tr:hover td { background: #1c2128; }
+  .pass { color: var(--green); }
+  .partial { color: var(--yellow); }
+  .fail { color: var(--red); }
+  .stars { letter-spacing: 1px; }
+  .bar { display: inline-block; height: 8px; border-radius: 4px; vertical-align: middle; }
+  .bar-bg { background: var(--border); }
+  .bar-fill { background: var(--green); }
+  .bar-partial { background: var(--yellow); }
+  .model-name { font-weight: 600; }
+  h2 { font-size: 1.1rem; margin-bottom: 1rem; color: var(--dim); }
+  .summary-table th:first-child, .summary-table td:first-child { min-width: 200px; }
+</style>
+</head>
+<body>
+
+<h1>Kipina Model Benchmark</h1>
+<div class="meta" id="meta"></div>
+
+<div class="cards" id="cards"></div>
+
+<h2>Mallikohtainen yhteenveto</h2>
+<table class="summary-table" id="summary-table"><thead></thead><tbody></tbody></table>
+
+<h2>Kaikki tulokset</h2>
+<table id="results-table"><thead></thead><tbody></tbody></table>
+
+<script>
+const RAW = [{"model":"qwen3.5:35b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":63592,"totalTokens":4103,"avgTokPerSec":88.29857987765199,"promptChars":11310,"promptTokensEst":2828,"score":40,"stars":"★★☆☆☆","error":null},{"model":"qwen3.5:35b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":35262,"totalTokens":2733,"avgTokPerSec":88.26749243915684,"promptChars":10165,"promptTokensEst":2541,"score":40,"stars":"★★☆☆☆","error":null},{"model":"qwen3.5:35b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":60346,"totalTokens":4728,"avgTokPerSec":87.67792775342463,"promptChars":11661,"promptTokensEst":2915,"score":40,"stars":"★★☆☆☆","error":null},{"model":"codestral:22b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":5,"testsPassed":4,"testsFailed":1,"totalDurationMs":80515,"totalTokens":3081,"avgTokPerSec":43.828884806830445,"promptChars":10150,"promptTokensEst":2538,"score":88,"stars":"★★★★☆","error":null},{"model":"codestral:22b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":5,"testsPassed":3,"testsFailed":2,"totalDurationMs":61598,"totalTokens":2441,"avgTokPerSec":44.017116943523455,"promptChars":9288,"promptTokensEst":2322,"score":76,"stars":"★★★★☆","error":null},{"model":"codestral:22b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":6,"testsPassed":6,"testsFailed":0,"totalDurationMs":81568,"totalTokens":3229,"avgTokPerSec":43.67638078062432,"promptChars":10475,"promptTokensEst":2619,"score":100,"stars":"★★★★★","error":null},{"model":"qwen
3-coder:30b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":6,"testsPassed":6,"testsFailed":0,"totalDurationMs":30315,"totalTokens":2379,"avgTokPerSec":123.42041099401449,"promptChars":10111,"promptTokensEst":2528,"score":100,"stars":"★★★★★","error":null},{"model":"qwen3-coder:30b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":7,"testsPassed":6,"testsFailed":1,"totalDurationMs":23071,"totalTokens":2443,"avgTokPerSec":123.11212122029796,"promptChars":9150,"promptTokensEst":2288,"score":91,"stars":"★★★★★","error":null},{"model":"qwen3-coder:30b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":11,"testsPassed":11,"testsFailed":0,"totalDurationMs":40933,"totalTokens":4370,"avgTokPerSec":121.8144240305409,"promptChars":10789,"promptTokensEst":2697,"score":100,"stars":"★★★★★","error":null}];
+
+const starsFor = s => { const filled = (s > 0 ? 1 : 0) + [25, 50, 70, 90].filter(min => s >= min).length; return '★'.repeat(filled) + '☆'.repeat(5 - filled); }; // score → five-character star string
+// Score one run, capped at 100: spec 10 + (no error or tests present) 10 + pass ratio × 60 + fix bonus 20.
+function calcScore(r) {
+  if (r.error && r.testsTotal === 0) return 0; // errored with no tests at all
+  const specPts = r.specOk ? 10 : 0;
+  const ranPts = (!r.error || r.testsTotal > 0) ? 10 : 0;
+  const testPts = r.testsTotal > 0 ? Math.round((r.testsPassed / r.testsTotal) * 60) : 0;
+  const fixPts = Math.max(0, 20 - (r.fixRounds || 0) * 10); // each fix round costs 10, floor at 0
+  return Math.min(100, specPts + ranPts + testPts + fixPts);
+}
+// Compute score, stars and the prompt-token estimate for records that lack them.
+// Each record is copied, so the embedded RAW objects are never modified.
+const DATA = RAW.map(r => {
+  const score = r.score ?? calcScore(r);
+  const promptTokensEst = r.promptTokensEst || (r.promptChars ? Math.round(r.promptChars / 4) : 0); // fallback: chars / 4
+  return { ...r, score, stars: r.stars || starsFor(score), promptTokensEst };
+});
+const cls = r => { if (!(r.testsTotal > 0)) return 'fail'; if (!r.error && r.testsPassed === r.testsTotal) return 'pass'; return r.testsPassed > 0 ? 'partial' : 'fail'; }; // CSS class for a run's test outcome
+const pctBar = (passed, total, w=80) => { // w-pixel progress bar followed by "passed/total"; '-' when total is 0
+  if (total === 0) return '-';
+  const pct = passed/total*100;
+  const inner = `<span class="bar ${pct === 100 ? 'bar-fill' : 'bar-partial'}" style="width:${Math.round(pct/100*w)}px"></span>`;
+  return `<span class="bar bar-bg" style="width:${w}px">${inner}</span> ${passed}/${total}`;
+};
+
+// Meta line: the date the page is opened (new Date()), the number of runs, and the summed run time in minutes
+const totalTime = DATA.map(run => run.totalDurationMs).reduce((sum, ms) => sum + ms, 0);
+document.getElementById('meta').textContent = [new Date().toLocaleDateString('fi-FI'), `${DATA.length} ajoa`, `${(totalTime/1000/60).toFixed(1)} min`].join(' — ');
+
+// Cards: aggregate figures for the summary tiles
+const models = Array.from(new Set(DATA.map(run => run.model)));
+const scenarios = Array.from(new Set(DATA.map(run => run.scenario)));
+const avgScore = DATA.length === 0 ? 0 : Math.round(DATA.reduce((acc, run) => acc + run.score, 0) / DATA.length);
+const totalPassed = DATA.reduce((acc, run) => acc + run.testsPassed, 0);
+const totalTests = DATA.reduce((acc, run) => acc + run.testsTotal, 0);
+const passRate = totalTests ? Math.round(totalPassed/totalTests*100) : 0;
+// Rounded mean of one numeric field over all runs of model m
+const modelMean = (m, field) => { const runs = DATA.filter(run => run.model === m); return Math.round(runs.reduce((acc, run) => acc + run[field], 0) / runs.length); };
+const bestModel = models
+  .map(m => ({ model: m, avg: modelMean(m, 'score') }))
+  .sort((x, y) => y.avg - x.avg)[0]; // highest mean score; undefined when there are no models
+const fastestModel = models
+  .map(m => ({ model: m, speed: modelMean(m, 'avgTokPerSec') }))
+  .sort((x, y) => y.speed - x.speed)[0]; // highest mean tok/s; undefined when there are no models
+
+document.getElementById('cards').innerHTML = `
+  <div class="card"><div class="label">Keskiarvo</div><div class="value">${starsFor(avgScore)}</div><div class="sub">${avgScore} pistettä</div></div>
+  <div class="card"><div class="label">Testien läpäisy</div><div class="value">${passRate}%</div><div class="sub">${totalPassed}/${totalTests} testiä</div></div>
+  <div class="card"><div class="label">Paras malli</div><div class="value" style="font-size:1.2rem">${bestModel?.model || '-'}</div><div class="sub">${bestModel?.avg || 0}p</div></div>
+  <div class="card"><div class="label">Nopein</div><div class="value" style="font-size:1.2rem">${fastestModel?.model || '-'}</div><div class="sub">${fastestModel?.speed || 0} tok/s</div></div>
+  <div class="card"><div class="label">Malleja</div><div class="value">${models.length}</div><div class="sub">${scenarios.length} skenaariota</div></div>
+  <div class="card"><div class="label">Kokonaisaika</div><div class="value">${(totalTime/1000/60).toFixed(1)}</div><div class="sub">minuuttia</div></div>
+`; // six summary tiles; bestModel/fastestModel are undefined when DATA is empty, hence the ?. fallbacks
+
+// Summary table: one row per model, one column per scenario, rows ordered by mean score (highest first)
+const sumHead = document.querySelector('#summary-table thead');
+const sumBody = document.querySelector('#summary-table tbody');
+sumHead.innerHTML = `<tr><th>Malli</th>${scenarios.map(s => `<th>${s}</th>`).join('')}<th>Yht.</th><th>Out tok</th><th>Aika</th><th>tok/s</th><th>Pisteet</th></tr>`;
+
+const modelRows = models.map(m => {
+  const runs = DATA.filter(run => run.model === m);
+  const total = field => runs.reduce((acc, run) => acc + run[field], 0); // sum of one numeric field over this model's runs
+  const passed = total('testsPassed');
+  const tests = total('testsTotal');
+  const outTok = total('totalTokens');
+  const elapsedMs = total('totalDurationMs');
+  const speed = Math.round(total('avgTokPerSec') / runs.length);
+  const avg = Math.round(total('score') / runs.length);
+  const scenCols = scenarios.map(s => {
+    const run = runs.find(x => x.scenario === s);
+    return run ? `<td class="${cls(run)}">${pctBar(run.testsPassed, run.testsTotal, 60)} <span style="color:var(--dim)">${(run.totalDurationMs/1000).toFixed(0)}s</span></td>` : '<td>-</td>';
+  }).join('');
+  return { avg, html: `<tr><td class="model-name">${m}</td>${scenCols}<td>${pctBar(passed, tests)}</td><td>${(outTok/1000).toFixed(1)}K</td><td>${(elapsedMs/1000).toFixed(0)}s</td><td>${speed}</td><td><span class="stars">${starsFor(avg)}</span> ${avg}p</td></tr>` };
+}).sort((x, y) => y.avg - x.avg);
+sumBody.innerHTML = modelRows.map(row => row.html).join('');
+
+// Results table: each header cell stores its column index in data-col, which the click handler reads
+const resHead = document.querySelector('#results-table thead');
+const resBody = document.querySelector('#results-table tbody');
+const resCols = ['Malli','Skenaario','Speksi','Testit','Korjaus','Ctx','Out tok','Aika','tok/s','Pisteet'];
+resHead.innerHTML = `<tr>${resCols.map((label, idx) => `<th data-col="${idx}">${label}</th>`).join('')}</tr>`;
+let sortCol = 9; // start sorted by the last column (Pisteet)
+let sortAsc = false; // descending
+// Re-sort DATA by the active column and direction, repaint the results tbody, flag the sorted header.
+function renderResults() {
+  const keyFns = [ // sort-key extractor per column index
+    r => r.model,
+    r => r.scenario,
+    r => r.specEntities,
+    r => r.testsPassed/Math.max(r.testsTotal,1), // max(…,1): no division by zero when no tests ran
+    r => r.fixRounds,
+    r => r.promptTokensEst,
+    r => r.totalTokens,
+    r => r.totalDurationMs,
+    r => r.avgTokPerSec,
+    r => r.score,
+  ];
+  const sorted = [...DATA].sort((a,b) => {
+    const ka = keyFns[sortCol](a), kb = keyFns[sortCol](b);
+    const cmp = typeof ka === 'string' ? ka.localeCompare(kb) : ka - kb;
+    return sortAsc ? cmp : -cmp;
+  });
+  resBody.innerHTML = sorted.map(r => `<tr>
+      <td class="model-name">${r.model}</td>
+      <td>${r.scenario}</td>
+      <td>${r.specOk ? `✓ ${r.specEntities}e` : '<span class="fail">✗</span>'}</td>
+      <td class="${cls(r)}">${pctBar(r.testsPassed, r.testsTotal)}</td>
+      <td>${r.fixRounds > 0 ? `${r.fixRounds}×` : '-'}</td>
+      <td>${r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-'}</td>
+      <td>${r.totalTokens > 0 ? `${(r.totalTokens/1000).toFixed(1)}K` : '-'}</td>
+      <td>${(r.totalDurationMs/1000).toFixed(0)}s</td>
+      <td>${r.avgTokPerSec.toFixed(0)}</td>
+      <td><span class="stars">${r.stars}</span> ${r.score}p</td>
+    </tr>`).join('');
+  const sortedCls = sortAsc ? 'sorted-asc' : 'sorted-desc';
+  document.querySelectorAll('#results-table th').forEach((th, idx) => {
+    th.className = idx === sortCol ? sortedCls : '';
+  });
+}
+document.querySelector('#results-table thead').addEventListener('click', e => { // same column toggles direction; a new column starts descending
+  const col = Number.parseInt(e.target.closest('th')?.dataset.col, 10); // explicit radix; NaN when the click did not land in a header cell
+  if (Number.isNaN(col)) return;
+  if (sortCol === col) sortAsc = !sortAsc;
+  else { sortCol = col; sortAsc = false; }
+  renderResults();
+});
+renderResults(); // initial paint with the default sort
+</script>
+</body>
+</html>
--- a/kipina-codebench/results/2026-04-14_top3.json
+++ b/kipina-codebench/results/2026-04-14_top3.json
@@ -0,0 +1,182 @@
+[
+  {
+    "model": "qwen3.5:35b",
+    "scenario": "todo",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 2,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 1,
+    "testsPassed": 0,
+    "testsFailed": 1,
+    "totalDurationMs": 63592,
+    "totalTokens": 4103,
+    "avgTokPerSec": 88.29857987765199,
+    "promptChars": 11310,
+    "promptTokensEst": 2828,
+    "score": 40,
+    "stars": "★★☆☆☆",
+    "error": null
+  },
+  {
+    "model": "qwen3.5:35b",
+    "scenario": "users",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 1,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 1,
+    "testsPassed": 0,
+    "testsFailed": 1,
+    "totalDurationMs": 35262,
+    "totalTokens": 2733,
+    "avgTokPerSec": 88.26749243915684,
+    "promptChars": 10165,
+    "promptTokensEst": 2541,
+    "score": 40,
+    "stars": "★★☆☆☆",
+    "error": null
+  },
+  {
+    "model": "qwen3.5:35b",
+    "scenario": "blog",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 2,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 1,
+    "testsPassed": 0,
+    "testsFailed": 1,
+    "totalDurationMs": 60346,
+    "totalTokens": 4728,
+    "avgTokPerSec": 87.67792775342463,
+    "promptChars": 11661,
+    "promptTokensEst": 2915,
+    "score": 40,
+    "stars": "★★☆☆☆",
+    "error": null
+  },
+  {
+    "model": "codestral:22b",
+    "scenario": "todo",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 2,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 5,
+    "testsPassed": 4,
+    "testsFailed": 1,
+    "totalDurationMs": 80515,
+    "totalTokens": 3081,
+    "avgTokPerSec": 43.828884806830445,
+    "promptChars": 10150,
+    "promptTokensEst": 2538,
+    "score": 88,
+    "stars": "★★★★☆",
+    "error": null
+  },
+  {
+    "model": "codestral:22b",
+    "scenario": "users",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 1,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 5,
+    "testsPassed": 3,
+    "testsFailed": 2,
+    "totalDurationMs": 61598,
+    "totalTokens": 2441,
+    "avgTokPerSec": 44.017116943523455,
+    "promptChars": 9288,
+    "promptTokensEst": 2322,
+    "score": 76,
+    "stars": "★★★★☆",
+    "error": null
+  },
+  {
+    "model": "codestral:22b",
+    "scenario": "blog",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 2,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 6,
+    "testsPassed": 6,
+    "testsFailed": 0,
+    "totalDurationMs": 81568,
+    "totalTokens": 3229,
+    "avgTokPerSec": 43.67638078062432,
+    "promptChars": 10475,
+    "promptTokensEst": 2619,
+    "score": 100,
+    "stars": "★★★★★",
+    "error": null
+  },
+  {
+    "model": "qwen3-coder:30b",
+    "scenario": "todo",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 1,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 6,
+    "testsPassed": 6,
+    "testsFailed": 0,
+    "totalDurationMs": 30315,
+    "totalTokens": 2379,
+    "avgTokPerSec": 123.42041099401449,
+    "promptChars": 10111,
+    "promptTokensEst": 2528,
+    "score": 100,
+    "stars": "★★★★★",
+    "error": null
+  },
+  {
+    "model": "qwen3-coder:30b",
+    "scenario": "users",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 1,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 7,
+    "testsPassed": 6,
+    "testsFailed": 1,
+    "totalDurationMs": 23071,
+    "totalTokens": 2443,
+    "avgTokPerSec": 123.11212122029796,
+    "promptChars": 9150,
+    "promptTokensEst": 2288,
+    "score": 91,
+    "stars": "★★★★★",
+    "error": null
+  },
+  {
+    "model": "qwen3-coder:30b",
+    "scenario": "blog",
+    "reqOk": true,
+    "specOk": true,
+    "specEntities": 2,
+    "validationIssues": 0,
+    "fixRounds": 0,
+    "testsTotal": 11,
+    "testsPassed": 11,
+    "testsFailed": 0,
+    "totalDurationMs": 40933,
+    "totalTokens": 4370,
+    "avgTokPerSec": 121.8144240305409,
+    "promptChars": 10789,
+    "promptTokensEst": 2697,
+    "score": 100,
+    "stars": "★★★★★",
+    "error": null
+  }
+]
				`@@ -0,0 +1 @@`
				`You are a Python code fixer. Return ONLY the corrected Python file. No markdown fences, no explanations — just valid Python code.`