From 20cea8f268692340190ba57449dd82181016f54c Mon Sep 17 00:00:00 2001
From: jaakko <jaakko@kipina.tech>
Date: Mon, 13 Apr 2026 22:08:47 +0300
Subject: [PATCH] =?UTF-8?q?Model=20benchmark:=20testaa=20kaikki=20Ollama-m?=
 =?UTF-8?q?allit=20j=C3=A4rjestelm=C3=A4llisesti?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ajaa täyden pipeline-kierroksen per malli × skenaario:
1. Client-prompti → vaatimukset
2. Manager/SPEC_SYSTEM → JSON-speksi
3. Template-generointi → koodi
4. Validointi + LLM-korjaussilmukka
5. uv sync + pytest

Tuottaa vertailutaulukon: speksin laatu, testien tulos, nopeus.
Tukee suoraa Ollamaa (--ollama) ja hub-reittiä (--hub).
---
 network-poc/tests/model-benchmark.mjs | 513 ++++++++++++++++++++++++++
 1 file changed, 513 insertions(+)
 create mode 100644 network-poc/tests/model-benchmark.mjs

diff --git a/network-poc/tests/model-benchmark.mjs b/network-poc/tests/model-benchmark.mjs
new file mode 100644
index 0000000..6180dfd
--- /dev/null
+++ b/network-poc/tests/model-benchmark.mjs
@@ -0,0 +1,513 @@
+#!/usr/bin/env node
+/**
+ * Kipinä Model Benchmark
+ *
+ * Generoi projekteja eri Ollama-malleilla ja testaa niiden toimivuus.
+ * Käyttö:
+ *   node model-benchmark.mjs                          # kaikki mallit, oletusskenaario
+ *   node model-benchmark.mjs --models qwen3:8b,qwen3:30b
+ *   node model-benchmark.mjs --ollama http://host:11434
+ *   node model-benchmark.mjs --scenarios all          # kaikki skenaariot
+ */
+
+import { execSync } from 'child_process';
+import { writeFileSync, mkdirSync, rmSync, existsSync } from 'fs';
+
+// === CLI-argumentit ===
+const args = process.argv.slice(2);
+function arg(name, fallback) {
+    const i = args.indexOf(`--${name}`);
+    return i >= 0 && args[i + 1] ? args[i + 1] : fallback;
+}
+const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434');
+const HUB_URL = arg('hub', '');  // Vaihtoehto: --hub https://kipina.studio
+const FILTER_MODELS = arg('models', '');
+const SCENARIO_FILTER = arg('scenarios', 'default');
+const OUTPUT_DIR = arg('output', '/tmp/kipina-benchmark');
+const MAX_FIX_ROUNDS = 2;
+
+// === Ollama / Hub -client ===
+async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
+    const start = Date.now();
+
+    if (HUB_URL) {
+        // Hub-reitti: /api/v1/chat/completions
+        const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
+        const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
+        });
+        if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
+        const data = await resp.json();
+        const elapsed = Date.now() - start;
+        return {
+            text: (data.response || '').trim(),
+            tokens: data.tokens_generated || 0,
+            durationMs: elapsed,
+            tokPerSec: data.tokens_per_sec || (data.tokens_generated || 0) / (elapsed / 1000),
+        };
+    }
+
+    // Suora Ollama-reitti: /api/chat
+    const messages = [];
+    if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
+    messages.push({ role: 'user', content: prompt });
+
+    const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+            model,
+            messages,
+            stream: false,
+            options: { num_predict: maxTokens, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
+        }),
+    });
+    if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
+    const data = await resp.json();
+    const elapsed = Date.now() - start;
+    const text = (data.message?.content || '').trim();
+    const evalCount = data.eval_count || 0;
+    const evalDurationNs = data.eval_duration || 1;
+    const tokPerSec = evalCount / (evalDurationNs / 1e9);
+    return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
+}
+
+async function ollamaListModels() {
+    const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
+    const resp = await fetch(url);
+    if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
+    const data = await resp.json();
+    return (data.models || []).map(m => m.name);
+}
+
+// === Prompts (copied from index.astro) ===
+const CLIENT_SYSTEM = `You are a product owner who turns vague ideas into clear, actionable software requirements.
+
+GIVEN a short project description from the user, produce a structured brief:
+
+1. PROJECT NAME: a short, descriptive name
+2. GOAL: one sentence explaining what the software does and who it's for
+3. CORE FEATURES: numbered list of 3-8 concrete features (not vague wishes)
+4. DATA MODEL: list the main entities and their key fields (include field types)
+5. API ENDPOINTS: list the REST endpoints (method + path + purpose)
+6. CONSTRAINTS: any technical constraints (e.g. "must use SQLite", "no auth needed")
+
+RULES:
+- Be specific: "User can filter todos by status" not "todo management"
+- Use plain English, no code
+- Maximum 400 words total`;  // system prompt for pipeline step 1 (project idea -> requirements brief)
+
+const SPEC_SYSTEM = `You are a software architect who designs database schemas for Python web applications.
+
+THINK STEP BY STEP before outputting JSON:
+1. What are the main ENTITIES (nouns) in this project?
+2. What FIELDS does each entity need? (name, type, required?)
+3. Which entities REFERENCE each other? (e.g. "a Book belongs to an Author" → Book has author_id)
+4. Are there Date/DateTime fields? → add extra_imports
+
+Then output ONLY valid JSON (no explanations before or after).
+
+SCHEMA:
+{"project_name":"short-name","description":"One sentence","entities":[{"name":"EntityName","table_name":"entity_names","fields":[{"name":"field_name","sa_type":"String(255)","py_type":"str","nullable":false,"default":null}]}],"relationships":[{"from":"ChildEntity","field":"parent_id","to":"ParentEntity","type":"many-to-one"}],"extra_imports":[]}
+
+FIELD RULES:
+- sa_type: String(N), Text, Integer, Date, DateTime, Boolean, Float
+- py_type: str, int, float, bool, date, datetime — append " | None" if nullable
+- Status fields: use String(20) with default value, NEVER Enum
+- Every entity gets "id" automatically — do NOT add id or redundant ID fields
+- Use snake_case for field names
+
+RELATIONSHIP RULES:
+- If entity A "belongs to" entity B → A has b_id field (Integer, nullable=false) + relationship entry
+- EVERY _id field MUST have a matching relationship entry
+- Parent entities must appear BEFORE children in the entities array
+- If no relationships, set "relationships": []
+
+AVOID: redundant ID fields, generic names, more than 7 fields or 3 entities, non-English entity/field names (ALWAYS English even if description is Finnish)
+
+EXAMPLES (adapt, don't copy):
+Todo app → Todo: title(str), description(Text|None), due_date(Date|None), status(String20="pending")
+Blog → Author: name,email,bio(Text|None) / Post: title, content(Text), author_id→Author, published_at(DateTime|None), status(String20="draft")`;  // system prompt for pipeline step 2 (requirements -> JSON spec)
+
+const FIX_SYSTEM = 'You are a Python code fixer. Return ONLY the corrected Python file. No markdown fences, no explanations — just valid Python code.';  // system prompt for the validation fix loop
+
+// === Template-funktiot (kopioitu korjatusta index.astrosta) ===
+function pyLiteral(val) {
+    if (val === true) return 'True';
+    if (val === false) return 'False';
+    if (val === null || val === undefined) return 'None';
+    if (typeof val === 'string') return `"${val}"`;
+    return String(val);
+}
+function pyJsonLiteral(obj) {
+    const parts = Object.entries(obj).map(([k, v]) => {
+        let pyVal;
+        if (v === true) pyVal = 'True'; else if (v === false) pyVal = 'False';
+        else if (v === null) pyVal = 'None'; else if (typeof v === 'string') pyVal = `"${v}"`;
+        else pyVal = String(v);
+        return `"${k}":${pyVal}`;
+    });
+    return '{' + parts.join(',') + '}';
+}
+function tmplModels(spec) {  // Render models.py source: SQLAlchemy engine/session setup plus one class per spec entity.
+    const saTypes = new Set(['Integer']);  // Integer is always imported: every class gets an Integer id column
+    for (const e of spec.entities) for (const f of e.fields) saTypes.add(f.sa_type.match(/^(\w+)/)[1]);  // keep only the leading type name, e.g. "String" from "String(255)"
+    const relMap = {};  // "Entity.field" -> table name of the referenced entity
+    for (const r of (spec.relationships || [])) {
+        const target = spec.entities.find(e => e.name === r.to);
+        if (target) relMap[`${r.from}.${r.field}`] = target.table_name;  // relationships to unknown entities are ignored
+    }
+    if (Object.keys(relMap).length > 0) saTypes.add('ForeignKey');
+    const imports = [...saTypes].sort().join(', ');
+    let code = `from sqlalchemy import create_engine, Column, ${imports}\nfrom sqlalchemy.orm import declarative_base, sessionmaker\n\nDATABASE_URL = "sqlite:///./app.db"\nengine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})\nSessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)\nBase = declarative_base()\n\n`;
+    for (const e of spec.entities) {
+        code += `class ${e.name}(Base):\n    __tablename__ = "${e.table_name}"\n    id = Column(Integer, primary_key=True, index=True)\n`;
+        for (const f of e.fields) {
+            const fkTarget = relMap[`${e.name}.${f.name}`];
+            let parts = fkTarget ? [`Column(${f.sa_type}, ForeignKey("${fkTarget}.id")`] : [`Column(${f.sa_type}`];  // the closing ")" is appended when the parts are joined
+            if (!f.nullable) parts.push('nullable=False');
+            if (f.default !== null && f.default !== undefined) parts.push(`default=${pyLiteral(f.default)}`);
+            code += `    ${f.name} = ${parts.join(', ')})\n`;
+        }
+        code += '\n';
+    }
+    code += 'Base.metadata.create_all(bind=engine)\n';
+    return code;
+}
+function tmplSchemas(spec) {  // Render schemas.py source: an <Entity>Create and an <Entity>Response Pydantic model per entity.
+    const dtTypes = new Set();  // names to import from the datetime module
+    for (const e of spec.entities) for (const f of e.fields) {
+        if (/\bdate\b/i.test(f.py_type) && !/datetime/.test(f.py_type)) dtTypes.add('date');
+        if (/\bdatetime\b/i.test(f.py_type)) dtTypes.add('datetime');
+    }
+    let code = 'from pydantic import BaseModel, ConfigDict\n';
+    if (dtTypes.size > 0) code += `from datetime import ${[...dtTypes].sort().join(', ')}\n`;
+    for (const imp of (spec.extra_imports || [])) {
+        if (/^(date|datetime)$/.test(imp.trim())) continue;  // bare "date"/"datetime" entries are skipped
+        if (/^from\s/.test(imp) || /^import\s/.test(imp)) code += imp + '\n';  // only entries that already are import statements are copied
+    }
+    code += '\n';
+    for (const e of spec.entities) {
+        code += `class ${e.name}Create(BaseModel):\n`;
+        for (const f of e.fields) {
+            if (f.default !== null && f.default !== undefined) code += `    ${f.name}: ${f.py_type} = ${pyLiteral(f.default)}\n`;
+            else if (f.nullable && f.py_type.includes('None')) code += `    ${f.name}: ${f.py_type} = None\n`;  // nullable without a default becomes optional
+            else code += `    ${f.name}: ${f.py_type}\n`;
+        }
+        code += `\nclass ${e.name}Response(${e.name}Create):\n    id: int\n    model_config = ConfigDict(from_attributes=True)\n\n`;
+    }
+    return code;
+}
+function tmplMain(spec) {  // Render main.py source: a FastAPI app with create/list/get/update/delete routes per entity.
+    const modelNames = spec.entities.map(e => e.name).join(', ');
+    const createNames = spec.entities.map(e => e.name+'Create').join(', ');
+    const responseNames = spec.entities.map(e => e.name+'Response').join(', ');
+    let code = `from fastapi import FastAPI, Depends, HTTPException\nfrom sqlalchemy.orm import Session\nfrom models import Base, engine, SessionLocal, ${modelNames}\nfrom schemas import ${createNames}, ${responseNames}\n\napp = FastAPI()\n\ndef get_db():\n    db = SessionLocal()\n    try:\n        yield db\n    finally:\n        db.close()\n\n`;
+    for (const e of spec.entities) {
+        const lo = e.name.toLowerCase(), tb = e.table_name;  // lo: suffix of the Python function names; tb: URL path segment
+        code += `@app.post("/${tb}/", response_model=${e.name}Response, status_code=201)\ndef create_${lo}(item: ${e.name}Create, db: Session = Depends(get_db)):\n    db_item = ${e.name}(**item.model_dump())\n    db.add(db_item)\n    db.commit()\n    db.refresh(db_item)\n    return db_item\n\n`;
+        code += `@app.get("/${tb}/", response_model=list[${e.name}Response])\ndef list_${lo}s(db: Session = Depends(get_db)):\n    return db.query(${e.name}).all()\n\n`;
+        code += `@app.get("/${tb}/{item_id}", response_model=${e.name}Response)\ndef get_${lo}(item_id: int, db: Session = Depends(get_db)):\n    item = db.query(${e.name}).filter(${e.name}.id == item_id).first()\n    if not item:\n        raise HTTPException(status_code=404, detail="${e.name} not found")\n    return item\n\n`;
+        code += `@app.put("/${tb}/{item_id}", response_model=${e.name}Response)\ndef update_${lo}(item_id: int, item: ${e.name}Create, db: Session = Depends(get_db)):\n    db_item = db.query(${e.name}).filter(${e.name}.id == item_id).first()\n    if not db_item:\n        raise HTTPException(status_code=404, detail="${e.name} not found")\n    for key, value in item.model_dump().items():\n        setattr(db_item, key, value)\n    db.commit()\n    db.refresh(db_item)\n    return db_item\n\n`;
+        code += `@app.delete("/${tb}/{item_id}", status_code=204)\ndef delete_${lo}(item_id: int, db: Session = Depends(get_db)):\n    db_item = db.query(${e.name}).filter(${e.name}.id == item_id).first()\n    if not db_item:\n        raise HTTPException(status_code=404, detail="${e.name} not found")\n    db.delete(db_item)\n    db.commit()\n\n`;
+    }
+    return code;
+}
+function tmplTests(spec) {  // Render test_main.py source: six pytest functions per entity, run against a separate ./test.db.
+    let code = `from fastapi.testclient import TestClient\nfrom sqlalchemy import create_engine\nfrom sqlalchemy.orm import sessionmaker\nfrom main import app, get_db\nfrom models import Base\n\nTEST_DB = "sqlite:///./test.db"\ntest_engine = create_engine(TEST_DB, connect_args={"check_same_thread": False})\nTestSession = sessionmaker(autocommit=False, autoflush=False, bind=test_engine)\nBase.metadata.create_all(bind=test_engine)\n\ndef override_get_db():\n    db = TestSession()\n    try:\n        yield db\n    finally:\n        db.close()\n\napp.dependency_overrides[get_db] = override_get_db\nclient = TestClient(app)\n\n`;
+    for (const e of spec.entities) {
+        const lo = e.name.toLowerCase(), tb = e.table_name;
+        const testData = {};  // request body used by the generated tests
+        for (const f of e.fields) {
+            if (f.default !== null && f.default !== undefined) { testData[f.name] = f.default; continue; }  // an explicit default doubles as the sample value
+            if (f.py_type.includes('str')) testData[f.name] = `Test ${f.name}`;  // otherwise pick a sample by substring match on py_type
+            else if (f.py_type.includes('int')) testData[f.name] = 1;
+            else if (f.py_type.includes('float')) testData[f.name] = 1.0;
+            else if (f.py_type.includes('bool')) testData[f.name] = true;
+            else if (f.py_type.includes('date')) testData[f.name] = '2024-01-15';  // also reached for datetime types
+        }
+        const td = pyJsonLiteral(testData);
+        const firstStr = e.fields.find(f => f.py_type.includes('str') && f.name !== 'status');
+        const updateData = {...testData};
+        if (firstStr) updateData[firstStr.name] = `Updated ${firstStr.name}`;  // the update test changes the first non-status string field
+        const ud = pyJsonLiteral(updateData);
+        code += `def test_create_${lo}():\n    response = client.post('/${tb}/', json=${td})\n    assert response.status_code == 201\n    assert 'id' in response.json()\n\n`;
+        code += `def test_list_${lo}s():\n    client.post('/${tb}/', json=${td})\n    response = client.get('/${tb}/')\n    assert response.status_code == 200\n    assert len(response.json()) >= 1\n\n`;
+        code += `def test_get_${lo}_by_id():\n    created = client.post('/${tb}/', json=${td}).json()\n    item_id = created['id']\n    response = client.get(f'/${tb}/{item_id}')\n    assert response.status_code == 200\n    assert response.json()['id'] == item_id\n\n`;
+        code += `def test_get_${lo}_not_found():\n    response = client.get('/${tb}/99999')\n    assert response.status_code == 404\n\n`;
+        code += `def test_update_${lo}():\n    created = client.post('/${tb}/', json=${td}).json()\n    item_id = created['id']\n    response = client.put(f'/${tb}/{item_id}', json=${ud})\n    assert response.status_code == 200\n\n`;
+        code += `def test_delete_${lo}():\n    created = client.post('/${tb}/', json=${td}).json()\n    item_id = created['id']\n    response = client.delete(f'/${tb}/{item_id}')\n    assert response.status_code == 204\n    response = client.get(f'/${tb}/{item_id}')\n    assert response.status_code == 404\n\n`;
+    }
+    return code;
+}
+function tmplPyproject(spec) {
+    const name = (spec.project_name || 'app').toLowerCase().replace(/\s+/g, '-');
+    return `[project]\nname = "${name}"\nversion = "0.1.0"\nrequires-python = ">=3.11"\ndependencies = [\n    "fastapi",\n    "uvicorn[standard]",\n    "sqlalchemy",\n    "pytest",\n    "httpx",\n]\n`;
+}
+
+// === Validaattori ===
+function validateProjectCode(files) {
+    const issues = [];
+    for (const [fname, code] of Object.entries(files)) {
+        if (!fname.endsWith('.py')) continue;
+        const lines = code.split('\n');
+        for (const line of lines) {
+            const m = line.match(/^from\s+\.(\w*)\s+import/);
+            if (m) issues.push(`ISSUE: ${fname}: relatiivinen import`);
+        }
+        for (const line of lines) {
+            const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
+            if (!m) continue;
+            const srcCode = files[m[1] + '.py'];
+            if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
+            const names = m[2].split(',').map(n => n.trim().split(/\s+as\s+/)[0].trim());
+            for (const name of names) {
+                if (name && !srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
+            }
+        }
+        if (fname === 'schemas.py') {
+            if (/:\s*date\b/.test(code) && !/from datetime import/.test(code))
+                issues.push('ISSUE: schemas.py: date-import puuttuu');
+            if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code))
+                issues.push('ISSUE: schemas.py: datetime-import puuttuu');
+        }
+        for (let i = 0; i < lines.length; i++) {
+            const line = lines[i];
+            if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue;
+            if (/(?<!["\w])false(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "false" → "False"`);
+            if (/(?<!["\w])true(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "true" → "True"`);
+        }
+    }
+    return issues;
+}
+
+function extractJson(text) {
+    const m = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
+    if (m) text = m[1].trim();
+    let depth = 0, start = null;
+    for (let i = 0; i < text.length; i++) {
+        if (text[i] === '{') { if (depth === 0) start = i; depth++; }
+        else if (text[i] === '}') { depth--; if (depth === 0 && start !== null) { try { return JSON.parse(text.slice(start, i+1)); } catch(e) { continue; } } }
+    }
+    return null;
+}
+
+// === Test scenarios: short project ideas (written in Finnish) that runPipeline sends as the first user prompt ===
+const SCENARIOS = [
+    { id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
+    { id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
+    { id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
+];
+
+// === Pipeline: yhdelle mallille ja skenaariolle ===
+async function runPipeline(model, scenario) {
+    const result = {
+        model, scenario: scenario.id,
+        reqOk: false, specOk: false, specEntities: 0,
+        validationIssues: 0, fixRounds: 0,
+        testsTotal: 0, testsPassed: 0, testsFailed: 0,
+        totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
+        error: null,
+    };
+    const timings = [];
+    const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
+    mkdirSync(dir, { recursive: true });
+
+    try {
+        // 1. Vaatimukset
+        console.log(`    [1/5] Vaatimukset...`);
+        const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 1024);
+        timings.push(req);
+        if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
+        result.reqOk = true;
+        writeFileSync(`${dir}/_requirements.txt`, req.text);
+
+        // 2. JSON-speksi
+        console.log(`    [2/5] JSON-speksi...`);
+        const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 2048);
+        timings.push(specResp);
+        const spec = extractJson(specResp.text);
+        if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'JSON-speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; }
+        result.specOk = true;
+        result.specEntities = spec.entities.length;
+        writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));
+
+        // 3. Template-generointi
+        console.log(`    [3/5] Koodigenerointi...`);
+        const files = {
+            'models.py': tmplModels(spec),
+            'schemas.py': tmplSchemas(spec),
+            'main.py': tmplMain(spec),
+            'test_main.py': tmplTests(spec),
+            'pyproject.toml': tmplPyproject(spec),
+        };
+
+        // 4. Validointi + korjaussilmukka
+        let issues = validateProjectCode(files);
+        let fixRound = 0;
+        while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
+            fixRound++;
+            console.log(`    [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
+            const issuesByFile = {};
+            for (const issue of issues) {
+                const m = issue.match(/^ISSUE:\s*(\S+?):/);
+                const fname = m ? m[1] : 'unknown';
+                if (!issuesByFile[fname]) issuesByFile[fname] = [];
+                issuesByFile[fname].push(issue);
+            }
+            for (const [fname, fIssues] of Object.entries(issuesByFile)) {
+                if (!files[fname]) continue;
+                const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
+                const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
+                timings.push(fixResp);
+                if (fixResp.text) {
+                    files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
+                }
+            }
+            issues = validateProjectCode(files);
+        }
+        result.validationIssues = issues.length;
+        result.fixRounds = fixRound;
+
+        // Kirjoita tiedostot levylle
+        for (const [fn, content] of Object.entries(files)) writeFileSync(`${dir}/${fn}`, content);
+
+        // 5. Pytest
+        console.log(`    [5/5] Pytest...`);
+        try {
+            const uvPath = process.env.HOME + '/.local/bin/uv';
+            const uv = existsSync(uvPath) ? uvPath : 'uv';
+            execSync(`cd "${dir}" && ${uv} sync 2>/dev/null`, { timeout: 60000, stdio: 'pipe' });
+            execSync(`cd "${dir}" && rm -f app.db test.db`, { stdio: 'pipe' });
+            const pytestOut = execSync(`cd "${dir}" && ${uv} run pytest test_main.py -v --tb=short 2>&1`, { timeout: 60000, encoding: 'utf-8' });
+            writeFileSync(`${dir}/_pytest.txt`, pytestOut);
+
+            const passedMatch = pytestOut.match(/(\d+) passed/);
+            const failedMatch = pytestOut.match(/(\d+) failed/);
+            result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0;
+            result.testsFailed = failedMatch ? parseInt(failedMatch[1]) : 0;
+            result.testsTotal = result.testsPassed + result.testsFailed;
+        } catch (e) {
+            const output = e.stdout || e.stderr || e.message || '';
+            writeFileSync(`${dir}/_pytest.txt`, output);
+            const passedMatch = output.match(/(\d+) passed/);
+            const failedMatch = output.match(/(\d+) failed/);
+            const errorMatch = output.match(/(\d+) error/);
+            result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0;
+            result.testsFailed = (failedMatch ? parseInt(failedMatch[1]) : 0) + (errorMatch ? parseInt(errorMatch[1]) : 0);
+            result.testsTotal = result.testsPassed + result.testsFailed;
+            if (result.testsTotal === 0) result.error = 'Pytest kaatui';
+        }
+    } catch (e) {
+        result.error = e.message;
+    }
+
+    // Yhteenveto
+    result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
+    result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
+    result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
+
+    return result;
+}
+
+// === Main ===
+async function main() {
+    console.log('╔══════════════════════════════════════════════╗');
+    console.log('║       Kipinä Model Benchmark                ║');
+    console.log('╚══════════════════════════════════════════════╝');
+    console.log(`Ollama: ${OLLAMA_URL}`);
+
+    // Haetaan mallit
+    let models;
+    try {
+        models = await ollamaListModels();
+    } catch (e) {
+        console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`);
+        process.exit(1);
+    }
+
+    if (FILTER_MODELS) {
+        const filter = FILTER_MODELS.split(',').map(s => s.trim());
+        models = models.filter(m => filter.some(f => m.includes(f)));
+    }
+
+    console.log(`Mallit (${models.length}): ${models.join(', ')}`);
+
+    const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS : [SCENARIOS[0]];
+    console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
+    console.log(`Tulokset: ${OUTPUT_DIR}/`);
+    console.log('');
+
+    // Puhdista output
+    rmSync(OUTPUT_DIR, { recursive: true, force: true });
+    mkdirSync(OUTPUT_DIR, { recursive: true });
+
+    const results = [];
+
+    for (const model of models) {
+        for (const scenario of scenarios) {
+            console.log(`\n━━━ ${model} × ${scenario.id} ━━━`);
+            const r = await runPipeline(model, scenario);
+            results.push(r);
+
+            const status = r.error ? `✗ ${r.error}` :
+                r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` :
+                `◐ ${r.testsPassed}/${r.testsTotal}`;
+            console.log(`    → ${status} | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s`);
+        }
+    }
+
+    // === Tulostaulu ===
+    console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
+    console.log('║                                    TULOKSET                                                     ║');
+    console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣');
+
+    const header = [
+        'Malli'.padEnd(40),
+        'Skenaario'.padEnd(10),
+        'Speksi'.padEnd(8),
+        'Testit'.padEnd(10),
+        'Korjaus'.padEnd(8),
+        'Aika'.padEnd(8),
+        'tok/s'.padEnd(8),
+        'Tulos',
+    ].join(' │ ');
+    console.log(`║ ${header} ║`);
+    console.log('╠' + '═'.repeat(header.length + 2) + '╣');
+
+    for (const r of results) {
+        const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗';
+        const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
+        const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
+        const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
+        const speed = `${r.avgTokPerSec.toFixed(0)}`;
+        const verdict = r.error ? '✗ FAIL' : r.testsPassed === r.testsTotal && r.testsTotal > 0 ? '✓ PASS' : '◐ PARTIAL';
+
+        const row = [
+            r.model.padEnd(40),
+            r.scenario.padEnd(10),
+            specStatus.padEnd(8),
+            testStatus.padEnd(10),
+            fixStatus.padEnd(8),
+            time.padEnd(8),
+            speed.padEnd(8),
+            verdict,
+        ].join(' │ ');
+        console.log(`║ ${row} ║`);
+    }
+    console.log('╚' + '═'.repeat(header.length + 2) + '╝');
+
+    // Tallenna JSON
+    writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
+    console.log(`\nJSON: ${OUTPUT_DIR}/results.json`);
+
+    // Yhteenveto
+    const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
+    const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
+    const failed = results.filter(r => r.error || r.testsTotal === 0);
+    console.log(`\n✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length}`);
+}
+
+main().catch(e => { console.error(e); process.exit(1); });