Model benchmark: testaa kaikki Ollama-mallit järjestelmällisesti

Ajaa täyden pipeline-kierroksen per malli × skenaario: 1. Client-prompti → vaatimukset 2. Manager/SPEC_SYSTEM → JSON-speksi 3. Template-generointi → koodi 4. Validointi + LLM-korjaussilmukka 5. uv sync + pytest Tuottaa vertailutaulukon: speksin laatu, testien tulos, nopeus. Tukee suoraa Ollamaa (--ollama) ja hub-reittiä (--hub).
2026-04-13 22:08:47 +03:00
1 changed files with 513 additions and 0 deletions
--- a/network-poc/tests/model-benchmark.mjs
+++ b/network-poc/tests/model-benchmark.mjs
@@ -0,0 +1,513 @@
 #!/usr/bin/env node
 /**
 * Kipinä Model Benchmark
 *
 * Generoi projekteja eri Ollama-malleilla ja testaa niiden toimivuus.
 * Käyttö:
 *   node model-benchmark.mjs                          # kaikki mallit, oletusskenaario
 *   node model-benchmark.mjs --models qwen3:8b,qwen3:30b
 *   node model-benchmark.mjs --ollama http://host:11434
 *   node model-benchmark.mjs --scenarios all          # kaikki skenaariot
 */
 import { execSync } from 'child_process';
 import { writeFileSync, mkdirSync, rmSync, existsSync } from 'fs';
 // === CLI arguments ===
 const args = process.argv.slice(2);
 /**
  * Looks up the value that follows `--<name>` on the command line.
  * @param {string} name - Flag name without the leading dashes.
  * @param {*} fallback - Returned when the flag is absent or nothing truthy follows it.
  * @returns {*} The argument right after the flag, or `fallback`.
  */
 function arg(name, fallback) {
    const flagPos = args.indexOf(`--${name}`);
    if (flagPos < 0) return fallback;
    const value = args[flagPos + 1];
    if (!value) return fallback;
    return value;
 }
 // Base URL of the Ollama server: --ollama wins over the OLLAMA_URL environment variable.
 const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434');
 const HUB_URL = arg('hub', '');  // Alternative route: --hub https://kipina.studio
 // Value of --models (comma-separated names); empty string when the flag is not given.
 const FILTER_MODELS = arg('models', '');
 // Value of --scenarios; 'default' when the flag is not given.
 const SCENARIO_FILTER = arg('scenarios', 'default');
 // Directory that receives the generated projects and result files (--output).
 const OUTPUT_DIR = arg('output', '/tmp/kipina-benchmark');
 // Upper bound for the validate-and-fix loop in the pipeline.
 const MAX_FIX_ROUNDS = 2;
 // === Ollama / Hub client ===
 /**
  * Sends one chat request, through the hub when HUB_URL is set and directly
  * to Ollama otherwise, and reports the answer together with timing data.
  *
  * @param {string} model - Model name passed through to the backend.
  * @param {string} prompt - User prompt.
  * @param {string} [systemPrompt] - Optional system prompt.
  * @param {number} [maxTokens=2048] - Generation cap (`max_tokens` for the hub, `num_predict` for Ollama).
  * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
  *   Trimmed answer text, generated token count, wall-clock duration and throughput.
  * @throws {Error} When the backend answers with a non-OK HTTP status.
  */
 async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
    const start = Date.now();
    // Throughput helper: an unknown or zero duration yields 0 instead of
    // Infinity, NaN or a rate inflated by a made-up tiny duration.
    const rate = (tokens, seconds) => (seconds > 0 ? tokens / seconds : 0);
    if (HUB_URL) {
        // Hub route: /api/v1/chat/completions
        const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
        const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
        });
        if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
        const data = await resp.json();
        const elapsed = Date.now() - start;
        const tokens = data.tokens_generated || 0;
        return {
            text: (data.response || '').trim(),
            tokens,
            durationMs: elapsed,
            // Prefer the hub's own figure; otherwise derive it from wall-clock time.
            tokPerSec: data.tokens_per_sec || rate(tokens, elapsed / 1000),
        };
    }
    // Direct Ollama route: /api/chat
    const messages = [];
    if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
    messages.push({ role: 'user', content: prompt });
    const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            model,
            messages,
            stream: false,
            options: { num_predict: maxTokens, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
        }),
    });
    if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
    const data = await resp.json();
    const elapsed = Date.now() - start;
    const text = (data.message?.content || '').trim();
    const evalCount = data.eval_count || 0;
    // eval_duration is in nanoseconds; convert to seconds before dividing.
    const tokPerSec = rate(evalCount, (data.eval_duration || 0) / 1e9);
    return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
 }
 /**
  * Fetches the model names offered by the active backend: the hub's tag
  * endpoint when HUB_URL is set, Ollama's /api/tags otherwise.
  * @returns {Promise<string[]>} Model names; empty when the reply has no `models` list.
  * @throws {Error} On a non-OK HTTP status.
  */
 async function ollamaListModels() {
    let tagsUrl = `${OLLAMA_URL}/api/tags`;
    if (HUB_URL) {
        tagsUrl = `${HUB_URL}/api/v1/ollama/tags`;
    }
    const response = await fetch(tagsUrl);
    if (!response.ok) {
        throw new Error(`Tags: HTTP ${response.status}`);
    }
    const payload = await response.json();
    const names = [];
    for (const entry of payload.models || []) {
        names.push(entry.name);
    }
    return names;
 }
 // === Prompts (copied from index.astro) ===
 // System prompt for the requirements step: the model turns a short project
 // description into a structured brief (name, goal, features, data model,
 // endpoints, constraints).
 const CLIENT_SYSTEM = `You are a product owner who turns vague ideas into clear, actionable software requirements.
 GIVEN a short project description from the user, produce a structured brief:
 1. PROJECT NAME: a short, descriptive name
 2. GOAL: one sentence explaining what the software does and who it's for
 3. CORE FEATURES: numbered list of 3-8 concrete features (not vague wishes)
 4. DATA MODEL: list the main entities and their key fields (include field types)
 5. API ENDPOINTS: list the REST endpoints (method + path + purpose)
 6. CONSTRAINTS: any technical constraints (e.g. "must use SQLite", "no auth needed")
 RULES:
 - Be specific: "User can filter todos by status" not "todo management"
 - Use plain English, no code
 - Maximum 400 words total`;
 // System prompt for the spec step: asks for a single JSON document with
 // project_name, description, entities (with fields), relationships and
 // extra_imports, following the schema and rules spelled out in the prompt.
 const SPEC_SYSTEM = `You are a software architect who designs database schemas for Python web applications.
 THINK STEP BY STEP before outputting JSON:
 1. What are the main ENTITIES (nouns) in this project?
 2. What FIELDS does each entity need? (name, type, required?)
 3. Which entities REFERENCE each other? (e.g. "a Book belongs to an Author" → Book has author_id)
 4. Are there Date/DateTime fields? → add extra_imports
 Then output ONLY valid JSON (no explanations before or after).
 SCHEMA:
 {"project_name":"short-name","description":"One sentence","entities":[{"name":"EntityName","table_name":"entity_names","fields":[{"name":"field_name","sa_type":"String(255)","py_type":"str","nullable":false,"default":null}]}],"relationships":[{"from":"ChildEntity","field":"parent_id","to":"ParentEntity","type":"many-to-one"}],"extra_imports":[]}
 FIELD RULES:
 - sa_type: String(N), Text, Integer, Date, DateTime, Boolean, Float
 - py_type: str, int, float, bool, date, datetime — append " | None" if nullable
 - Status fields: use String(20) with default value, NEVER Enum
 - Every entity gets "id" automatically — do NOT add id or redundant ID fields
 - Use snake_case for field names
 RELATIONSHIP RULES:
 - If entity A "belongs to" entity B → A has b_id field (Integer, nullable=false) + relationship entry
 - EVERY _id field MUST have a matching relationship entry
 - Parent entities must appear BEFORE children in the entities array
 - If no relationships, set "relationships": []
 AVOID: redundant ID fields, generic names, more than 7 fields or 3 entities, non-English entity/field names (ALWAYS English even if description is Finnish)
 EXAMPLES (adapt, don't copy):
 Todo app → Todo: title(str), description(Text|None), due_date(Date|None), status(String20="pending")
 Blog → Author: name,email,bio(Text|None) / Post: title, content(Text), author_id→Author, published_at(DateTime|None), status(String20="draft")`;
 // System prompt for the fix loop: the reply is expected to be one complete,
 // corrected Python file without fences or commentary.
 const FIX_SYSTEM = 'You are a Python code fixer. Return ONLY the corrected Python file. No markdown fences, no explanations — just valid Python code.';
 // === Template functions (copied from the fixed index.astro) ===
 /**
  * Renders a JavaScript value as Python literal source text.
  *
  * @param {*} val - Value taken from the JSON spec (boolean, null, string, number, ...).
  * @returns {string} `True`/`False` for booleans, `None` for null/undefined,
  *   a double-quoted, escaped literal for strings, `String(val)` for anything else.
  */
 function pyLiteral(val) {
    if (val === true) return 'True';
    if (val === false) return 'False';
    if (val === null || val === undefined) return 'None';
    // JSON string syntax (double quotes plus backslash escapes) is also valid
    // Python string-literal syntax, so embedded quotes, backslashes and
    // newlines cannot break out of the generated literal.
    if (typeof val === 'string') return JSON.stringify(val);
    return String(val);
 }
 /**
  * Renders a flat object as a Python dict literal, e.g. `{"title":"x","done":True}`.
  *
  * @param {Object} obj - Flat key/value object (values: boolean, null, string, number).
  * @returns {string} Dict literal source. Keys and string values are escaped;
  *   null and undefined both become `None`.
  */
 function pyJsonLiteral(obj) {
    const toPython = (v) => {
        if (v === true) return 'True';
        if (v === false) return 'False';
        // undefined has no Python spelling; treat it like null.
        if (v === null || v === undefined) return 'None';
        // JSON-style quoting and escaping is valid Python string syntax as well.
        if (typeof v === 'string') return JSON.stringify(v);
        return String(v);
    };
    const parts = Object.entries(obj).map(([k, v]) => `${JSON.stringify(k)}:${toPython(v)}`);
    return `{${parts.join(',')}}`;
 }
 /**
  * Generates the source of models.py (SQLAlchemy models) from the spec.
  *
  * @param {Object} spec - Parsed spec; reads `entities[].{name, table_name, fields}`
  *   and the optional `relationships[].{from, field, to}`.
  * @returns {string} Python source: imports, engine/session setup, one class
  *   per entity and a final `create_all` call.
  */
 function tmplModels(spec) {
    // Column types to import; Integer is always needed for the id column.
    const columnTypes = new Set(['Integer']);
    spec.entities.forEach((entity) => {
        entity.fields.forEach((field) => {
            columnTypes.add(field.sa_type.match(/^(\w+)/)[1]);
        });
    });
    // "<Entity>.<field>" -> table name of the referenced entity.
    const fkTables = new Map();
    for (const rel of spec.relationships || []) {
        const parent = spec.entities.find((entity) => entity.name === rel.to);
        if (parent) fkTables.set(`${rel.from}.${rel.field}`, parent.table_name);
    }
    if (fkTables.size > 0) columnTypes.add('ForeignKey');
    const imports = [...columnTypes].sort().join(', ');
    const chunks = [
        `from sqlalchemy import create_engine, Column, ${imports}\nfrom sqlalchemy.orm import declarative_base, sessionmaker\n\nDATABASE_URL = "sqlite:///./app.db"\nengine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})\nSessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)\nBase = declarative_base()\n\n`,
    ];
    for (const entity of spec.entities) {
        chunks.push(`class ${entity.name}(Base):\n    __tablename__ = "${entity.table_name}"\n    id = Column(Integer, primary_key=True, index=True)\n`);
        for (const field of entity.fields) {
            const columnArgs = [field.sa_type];
            const fkTable = fkTables.get(`${entity.name}.${field.name}`);
            if (fkTable) columnArgs.push(`ForeignKey("${fkTable}.id")`);
            if (!field.nullable) columnArgs.push('nullable=False');
            const hasDefault = field.default !== null && field.default !== undefined;
            if (hasDefault) columnArgs.push(`default=${pyLiteral(field.default)}`);
            chunks.push(`    ${field.name} = Column(${columnArgs.join(', ')})\n`);
        }
        chunks.push('\n');
    }
    chunks.push('Base.metadata.create_all(bind=engine)\n');
    return chunks.join('');
 }
 /**
  * Generates the source of schemas.py (Pydantic models) from the spec.
  *
  * @param {Object} spec - Parsed spec; reads `entities[].{name, fields}` and
  *   the optional `extra_imports` list.
  * @returns {string} Python source with a `<Name>Create` and `<Name>Response`
  *   class per entity.
  */
 function tmplSchemas(spec) {
    // Which names must be imported from the datetime module.
    const dtTypes = new Set();
    for (const entity of spec.entities) {
        for (const { py_type: pyType } of entity.fields) {
            if (/\bdate\b/i.test(pyType) && !/datetime/.test(pyType)) dtTypes.add('date');
            if (/\bdatetime\b/i.test(pyType)) dtTypes.add('datetime');
        }
    }
    const out = ['from pydantic import BaseModel, ConfigDict\n'];
    if (dtTypes.size > 0) {
        out.push(`from datetime import ${[...dtTypes].sort().join(', ')}\n`);
    }
    // Only full import statements are copied; bare "date"/"datetime" entries
    // are already covered by the datetime import above.
    for (const imp of spec.extra_imports || []) {
        const isBareDateName = /^(date|datetime)$/.test(imp.trim());
        if (isBareDateName) continue;
        const isImportStatement = /^from\s/.test(imp) || /^import\s/.test(imp);
        if (isImportStatement) out.push(imp + '\n');
    }
    out.push('\n');
    const fieldLine = (f) => {
        const hasDefault = f.default !== null && f.default !== undefined;
        if (hasDefault) return `    ${f.name}: ${f.py_type} = ${pyLiteral(f.default)}\n`;
        if (f.nullable && f.py_type.includes('None')) return `    ${f.name}: ${f.py_type} = None\n`;
        return `    ${f.name}: ${f.py_type}\n`;
    };
    for (const e of spec.entities) {
        out.push(`class ${e.name}Create(BaseModel):\n`);
        out.push(...e.fields.map(fieldLine));
        out.push(`\nclass ${e.name}Response(${e.name}Create):\n    id: int\n    model_config = ConfigDict(from_attributes=True)\n\n`);
    }
    return out.join('');
 }
 /**
  * Generates the source of main.py: a FastAPI app with create, list, get,
  * update and delete routes for every entity in the spec.
  *
  * @param {Object} spec - Parsed spec; reads `entities[].{name, table_name}`.
  * @returns {string} Python source of the application module.
  */
 function tmplMain(spec) {
    const namesWith = (suffix) => spec.entities.map((entity) => entity.name + suffix).join(', ');
    const modelNames = namesWith('');
    const createNames = namesWith('Create');
    const responseNames = namesWith('Response');
    const header = `from fastapi import FastAPI, Depends, HTTPException\nfrom sqlalchemy.orm import Session\nfrom models import Base, engine, SessionLocal, ${modelNames}\nfrom schemas import ${createNames}, ${responseNames}\n\napp = FastAPI()\n\ndef get_db():\n    db = SessionLocal()\n    try:\n        yield db\n    finally:\n        db.close()\n\n`;
    // The five route handlers of a single entity, in emission order.
    const routesFor = (e) => {
        const lo = e.name.toLowerCase();
        const tb = e.table_name;
        return [
            `@app.post("/${tb}/", response_model=${e.name}Response, status_code=201)\ndef create_${lo}(item: ${e.name}Create, db: Session = Depends(get_db)):\n    db_item = ${e.name}(**item.model_dump())\n    db.add(db_item)\n    db.commit()\n    db.refresh(db_item)\n    return db_item\n\n`,
            `@app.get("/${tb}/", response_model=list[${e.name}Response])\ndef list_${lo}s(db: Session = Depends(get_db)):\n    return db.query(${e.name}).all()\n\n`,
            `@app.get("/${tb}/{item_id}", response_model=${e.name}Response)\ndef get_${lo}(item_id: int, db: Session = Depends(get_db)):\n    item = db.query(${e.name}).filter(${e.name}.id == item_id).first()\n    if not item:\n        raise HTTPException(status_code=404, detail="${e.name} not found")\n    return item\n\n`,
            `@app.put("/${tb}/{item_id}", response_model=${e.name}Response)\ndef update_${lo}(item_id: int, item: ${e.name}Create, db: Session = Depends(get_db)):\n    db_item = db.query(${e.name}).filter(${e.name}.id == item_id).first()\n    if not db_item:\n        raise HTTPException(status_code=404, detail="${e.name} not found")\n    for key, value in item.model_dump().items():\n        setattr(db_item, key, value)\n    db.commit()\n    db.refresh(db_item)\n    return db_item\n\n`,
            `@app.delete("/${tb}/{item_id}", status_code=204)\ndef delete_${lo}(item_id: int, db: Session = Depends(get_db)):\n    db_item = db.query(${e.name}).filter(${e.name}.id == item_id).first()\n    if not db_item:\n        raise HTTPException(status_code=404, detail="${e.name} not found")\n    db.delete(db_item)\n    db.commit()\n\n`,
        ].join('');
    };
    return header + spec.entities.map(routesFor).join('');
 }
 /**
  * Generates the source of test_main.py: six pytest functions per entity
  * (create, list, get by id, not found, update, delete) run against a
  * separate SQLite test database.
  *
  * @param {Object} spec - Parsed spec; reads `entities[].{name, table_name, fields}`.
  * @returns {string} Python source of the test module.
  */
 function tmplTests(spec) {
    const header = `from fastapi.testclient import TestClient\nfrom sqlalchemy import create_engine\nfrom sqlalchemy.orm import sessionmaker\nfrom main import app, get_db\nfrom models import Base\n\nTEST_DB = "sqlite:///./test.db"\ntest_engine = create_engine(TEST_DB, connect_args={"check_same_thread": False})\nTestSession = sessionmaker(autocommit=False, autoflush=False, bind=test_engine)\nBase.metadata.create_all(bind=test_engine)\n\ndef override_get_db():\n    db = TestSession()\n    try:\n        yield db\n    finally:\n        db.close()\n\napp.dependency_overrides[get_db] = override_get_db\nclient = TestClient(app)\n\n`;
    // Sample value per type keyword; the first keyword found in py_type wins.
    const sampleRules = [
        ['str', (f) => `Test ${f.name}`],
        ['int', () => 1],
        ['float', () => 1.0],
        ['bool', () => true],
        ['date', () => '2024-01-15'],
    ];
    const testsFor = (e) => {
        const lo = e.name.toLowerCase();
        const tb = e.table_name;
        const testData = {};
        e.fields.forEach((f) => {
            const hasDefault = f.default !== null && f.default !== undefined;
            if (hasDefault) {
                testData[f.name] = f.default;
                return;
            }
            const rule = sampleRules.find(([keyword]) => f.py_type.includes(keyword));
            // Fields of an unrecognised type are left out of the payload.
            if (rule) testData[f.name] = rule[1](f);
        });
        const td = pyJsonLiteral(testData);
        // The update payload changes the first string field other than "status".
        const updateData = { ...testData };
        const firstStr = e.fields.find((f) => f.py_type.includes('str') && f.name !== 'status');
        if (firstStr) updateData[firstStr.name] = `Updated ${firstStr.name}`;
        const ud = pyJsonLiteral(updateData);
        return [
            `def test_create_${lo}():\n    response = client.post('/${tb}/', json=${td})\n    assert response.status_code == 201\n    assert 'id' in response.json()\n\n`,
            `def test_list_${lo}s():\n    client.post('/${tb}/', json=${td})\n    response = client.get('/${tb}/')\n    assert response.status_code == 200\n    assert len(response.json()) >= 1\n\n`,
            `def test_get_${lo}_by_id():\n    created = client.post('/${tb}/', json=${td}).json()\n    item_id = created['id']\n    response = client.get(f'/${tb}/{item_id}')\n    assert response.status_code == 200\n    assert response.json()['id'] == item_id\n\n`,
            `def test_get_${lo}_not_found():\n    response = client.get('/${tb}/99999')\n    assert response.status_code == 404\n\n`,
            `def test_update_${lo}():\n    created = client.post('/${tb}/', json=${td}).json()\n    item_id = created['id']\n    response = client.put(f'/${tb}/{item_id}', json=${ud})\n    assert response.status_code == 200\n\n`,
            `def test_delete_${lo}():\n    created = client.post('/${tb}/', json=${td}).json()\n    item_id = created['id']\n    response = client.delete(f'/${tb}/{item_id}')\n    assert response.status_code == 204\n    response = client.get(f'/${tb}/{item_id}')\n    assert response.status_code == 404\n\n`,
        ].join('');
    };
    return header + spec.entities.map(testsFor).join('');
 }
 /**
  * Generates pyproject.toml for the generated project.
  *
  * @param {Object} spec - Parsed spec; `project_name` is lower-cased and its
  *   whitespace runs become hyphens ("app" when missing).
  * @returns {string} TOML text with a fixed dependency list.
  */
 function tmplPyproject(spec) {
    const rawName = spec.project_name || 'app';
    const name = rawName.toLowerCase().replace(/\s+/g, '-');
    const dependencies = ['fastapi', 'uvicorn[standard]', 'sqlalchemy', 'pytest', 'httpx'];
    const lines = [
        '[project]',
        `name = "${name}"`,
        'version = "0.1.0"',
        'requires-python = ">=3.11"',
        'dependencies = [',
        ...dependencies.map((dep) => `    "${dep}",`),
        ']',
    ];
    return lines.join('\n') + '\n';
 }
 // === Validator ===
 /**
  * Runs lightweight text checks over the generated Python files.
  *
  * Per `.py` file, in this order: relative imports, names imported from
  * models/schemas/main that do not occur in the source file, missing
  * datetime imports (schemas.py only), and bare lower-case `true`/`false`.
  *
  * @param {Object<string, string>} files - File name -> file content.
  * @returns {string[]} Issue messages, each starting with `ISSUE: <file>`.
  */
 function validateProjectCode(files) {
    const relativeImport = /^from\s+\.(\w*)\s+import/;
    const localImport = /^from\s+(models|schemas|main)\s+import\s+(.+)/;
    const issues = [];
    const pythonFiles = Object.entries(files).filter(([fname]) => fname.endsWith('.py'));
    for (const [fname, code] of pythonFiles) {
        const lines = code.split('\n');
        // 1. Relative imports: one issue per offending line.
        lines
            .filter((line) => relativeImport.test(line))
            .forEach(() => issues.push(`ISSUE: ${fname}: relatiivinen import`));
        // 2. Names imported from sibling modules must occur in that module's text.
        lines.forEach((line) => {
            const hit = line.match(localImport);
            if (!hit) return;
            const [, moduleName, importList] = hit;
            const srcCode = files[moduleName + '.py'];
            if (!srcCode) {
                issues.push(`ISSUE: ${fname}: ${moduleName}.py puuttuu`);
                return;
            }
            importList
                .split(',')
                .map((part) => part.trim().split(/\s+as\s+/)[0].trim())
                .filter((name) => name && !srcCode.includes(name))
                .forEach((name) => issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${moduleName}.py:stä`));
        });
        // 3. schemas.py: date/datetime annotations need a datetime import.
        if (fname === 'schemas.py') {
            const hasDatetimeImport = /from datetime import/.test(code);
            if (/:\s*date\b/.test(code) && !hasDatetimeImport) {
                issues.push('ISSUE: schemas.py: date-import puuttuu');
            }
            if (/:\s*datetime\b/.test(code) && !hasDatetimeImport) {
                issues.push('ISSUE: schemas.py: datetime-import puuttuu');
            }
        }
        // 4. Lower-case booleans outside comments, blank lines and quoted words.
        lines.forEach((line, idx) => {
            const isCommentOrBlank = /^\s*#/.test(line) || /^\s*$/.test(line);
            if (isCommentOrBlank) return;
            if (/(?<!["\w])false(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${idx + 1}: "false" → "False"`);
            if (/(?<!["\w])true(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${idx + 1}: "true" → "True"`);
        });
    }
    return issues;
 }
 /**
  * Finds the index of the `}` that closes the `{` at `start`, ignoring braces
  * that appear inside double-quoted strings.
  * @param {string} text - Text being scanned.
  * @param {number} start - Index of an opening `{`.
  * @returns {number} Index of the matching `}`, or -1 when it is never closed.
  */
 function findObjectEnd(text, start) {
    let depth = 0;
    let inString = false;
    for (let i = start; i < text.length; i++) {
        const ch = text[i];
        if (inString) {
            if (ch === '\\') i++; // skip the escaped character
            else if (ch === '"') inString = false;
        } else if (ch === '"') {
            inString = true;
        } else if (ch === '{') {
            depth++;
        } else if (ch === '}') {
            depth--;
            if (depth === 0) return i;
        }
    }
    return -1;
 }
 /**
  * Returns the first balanced `{...}` span of `text` that parses as JSON.
  * @param {string} text - Text that may contain prose around a JSON object.
  * @returns {Object|null} The parsed object, or null when none parses.
  */
 function firstJsonObject(text) {
    let from = 0;
    for (;;) {
        const start = text.indexOf('{', from);
        if (start < 0) return null;
        const end = findObjectEnd(text, start);
        if (end < 0) {
            // Unclosed brace (e.g. in prose): retry from the next opening brace.
            from = start + 1;
            continue;
        }
        try {
            return JSON.parse(text.slice(start, end + 1));
        } catch {
            // Balanced but not JSON: move on past this span.
            from = end + 1;
        }
    }
 }
 /**
  * Extracts the first JSON object from a model reply.
  *
  * The content of the first Markdown code fence is tried first; when it holds
  * no parsable object (or there is no fence) the whole text is searched.
  * Braces inside JSON strings, stray closing braces and unclosed opening
  * braces in surrounding prose do not prevent a later object from being found.
  *
  * @param {string} text - Raw model output.
  * @returns {Object|null} Parsed object, or null when nothing parses or
  *   `text` is not a string.
  */
 function extractJson(text) {
    if (typeof text !== 'string') return null;
    const fenced = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
    const sources = fenced ? [fenced[1].trim(), text] : [text];
    for (const source of sources) {
        const parsed = firstJsonObject(source);
        if (parsed !== null) return parsed;
    }
    return null;
 }
 // === Test scenarios ===
 // Each scenario has an id (used in output directory names and the result
 // table) and the short project description sent to the model as the user prompt.
 const SCENARIOS = [
    { id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
    { id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
    { id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
 ];
 // === Pipeline: one model and one scenario ===
 /**
  * Reads the "N passed" / "N failed" / "N error" counters from pytest output.
  * @param {string} output - Captured pytest output.
  * @returns {{passed: number, failed: number, errors: number}} Missing counters are 0.
  */
 function parsePytestCounts(output) {
    const counter = (pattern) => {
        const hit = output.match(pattern);
        return hit ? Number.parseInt(hit[1], 10) : 0;
    };
    return {
        passed: counter(/(\d+) passed/),
        failed: counter(/(\d+) failed/),
        errors: counter(/(\d+) error/),
    };
 }
 /**
  * Runs `uv sync`, removes stale SQLite files and runs pytest inside `dir`.
  * @param {string} dir - Project directory containing pyproject.toml and test_main.py.
  * @returns {{ok: boolean, output: string}} `ok` is false when any command
  *   failed or timed out; `output` is always a string (captured output, or
  *   the error message when nothing was captured).
  */
 function runPytest(dir) {
    const uvPath = process.env.HOME + '/.local/bin/uv';
    const uv = existsSync(uvPath) ? uvPath : 'uv';
    // cwd keeps the directory name out of the shell command line, and a string
    // encoding guarantees that captured output is never a Buffer.
    const opts = { cwd: dir, timeout: 60000, encoding: 'utf-8', stdio: 'pipe' };
    try {
        execSync(`"${uv}" sync`, opts);
        rmSync(`${dir}/app.db`, { force: true });
        rmSync(`${dir}/test.db`, { force: true });
        const output = execSync(`"${uv}" run pytest test_main.py -v --tb=short 2>&1`, opts);
        return { ok: true, output };
    } catch (e) {
        const captured = `${e.stdout ?? ''}${e.stderr ?? ''}`;
        return { ok: false, output: captured || e.message || '' };
    }
 }
 /**
  * Runs the full five-step pipeline for one model and one scenario:
  * requirements, JSON spec, template generation, validation with an LLM fix
  * loop, and finally `uv sync` + pytest. Generated files and intermediate
  * artefacts are written below OUTPUT_DIR.
  *
  * Errors are recorded in `result.error` rather than thrown. The duration,
  * token and throughput totals are filled in on every exit path, including
  * the early exits after a failed requirements or spec step.
  *
  * @param {string} model - Model name.
  * @param {{id: string, prompt: string}} scenario - Scenario to run.
  * @returns {Promise<Object>} Result record (spec status, fix rounds, test counts, timings, error).
  */
 async function runPipeline(model, scenario) {
    const result = {
        model, scenario: scenario.id,
        reqOk: false, specOk: false, specEntities: 0,
        validationIssues: 0, fixRounds: 0,
        testsTotal: 0, testsPassed: 0, testsFailed: 0,
        totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
        error: null,
    };
    const timings = [];
    const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
    mkdirSync(dir, { recursive: true });
    try {
        // 1. Requirements
        console.log(`    [1/5] Vaatimukset...`);
        const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 1024);
        timings.push(req);
        if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
        result.reqOk = true;
        writeFileSync(`${dir}/_requirements.txt`, req.text);
        // 2. JSON spec
        console.log(`    [2/5] JSON-speksi...`);
        const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 2048);
        timings.push(specResp);
        const spec = extractJson(specResp.text);
        if (!spec || !spec.entities || spec.entities.length === 0) {
            result.error = 'JSON-speksi epäonnistui';
            // Keep the raw reply so the failure can be inspected afterwards.
            writeFileSync(`${dir}/_spec_raw.txt`, specResp.text);
            return result;
        }
        result.specOk = true;
        result.specEntities = spec.entities.length;
        writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));
        // 3. Template generation
        console.log(`    [3/5] Koodigenerointi...`);
        const files = {
            'models.py': tmplModels(spec),
            'schemas.py': tmplSchemas(spec),
            'main.py': tmplMain(spec),
            'test_main.py': tmplTests(spec),
            'pyproject.toml': tmplPyproject(spec),
        };
        // 4. Validation + fix loop
        let issues = validateProjectCode(files);
        let fixRound = 0;
        while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
            fixRound++;
            console.log(`    [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
            // Group the issues by the file name embedded in each message.
            const issuesByFile = {};
            for (const issue of issues) {
                const m = issue.match(/^ISSUE:\s*(\S+?):/);
                const fname = m ? m[1] : 'unknown';
                if (!issuesByFile[fname]) issuesByFile[fname] = [];
                issuesByFile[fname].push(issue);
            }
            for (const [fname, fIssues] of Object.entries(issuesByFile)) {
                if (!files[fname]) continue;
                const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
                const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
                timings.push(fixResp);
                if (fixResp.text) {
                    // Strip a surrounding Markdown fence if the model added one anyway.
                    files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
                }
            }
            issues = validateProjectCode(files);
        }
        result.validationIssues = issues.length;
        result.fixRounds = fixRound;
        // Write the generated files to disk
        for (const [fn, content] of Object.entries(files)) writeFileSync(`${dir}/${fn}`, content);
        // 5. Pytest
        console.log(`    [5/5] Pytest...`);
        const run = runPytest(dir);
        writeFileSync(`${dir}/_pytest.txt`, run.output);
        const counts = parsePytestCounts(run.output);
        result.testsPassed = counts.passed;
        // "error" entries are only counted as failures when the run itself failed.
        result.testsFailed = run.ok ? counts.failed : counts.failed + counts.errors;
        result.testsTotal = result.testsPassed + result.testsFailed;
        if (!run.ok && result.testsTotal === 0) result.error = 'Pytest kaatui';
    } catch (e) {
        result.error = e.message;
    } finally {
        // Summary: runs for early returns and failures too, so the time and
        // tokens already spent are always reported.
        result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
        result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
        result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
    }
    return result;
 }
 // === Main ===
 /**
  * Classifies a pipeline result. The same rule feeds the progress line, the
  * result table and the summary counts, so the three always agree.
  * @param {Object} r - Result record produced by the pipeline.
  * @returns {'PASS'|'PARTIAL'|'FAIL'} FAIL on an error or when no test ran,
  *   PASS when every test passed, PARTIAL otherwise.
  */
 function classifyResult(r) {
    if (r.error || r.testsTotal === 0) return 'FAIL';
    return r.testsPassed === r.testsTotal ? 'PASS' : 'PARTIAL';
 }
 /**
  * Resolves the --scenarios value to a scenario list.
  * @param {string} filter - 'all', 'default', or comma-separated scenario ids.
  * @returns {Array<{id: string, prompt: string}>} All scenarios for 'all', the
  *   first one for 'default', the named ones otherwise; unknown ids are
  *   reported and skipped, and the first scenario is used when none match.
  */
 function selectScenarios(filter) {
    if (filter === 'all') return SCENARIOS;
    if (filter === 'default') return [SCENARIOS[0]];
    const wanted = filter.split(',').map((s) => s.trim()).filter(Boolean);
    for (const id of wanted) {
        if (!SCENARIOS.some((s) => s.id === id)) console.warn(`Tuntematon skenaario ohitetaan: ${id}`);
    }
    const chosen = SCENARIOS.filter((s) => wanted.includes(s.id));
    return chosen.length > 0 ? chosen : [SCENARIOS[0]];
 }
 /**
  * Benchmark entry point: lists the models, runs the pipeline for every
  * model × scenario pair, prints a result table and writes results.json.
  * Exits the process with status 1 when the model list cannot be fetched.
  * @returns {Promise<void>}
  */
 async function main() {
    console.log('╔══════════════════════════════════════════════╗');
    console.log('║       Kipinä Model Benchmark                ║');
    console.log('╚══════════════════════════════════════════════╝');
    // Report the endpoint that is actually used (the hub takes precedence).
    console.log(HUB_URL ? `Hub: ${HUB_URL}` : `Ollama: ${OLLAMA_URL}`);
    // Fetch the model list
    let models;
    try {
        models = await ollamaListModels();
    } catch (e) {
        if (HUB_URL) console.error(`Ei yhteyttä hubiin (${HUB_URL}): ${e.message}`);
        else console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`);
        process.exit(1);
    }
    if (FILTER_MODELS) {
        // A model is kept when its name contains any of the requested fragments.
        const filter = FILTER_MODELS.split(',').map(s => s.trim());
        models = models.filter(m => filter.some(f => m.includes(f)));
    }
    console.log(`Mallit (${models.length}): ${models.join(', ')}`);
    const scenarios = selectScenarios(SCENARIO_FILTER);
    console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
    console.log(`Tulokset: ${OUTPUT_DIR}/`);
    console.log('');
    // Clean the output directory.
    // NOTE(review): this recursively deletes whatever --output points at.
    rmSync(OUTPUT_DIR, { recursive: true, force: true });
    mkdirSync(OUTPUT_DIR, { recursive: true });
    const results = [];
    for (const model of models) {
        for (const scenario of scenarios) {
            console.log(`\n━━━ ${model} × ${scenario.id} ━━━`);
            const r = await runPipeline(model, scenario);
            results.push(r);
            const verdict = classifyResult(r);
            let status;
            if (r.error) status = `✗ ${r.error}`;
            else if (verdict === 'PASS') status = `✓ ${r.testsPassed}/${r.testsTotal}`;
            else if (verdict === 'PARTIAL') status = `◐ ${r.testsPassed}/${r.testsTotal}`;
            else status = `✗ ${r.testsPassed}/${r.testsTotal}`;
            console.log(`    → ${status} | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s`);
        }
    }
    // === Result table ===
    console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
    console.log('║                                    TULOKSET                                                     ║');
    console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣');
    // Column widths; the last column (verdict) is not padded.
    const widths = [40, 10, 8, 10, 8, 8, 8];
    const formatRow = (cells) => cells
        .map((cell, i) => (i < widths.length ? cell.padEnd(widths[i]) : cell))
        .join(' │ ');
    const header = formatRow(['Malli', 'Skenaario', 'Speksi', 'Testit', 'Korjaus', 'Aika', 'tok/s', 'Tulos']);
    console.log(`║ ${header} ║`);
    console.log('╠' + '═'.repeat(header.length + 2) + '╣');
    const verdictLabels = { PASS: '✓ PASS', PARTIAL: '◐ PARTIAL', FAIL: '✗ FAIL' };
    for (const r of results) {
        const row = formatRow([
            r.model,
            r.scenario,
            r.specOk ? `✓ ${r.specEntities}e` : '✗',
            r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-',
            r.fixRounds > 0 ? `${r.fixRounds}×` : '-',
            `${(r.totalDurationMs/1000).toFixed(0)}s`,
            `${r.avgTokPerSec.toFixed(0)}`,
            verdictLabels[classifyResult(r)],
        ]);
        console.log(`║ ${row} ║`);
    }
    console.log('╚' + '═'.repeat(header.length + 2) + '╝');
    // Save JSON
    writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
    console.log(`\nJSON: ${OUTPUT_DIR}/results.json`);
    // Summary
    const tally = { PASS: 0, PARTIAL: 0, FAIL: 0 };
    for (const r of results) tally[classifyResult(r)]++;
    console.log(`\n✓ PASS: ${tally.PASS} | ◐ PARTIAL: ${tally.PARTIAL} | ✗ FAIL: ${tally.FAIL} | Yhteensä: ${results.length}`);
 }
 // Script entry point: a rejection from main() is logged and the process exits with status 1.
 main().catch(e => { console.error(e); process.exit(1); });