agentic-studio/kipina-codebench/benchmark.mjs

#!/usr/bin/env node
/**
 * Kipinä CodeBench — LLM-koodingenerointibenchmark
 *
 * Generoi FastAPI-projekteja Ollama-malleilla ja testaa pytest:llä Docker-kontissa.
 *
 * Käyttö:
 *   node benchmark.mjs                                    # kaikki mallit, oletusskenaario
 *   node benchmark.mjs --models qwen3-coder:30b           # yksi malli
 *   node benchmark.mjs --ollama http://host:11434          # eri Ollama
 *   node benchmark.mjs --scenarios all                    # kaikki skenaariot
 *   node benchmark.mjs --output ./results/run-001         # custom output-hakemisto
 */

import { execSync } from 'child_process';
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';

const __dirname = dirname(fileURLToPath(import.meta.url));

// === CLI arguments ===
const args = process.argv.slice(2);

/**
 * Look up the value that follows a `--name` flag on the command line.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @param {*} fallback - Returned when the flag is absent or has no (truthy) value after it.
 * @returns {*} The token following the flag, or `fallback`.
 */
function arg(name, fallback) {
    const flagIndex = args.indexOf(`--${name}`);
    if (flagIndex < 0) return fallback;
    const value = args[flagIndex + 1];
    return value ? value : fallback;
}
// Endpoints and filters; each falls back to the given default when its flag is absent.
const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://127.0.0.1:11434');
const HUB_URL = arg('hub', '');
const FILTER_MODELS = arg('models', '');
const SCENARIO_FILTER = arg('scenarios', 'default');
// ISO timestamp cut to minute precision, with ':' and '.' replaced so it is filename-safe.
const TIMESTAMP = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 16);
const OUTPUT_DIR = arg('output', `/tmp/kipina-benchmark/${TIMESTAMP}`);
const RESULTS_DIR = join(__dirname, 'results');
const THINK_MODE = args.includes('--think');
const COMPACT_MODE = args.includes('--compact');
const LANG = arg('lang', 'python');  // python | rust
// Number of repetitions (documented range 1-10; the upper bound is not enforced).
// A non-numeric or non-positive value falls back to 1 instead of silently
// producing a benchmark that runs zero rounds.
const ROUNDS = (() => {
    const parsed = Number.parseInt(arg('rounds', '1'), 10);
    return Number.isNaN(parsed) || parsed < 1 ? 1 : parsed;
})();
const MAX_FIX_ROUNDS = 2;

// === Loading prompts from files ===
/**
 * Read `prompts/<name>.md` (relative to this module) and return it trimmed.
 *
 * @param {string} name - Prompt file name without the `.md` extension.
 * @returns {string} Trimmed file contents.
 * @throws {Error} When the prompt file does not exist.
 */
function loadPrompt(name) {
    const promptFile = join(__dirname, 'prompts', `${name}.md`);
    if (existsSync(promptFile)) {
        return readFileSync(promptFile, 'utf-8').trim();
    }
    throw new Error(`Prompti puuttuu: ${promptFile}`);
}
// System prompts for the requirements, JSON-spec and fix stages (all three must exist).
const [CLIENT_SYSTEM, SPEC_SYSTEM, FIX_SYSTEM] = ['client', 'spec', 'fix'].map((name) => loadPrompt(name));

// === Per-model profiles ===
// Parsed contents of profiles.json, located next to this module.
const PROFILES = JSON.parse(
    readFileSync(join(__dirname, 'profiles.json'), { encoding: 'utf-8' }),
);
/**
 * Pick the code-generation system prompt for a model.
 *
 * The prompt name comes from the model's own entry in PROFILES, then from its
 * profile, then defaults to 'code'. Candidate files are tried in order:
 * language-specific variant of that name, the name itself, language-specific
 * 'code', and finally plain 'code'.
 *
 * @param {string} model - Model name used as the key into `PROFILES.models`.
 * @returns {{system: string, promptName: string, profile: string}} Prompt text,
 *   the name of the prompt file that was used, and the resolved profile.
 */
function getCodePromptForModel(model) {
    const entry = PROFILES.models[model];
    const profile = entry?.profile || PROFILES.default_profile;
    const baseName = entry?.prompt || PROFILES.profiles[profile]?.prompt || 'code';
    const langSuffix = LANG === 'rust' ? '-rs' : '';
    const promptPath = (name) => join(__dirname, 'prompts', `${name}.md`);

    // First candidate whose file exists wins.
    const chosen = [`${baseName}${langSuffix}`, baseName, `code${langSuffix}`, 'code']
        .find((name) => existsSync(promptPath(name)));

    if (chosen === undefined) {
        // No candidate exists: loadPrompt raises the "missing prompt" error for 'code'.
        return { system: loadPrompt('code'), promptName: 'code', profile: 'large' };
    }
    return { system: readFileSync(promptPath(chosen), 'utf-8').trim(), promptName: chosen, profile };
}

// === Golden examples (selected by language) ===
const GOLDEN_DIR = join(__dirname, 'golden-examples');

// Per-language settings: golden-example subdirectory, the files shown as the
// reference, the files a generated project must contain, and the Docker image
// that runs the tests. `files` and `required` currently list the same paths.
const LANG_CONFIG = (() => {
    const pythonFiles = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];
    const rustFiles = ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'];
    return {
        python: {
            goldenDir: 'todo',
            files: [...pythonFiles],
            required: [...pythonFiles],
            dockerImage: 'kipina-pytest',
        },
        rust: {
            goldenDir: 'todo-rs',
            files: [...rustFiles],
            required: [...rustFiles],
            dockerImage: 'kipina-cargo-test',
        },
    };
})();

// Unknown --lang values fall back to the Python configuration.
const LCONF = LANG_CONFIG[LANG] || LANG_CONFIG.python;

/**
 * Build the reference-implementation text that is prepended to the code prompt.
 *
 * Sources are tried in this order:
 *   1. with --compact: the compact template under `prompts/`,
 *   2. the markdown golden example (code plus explanations),
 *   3. the individual golden files, concatenated with `=== name ===` markers.
 *
 * @returns {string} The example text, or '' when no golden material exists.
 */
function loadGoldenExample() {
    const isRust = LANG === 'rust';
    const readTrimmed = (path) => readFileSync(path, 'utf-8').trim();

    if (COMPACT_MODE) {
        const compactPath = join(__dirname, 'prompts', isRust ? 'golden-compact-rs.md' : 'golden-compact-py.md');
        if (existsSync(compactPath)) return `\n${readTrimmed(compactPath)}\n`;
    }

    const mdPath = join(GOLDEN_DIR, isRust ? 'todo-rs.md' : 'todo.md');
    if (existsSync(mdPath)) return `\n${readTrimmed(mdPath)}\n`;

    // Fallback: separate source files in the language's golden directory.
    const todoDir = join(GOLDEN_DIR, LCONF.goldenDir);
    if (!existsSync(todoDir)) return '';

    const header = `\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n`;
    const sections = LCONF.files
        .map((f) => [f, join(todoDir, f)])
        .filter(([, path]) => existsSync(path))
        .map(([f, path]) => `=== ${f} ===\n${readTrimmed(path)}\n\n`);
    return header + sections.join('');
}
const GOLDEN_EXAMPLE = loadGoldenExample();

// === Stripping thinking tags (gemma4, qwen3/3.5, etc.) ===
/**
 * Remove complete "thinking" sections from a model response and trim the rest.
 *
 * @param {string} text - Raw model output.
 * @returns {string} Output without closed thinking blocks; unclosed blocks are left as-is.
 */
function stripThinking(text) {
    const thinkingBlocks = [
        /<\|channel>thought[\s\S]*?<channel\|>/g,  // gemma4
        /<think>[\s\S]*?<\/think>/g,                // qwen3, qwen3.5
    ];
    let cleaned = text;
    for (const pattern of thinkingBlocks) {
        cleaned = cleaned.replace(pattern, '');
    }
    return cleaned.trim();
}

// === Ollama / Hub client ===
/**
 * Send one prompt to a model and return the cleaned response with timing data.
 *
 * When HUB_URL is set the request goes to the Hub's chat-completions endpoint;
 * otherwise it goes directly to Ollama's /api/chat (non-streaming).
 *
 * @param {string} model - Model name.
 * @param {string} prompt - User prompt.
 * @param {string} systemPrompt - System prompt; omitted from the Ollama messages when falsy.
 * @param {number} [maxTokens=2048] - Generation limit (tripled for Ollama when --think is on).
 * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
 * @throws {Error} When the HTTP response is not OK.
 */
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
    const startedAt = Date.now();
    const jsonHeaders = { 'Content-Type': 'application/json' };

    if (HUB_URL) {
        const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
        const hubResp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
            method: 'POST',
            headers: jsonHeaders,
            body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
        });
        if (!hubResp.ok) throw new Error(`Hub HTTP ${hubResp.status}: ${await hubResp.text()}`);
        const payload = await hubResp.json();
        const durationMs = Date.now() - startedAt;
        const tokens = payload.tokens_generated || 0;
        return {
            text: stripThinking((payload.response || '').trim()),
            tokens,
            durationMs,
            // Prefer the Hub's own rate; otherwise derive it from wall-clock time.
            tokPerSec: payload.tokens_per_sec || tokens / (durationMs / 1000),
        };
    }

    // Direct Ollama route
    const userMessage = { role: 'user', content: prompt };
    const messages = systemPrompt
        ? [{ role: 'system', content: systemPrompt }, userMessage]
        : [userMessage];
    const options = {
        num_predict: THINK_MODE ? maxTokens * 3 : maxTokens,
        num_ctx: 16384,
        temperature: 0.7,
        top_k: 40,
        repeat_penalty: 1.15,
    };

    const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
        method: 'POST',
        headers: jsonHeaders,
        body: JSON.stringify({ model, messages, stream: false, think: THINK_MODE, options }),
    });
    if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
    const payload = await resp.json();
    const durationMs = Date.now() - startedAt;

    const content = (payload.message?.content || '').trim();
    const thinking = (payload.message?.thinking || '').trim();
    // If the model produced only thinking output, that text is used as the answer.
    if (!content && thinking) console.log(`      ⚠ thinking-malli: ${thinking.length} merkkiä ajattelua, content tyhjä`);

    const tokens = payload.eval_count || 0;
    // eval_duration is in nanoseconds; default to 1 ns to avoid dividing by zero.
    const evalSeconds = (payload.eval_duration || 1) / 1e9;
    return {
        text: stripThinking(content || thinking),
        tokens,
        durationMs,
        tokPerSec: tokens / evalSeconds,
    };
}

/**
 * Fetch the names of the available models from the Hub (when HUB_URL is set) or from Ollama.
 *
 * @returns {Promise<string[]>} Model names; empty when the response has no `models` field.
 * @throws {Error} When the HTTP response is not OK.
 */
async function ollamaListModels() {
    const tagsUrl = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
    const resp = await fetch(tagsUrl);
    if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
    const { models } = await resp.json();
    return (models || []).map(({ name }) => name);
}

// === Parsing test results (pytest + cargo test) ===
/**
 * Extract pass/fail counts from pytest or `cargo test` output.
 *
 * Detection order:
 *   1. cargo `test result:` lines — summed over every test binary,
 *   2. pytest counts ("6 passed", "2 failed"; errors are added to failures),
 *   3. Rust compiler errors (`error[E....]`), each counted as one failure,
 *   4. a pytest summary that contains only errors (e.g. collection failures),
 *   5. otherwise all zeros.
 *
 * @param {string} output - Combined stdout/stderr of the test run.
 * @returns {{testsPassed: number, testsFailed: number, testsTotal: number}}
 */
function parseTestOutput(output) {
    const toInt = (s) => Number.parseInt(s, 10);
    const summary = (passed, failed) => ({ testsPassed: passed, testsFailed: failed, testsTotal: passed + failed });

    // Cargo test: "test result: ok. 10 passed; 0 failed;"
    // Checked first because "N passed" would otherwise match the pytest pattern
    // and only the first binary's line (often "0 passed") would be counted.
    const cargoResults = [...output.matchAll(/test result: \w+\.\s*(\d+) passed;\s*(\d+) failed/g)];
    if (cargoResults.length > 0) {
        let passed = 0;
        let failed = 0;
        for (const [, p, f] of cargoResults) {
            passed += toInt(p);
            failed += toInt(f);
        }
        return summary(passed, failed);
    }

    // Pytest: "6 passed", "2 failed", "1 error"
    const pyPassed = output.match(/(\d+) passed/);
    const pyFailed = output.match(/(\d+) failed/);
    const pyError = output.match(/(\d+) error/);
    if (pyPassed || pyFailed) {
        const passed = pyPassed ? toInt(pyPassed[1]) : 0;
        const failed = (pyFailed ? toInt(pyFailed[1]) : 0) + (pyError ? toInt(pyError[1]) : 0);
        return summary(passed, failed);
    }

    // Cargo compilation error: count "error[E" occurrences
    const compileErrors = (output.match(/error\[E\d+\]/g) || []).length;
    if (compileErrors > 0) {
        return summary(0, compileErrors);
    }

    // Pytest summary with errors only ("1 error in 0.31s"): count them as
    // failures so the run is not reported as having no tests at all.
    const pyErrorsOnly = output.match(/(\d+) errors? in \d/);
    if (pyErrorsOnly) {
        return summary(0, toInt(pyErrorsOnly[1]));
    }

    return summary(0, 0);
}

// === File parser for LLM responses ===
/**
 * Split an LLM response into files using `=== path.ext ===` markers
 * (extensions: py, toml, rs).
 *
 * A surrounding markdown code fence is stripped from each section. Sections
 * with empty content are dropped. Names that are absolute or contain a `..`
 * segment are ignored: callers join these names onto an output directory and
 * write them to disk, so such names could escape that directory.
 *
 * @param {string} text - Model output.
 * @returns {Object<string, string>} Map of relative file name to content (newline-terminated).
 */
function parseGeneratedFiles(text) {
    const isUnsafeName = (name) =>
        name.startsWith('/') ||
        name.startsWith('\\') ||
        /^[A-Za-z]:/.test(name) ||
        name.split(/[\\/]/).includes('..');

    const files = {};
    // split() with a capture group yields [preamble, name1, body1, name2, body2, ...]
    const sections = text.split(/===\s*(\S+\.(?:py|toml|rs))\s*===/);
    for (let i = 1; i < sections.length - 1; i += 2) {
        const name = sections[i];
        if (isUnsafeName(name)) continue;
        let content = sections[i + 1].trim();
        content = content.replace(/^```(?:python|toml|rust)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim();
        if (content) files[name] = content + '\n';
    }
    return files;
}

// === Validator ===
/**
 * Run cheap static checks on the generated Python files.
 *
 * Checks per `.py` file:
 *   - relative imports (`from .x import ...`),
 *   - names imported from models/schemas/main that do not occur in that file's text,
 *   - `date` / `datetime` annotations in schemas.py without a `from datetime import`,
 *   - bare lowercase `true` / `false` on non-comment, non-blank lines.
 *
 * @param {Object<string, string>} files - Map of file name to source text.
 * @returns {string[]} Issue messages, each prefixed with `ISSUE: <file>:`.
 */
function validateProjectCode(files) {
    const issues = [];
    for (const [fname, code] of Object.entries(files)) {
        if (!fname.endsWith('.py')) continue;
        const lines = code.split('\n');
        for (const line of lines) {
            if (/^from\s+\.(\w*)\s+import/.test(line)) issues.push(`ISSUE: ${fname}: relatiivinen import`);
        }
        for (const line of lines) {
            const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
            if (!m) continue;
            const srcCode = files[m[1] + '.py'];
            if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
            // Drop a trailing comment, parentheses and line-continuation backslashes
            // so `from models import (A, B)  # note` yields the names A and B.
            const names = m[2]
                .replace(/#.*$/, '')
                .replace(/[()\\]/g, ' ')
                .split(',')
                .map(n => n.trim().split(/\s+as\s+/)[0].trim());
            for (const name of names) {
                // Wildcard imports cannot be checked by name.
                if (!name || name === '*') continue;
                if (!srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
            }
        }
        if (fname === 'schemas.py') {
            if (/:\s*date\b/.test(code) && !/from datetime import/.test(code))
                issues.push('ISSUE: schemas.py: date-import puuttuu');
            if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code))
                issues.push('ISSUE: schemas.py: datetime-import puuttuu');
        }
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            // Skip comment-only and blank lines.
            if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue;
            // Flag the lowercase literal unless it touches a word character or a double quote.
            if (/(?<!["\w])false(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "false" → "False"`);
            if (/(?<!["\w])true(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "true" → "True"`);
        }
    }
    return issues;
}

/**
 * Extract the first parseable top-level JSON object from a model response.
 *
 * If the text contains a markdown code fence, only the fence body is scanned.
 * Brace depth is tracked character by character (braces inside JSON strings
 * are not treated specially); each balanced `{...}` span is handed to
 * JSON.parse and the first one that parses is returned.
 *
 * @param {string} text - Model output.
 * @returns {object|null} Parsed object, or null when none is found.
 */
function extractJson(text) {
    const fenced = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
    const source = fenced ? fenced[1].trim() : text;

    let depth = 0;
    let start = null;
    for (let i = 0; i < source.length; i++) {
        const ch = source[i];
        if (ch === '{') {
            if (depth === 0) start = i;
            depth++;
        } else if (ch === '}') {
            // A stray closer outside any object must not push depth below zero;
            // otherwise no later '{' would ever be recognised as an object start.
            if (depth === 0) continue;
            depth--;
            if (depth === 0 && start !== null) {
                try {
                    return JSON.parse(source.slice(start, i + 1));
                } catch {
                    // Not valid JSON — keep scanning for the next balanced span.
                }
            }
        }
    }
    return null;
}

// === Test scenarios ===
// Each scenario has an `id` (compared against --scenarios and used in the
// per-run output directory name) and a `prompt` that is sent to the model as
// the user message of the first pipeline step.
const SCENARIOS = [
    { id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
    { id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
    { id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
];

// === Scoring (0–100) and star rating ===
/**
 * Compute the 0–100 score for one pipeline result.
 *
 * Components: 10 for a valid spec, 10 when there is no error or at least one
 * test was counted, up to 60 proportional to the test pass rate, and a fix
 * bonus of 20 that shrinks by 10 per fix round (never below 0).
 * A run that errored without any counted tests scores 0.
 *
 * @param {object} r - Result with `error`, `specOk`, `testsTotal`, `testsPassed`, `fixRounds`.
 * @returns {number} Score capped at 100.
 */
function scoreResult(r) {
    if (r.error && r.testsTotal === 0) return 0;

    const specPoints = r.specOk ? 10 : 0;
    const ranPoints = (!r.error || r.testsTotal > 0) ? 10 : 0;
    const testPoints = r.testsTotal > 0 ? Math.round((r.testsPassed / r.testsTotal) * 60) : 0;
    const fixBonus = Math.max(0, 20 - r.fixRounds * 10);

    return Math.min(100, specPoints + ranPoints + testPoints + fixBonus);
}
/**
 * Map a numeric score to a five-character star string.
 *
 * Thresholds: >=90 five stars, >=70 four, >=50 three, >=25 two,
 * any other positive score one, and zero stars otherwise.
 *
 * @param {number} score - Score, normally 0–100.
 * @returns {string} Star rating.
 */
function starsForScore(score) {
    const tiers = [
        [90, '★★★★★'],
        [70, '★★★★☆'],
        [50, '★★★☆☆'],
        [25, '★★☆☆☆'],
    ];
    for (const [minimum, stars] of tiers) {
        if (score >= minimum) return stars;
    }
    return score > 0 ? '★☆☆☆☆' : '☆☆☆☆☆';
}

// === Pipeline: one model × one scenario ===
/**
 * Run the full benchmark pipeline for a single model and scenario:
 *   1. requirements, 2. JSON spec, 3. code generation,
 *   4. static validation with a fix loop (Python only),
 *   5. tests in a Docker container with a self-repair loop.
 *
 * Artifacts (requirements, spec, raw code, generated files, test output) are
 * written under `<OUTPUT_DIR>/<model>__<scenario>`. Errors thrown by any step
 * are caught and stored in `result.error`; the function itself does not throw
 * for those. The timing/score summary is filled in on every exit path,
 * including early failures.
 *
 * @param {string} model - Model name.
 * @param {{id: string, prompt: string}} scenario - Scenario to run.
 * @returns {Promise<object>} Result record (counts, timings, score, stars, error).
 */
async function runPipeline(model, scenario) {
    const result = {
        model, scenario: scenario.id,
        reqOk: false, specOk: false, specEntities: 0,
        validationIssues: 0, fixRounds: 0,
        testsTotal: 0, testsPassed: 0, testsFailed: 0,
        totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
        promptChars: 0, promptTokensEst: 0,
        score: 0, stars: '',
        error: null,
    };
    const timings = [];
    const { system: CODE_SYSTEM, promptName, profile } = getCodePromptForModel(model);
    // '/' and ':' in model names are replaced so the name is a single path segment.
    const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
    mkdirSync(dir, { recursive: true });

    try {
        // 1. Requirements
        console.log(`    [1/5] Vaatimukset...`);
        const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 2048);
        timings.push(req);
        if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
        result.reqOk = true;
        writeFileSync(`${dir}/_requirements.txt`, req.text);

        // 2. JSON spec
        console.log(`    [2/5] JSON-speksi...`);
        const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 4096);
        timings.push(specResp);
        const spec = extractJson(specResp.text);
        if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'JSON-speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; }
        result.specOk = true;
        result.specEntities = spec.entities.length;
        writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));

        // 3. LLM code generation
        console.log(`    [3/5] Koodigenerointi (LLM)...`);
        const fileCount = LCONF.required.length;
        const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`;
        result.promptChars = CODE_SYSTEM.length + codePrompt.length;
        // Rough estimate: about four characters per token.
        result.promptTokensEst = Math.round(result.promptChars / 4);
        const codeTokens = LANG === 'rust' ? 12288 : 8192;
        const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, codeTokens);
        timings.push(codeResp);
        writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
        const files = parseGeneratedFiles(codeResp.text);
        const missing = LCONF.required.filter(f => !files[f]);
        if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; }

        // 4. Validation + fix loop (Python-specific)
        let fixRound = 0;
        if (LANG === 'python') {
            let issues = validateProjectCode(files);
            while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
                fixRound++;
                console.log(`    [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
                // Group issues by the file name that follows the "ISSUE:" prefix.
                const issuesByFile = {};
                for (const issue of issues) {
                    const m = issue.match(/^ISSUE:\s*(\S+?):/);
                    const fname = m ? m[1] : 'unknown';
                    if (!issuesByFile[fname]) issuesByFile[fname] = [];
                    issuesByFile[fname].push(issue);
                }
                for (const [fname, fIssues] of Object.entries(issuesByFile)) {
                    if (!files[fname]) continue;
                    const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
                    const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
                    timings.push(fixResp);
                    if (fixResp.text) {
                        files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
                    }
                }
                issues = validateProjectCode(files);
            }
            result.validationIssues = issues.length;
        }
        result.fixRounds = fixRound;

        // 5. Tests in a Docker container + self-repair loop (level 4)
        const testLabel = LANG === 'rust' ? 'Cargo test' : 'Pytest';
        const dockerTimeout = LANG === 'rust' ? 300000 : 120000;
        const MAX_TEST_FIX = 3;
        // Output of the most recent test (or syntax-check) round.
        let latestTestOut = '';

        for (let testRound = 0; testRound <= MAX_TEST_FIX; testRound++) {
            // Write the files to disk
            for (const [fn, content] of Object.entries(files)) {
                const filePath = join(dir, fn);
                mkdirSync(dirname(filePath), { recursive: true });
                writeFileSync(filePath, content);
            }

            // Quick static analysis before the Docker run
            const pyFiles = Object.keys(files).filter(f => f.endsWith('.py'));
            if (LANG === 'python' && pyFiles.length > 0) {
                let syntaxErrors = '';
                for (const f of pyFiles) {
                    try {
                        // The path is passed through the environment rather than
                        // interpolated into the command: file names come from
                        // model output and must not reach the shell.
                        execSync(
                            'python3 -c "import os, py_compile; py_compile.compile(os.environ[\'KIPINA_PY_FILE\'], doraise=True)"',
                            { timeout: 5000, encoding: 'utf-8', stdio: 'pipe', env: { ...process.env, KIPINA_PY_FILE: join(dir, f) } },
                        );
                    } catch (e) {
                        syntaxErrors += `${f}: ${(e.stderr || e.message || '').split('\n').filter(l => l.includes('Error')).join('; ')}\n`;
                    }
                }
                if (syntaxErrors) {
                    console.log(`    [5/5] ⚠ Syntaksivirhe — ohitetaan Docker`);
                    // Go straight to self-repair without a Docker run
                    latestTestOut = `SYNTAX ERRORS:\n${syntaxErrors}`;
                    writeFileSync(`${dir}/_testout_${testRound}.txt`, latestTestOut);
                    Object.assign(result, { testsPassed: 0, testsFailed: 1, testsTotal: 1 });

                    if (testRound >= MAX_TEST_FIX) { result.error = 'Syntaksivirhe'; break; }

                    console.log(`    [5/5] Itsekorjaus: syntaksi...`);
                    const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
                    const fixPrompt = `Fix the following syntax errors. Return ALL files with === markers.\n\nERRORS:\n${syntaxErrors}\n\nCURRENT CODE:\n${allCode}`;
                    const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 8192);
                    timings.push(fixResp);
                    const fixedFiles = parseGeneratedFiles(fixResp.text);
                    for (const [fn, content] of Object.entries(fixedFiles)) {
                        if (LCONF.required.includes(fn)) files[fn] = content;
                    }
                    result.fixRounds++;
                    continue;  // Run again
                }
            }

            const roundLabel = testRound > 0 ? ` (korjaus ${testRound}/${MAX_TEST_FIX})` : '';
            console.log(`    [5/5] ${testLabel}${roundLabel}...`);

            let testOut = '';
            try {
                testOut = execSync(
                    `docker run --rm -v "${dir}:/src:ro" ${LCONF.dockerImage} 2>&1`,
                    { timeout: dockerTimeout, encoding: 'utf-8' }
                );
            } catch (e) {
                // A non-zero exit (failing tests) also lands here; keep whatever output exists.
                testOut = e.stdout || e.stderr || e.message || '';
            }
            latestTestOut = testOut;
            writeFileSync(`${dir}/_testout_${testRound}.txt`, testOut);
            Object.assign(result, parseTestOutput(testOut));

            // All tests pass → done
            if (result.testsTotal > 0 && result.testsPassed === result.testsTotal) break;

            // Last round, no further repair attempts
            if (testRound >= MAX_TEST_FIX) {
                if (result.testsTotal === 0) result.error = 'Testit kaatuivat';
                break;
            }

            // Self-repair: feed the errors + code back to the model
            const errorLines = testOut.split('\n').filter(l => /^E |FAILED|ERROR|error\[E/.test(l)).slice(0, 20).join('\n');
            if (!errorLines) break;  // No parseable errors

            console.log(`    [5/5] Itsekorjaus: ${result.testsFailed || 'virhe'}...`);
            const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
            const fixPrompt = `The following test errors occurred. Fix the code so ALL tests pass. Return ALL files with === markers.\n\nERRORS:\n${errorLines}\n\nCURRENT CODE:\n${allCode}`;
            const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, LANG === 'rust' ? 12288 : 8192);
            timings.push(fixResp);

            const fixedFiles = parseGeneratedFiles(fixResp.text);
            // Update only the (required) files the model returned
            for (const [fn, content] of Object.entries(fixedFiles)) {
                if (LCONF.required.includes(fn)) files[fn] = content;
            }
            result.fixRounds++;
        }
        // Copy of the latest round's output (previously this file was always written empty).
        writeFileSync(`${dir}/_testout.txt`, latestTestOut);
    } catch (e) {
        result.error = e.message;
    } finally {
        // Summary — in `finally` so early returns above still record timings,
        // tokens, score, stars and profile for failed runs.
        result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
        result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
        result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
        result.score = scoreResult(result);
        result.stars = starsForScore(result.score);
        result.profile = profile;
        result.promptName = promptName;
    }

    return result;
}

// === Main ===
/**
 * Benchmark entry point: list models, unload any models currently loaded in
 * Ollama, run every model × scenario (× round) through `runPipeline`, print
 * the result tables, and write the JSON/HTML reports to OUTPUT_DIR and the
 * `results/` archive.
 *
 * Exits the process with status 1 when the model list cannot be fetched.
 *
 * @returns {Promise<void>}
 */
async function main() {
    // Median of a numeric array; for even lengths the two middle values are averaged and rounded.
    const median = arr => { const s = [...arr].sort((a,b) => a-b); const m = Math.floor(s.length/2); return s.length % 2 ? s[m] : Math.round((s[m-1]+s[m])/2); };

    console.log('╔══════════════════════════════════════════════╗');
    console.log('║       Kipinä CodeBench                      ║');
    console.log('╚══════════════════════════════════════════════╝');
    console.log(`Ollama: ${OLLAMA_URL}  📝 ${LANG}${COMPACT_MODE ? ' (compact)' : ''}${THINK_MODE ? '  🧠 thinking ON' : ''}`);

    // Fetch the model list
    let models;
    try {
        models = await ollamaListModels();
    } catch (e) {
        console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`);
        process.exit(1);
    }

    // Clear VRAM — unload every loaded model (keep_alive: 0)
    try {
        const psResp = await fetch(`${OLLAMA_URL}/api/ps`);
        const psData = await psResp.json();
        for (const m of (psData.models || [])) {
            await fetch(`${OLLAMA_URL}/api/generate`, {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ model: m.name, keep_alive: 0 }),
            });
            console.log(`  ♻ Vapautettu: ${m.name}`);
        }
    } catch (e) { /* best effort — not critical */ }

    if (FILTER_MODELS) {
        // Comma-separated substrings; a model is kept when it contains any of them.
        const filter = FILTER_MODELS.split(',').map(s => s.trim());
        models = models.filter(m => filter.some(f => m.includes(f)));
    }

    console.log(`Mallit (${models.length}): ${models.join(', ')}`);

    // 'all' selects every scenario; an unknown id falls back to the first scenario.
    const matchingScenarios = SCENARIOS.filter(s => s.id === SCENARIO_FILTER);
    const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS :
        matchingScenarios.length > 0 ? matchingScenarios :
        [SCENARIOS[0]];
    console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
    if (ROUNDS > 1) console.log(`Toistoja: ${ROUNDS}`);
    console.log(`Tulokset: ${OUTPUT_DIR}/`);
    console.log('');

    // Clean the output directory
    rmSync(OUTPUT_DIR, { recursive: true, force: true });
    mkdirSync(OUTPUT_DIR, { recursive: true });

    const results = [];

    for (let round = 1; round <= ROUNDS; round++) {
        if (ROUNDS > 1) console.log(`\n╔═══ Kierros ${round}/${ROUNDS} ═══╗`);
        for (const model of models) {
            for (const scenario of scenarios) {
                const roundLabel = ROUNDS > 1 ? ` [${round}/${ROUNDS}]` : '';
                console.log(`\n━━━ ${model} × ${scenario.id}${roundLabel} ━━━`);
                const r = await runPipeline(model, scenario);
                if (ROUNDS > 1) r.round = round;
                results.push(r);

                const status = r.error ? `✗ ${r.error}` :
                    r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` :
                    `◐ ${r.testsPassed}/${r.testsTotal}`;
                const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
                console.log(`    → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
            }
        }
        // Interim report after each round
        if (ROUNDS > 1) {
            console.log(`\n┌─── Tilanne kierroksen ${round}/${ROUNDS} jälkeen ───┐`);
            for (const model of [...new Set(results.map(r => r.model))]) {
                const mrs = results.filter(r => r.model === model);
                for (const sid of scenarios.map(s => s.id)) {
                    const runs = mrs.filter(r => r.scenario === sid);
                    if (runs.length === 0) continue;
                    const scores = runs.map(r => r.score);
                    const med = median(scores);
                    const last = scores[scores.length - 1];
                    // Trend compares the latest score with the one before it.
                    const trend = scores.length > 1 ? (last > scores[scores.length - 2] ? '▲' : last < scores[scores.length - 2] ? '▼' : '─') : '';
                    console.log(`│ ${model.padEnd(28)} ${sid.padEnd(7)} ${starsForScore(med)} med:${String(med).padStart(3)}p  [${scores.join(',')}] ${trend}`);
                }
            }
            console.log(`└${'─'.repeat(45)}┘`);
        }
    } // rounds

    // === Results table ===
    console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
    console.log('║                                    TULOKSET                                                     ║');
    console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣');

    const header = [
        'Malli'.padEnd(40),
        'Skenaario'.padEnd(10),
        'Speksi'.padEnd(8),
        'Testit'.padEnd(10),
        'Korjaus'.padEnd(8),
        'Ctx'.padEnd(7),
        'Aika'.padEnd(8),
        'tok/s'.padEnd(8),
        'Pisteet',
    ].join(' │ ');
    console.log(`║ ${header} ║`);
    console.log('╠' + '═'.repeat(header.length + 2) + '╣');

    for (const r of results) {
        const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗';
        const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
        const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
        const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
        const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
        const speed = `${r.avgTokPerSec.toFixed(0)}`;
        const row = [
            r.model.padEnd(40),
            r.scenario.padEnd(10),
            specStatus.padEnd(8),
            testStatus.padEnd(10),
            fixStatus.padEnd(8),
            ctx.padEnd(7),
            time.padEnd(8),
            speed.padEnd(8),
            `${r.stars} ${r.score}`,
        ].join(' │ ');
        console.log(`║ ${row} ║`);
    }
    console.log('╚' + '═'.repeat(header.length + 2) + '╝');

    // === Per-model summary ===
    const modelNames = [...new Set(results.map(r => r.model))];
    const scenarioIds = scenarios.map(s => s.id);

    console.log('\n');
    const mHeader = [
        'Malli'.padEnd(35),
        ...scenarioIds.map(s => s.padEnd(22)),
        'Yht.'.padEnd(8),
        'Out'.padEnd(7),
        'Aika'.padEnd(8),
        'tok/s'.padEnd(7),
        'Pisteet',
    ].join(' │ ');
    console.log(mHeader);
    console.log('─'.repeat(mHeader.length));

    for (const model of modelNames) {
        const mrs = results.filter(r => r.model === model);
        const cols = scenarioIds.map(sid => {
            // With several rounds this shows the first run of the scenario.
            const r = mrs.find(r => r.scenario === sid);
            if (!r) return '-'.padEnd(22);
            const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
            const s = `${(r.totalDurationMs/1000).toFixed(0)}s`;
            const tok = r.totalTokens > 1000 ? `${(r.totalTokens/1000).toFixed(1)}K` : `${r.totalTokens}`;
            return `${t} ${s} ${tok}`.padEnd(22);
        });
        const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0);
        const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0);
        const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0);
        const totalTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0);
        const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0;
        const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0;
        const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0;
        const tokStr = totalTokens > 1000 ? `${(totalTokens/1000).toFixed(1)}K` : `${totalTokens}`;
        const row = [
            model.padEnd(35),
            ...cols,
            `${totalPassed}/${totalTests}`.padEnd(8),
            tokStr.padEnd(7),
            `${(totalTime/1000).toFixed(0)}s`.padEnd(8),
            `${avgSpeed}`.padEnd(7),
            `${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`,
        ].join(' │ ');
        console.log(row);
    }

    // Save the JSON + HTML report
    const jsonData = JSON.stringify(results, null, 2);
    writeFileSync(`${OUTPUT_DIR}/results.json`, jsonData);
    const templatePath = join(__dirname, 'report-template.html');
    let htmlData = '';
    if (existsSync(templatePath)) {
        // '<' is escaped so text such as "</script>" inside a result cannot end
        // the surrounding element (presumably a <script> block — confirm against
        // the template). A replacer function is used because a replacement
        // *string* would have `$&`, `$'`, `$1`… sequences in the data expanded.
        const embedded = JSON.stringify(results).replace(/</g, '\\u003c');
        htmlData = readFileSync(templatePath, 'utf-8').replace('/*DATA_PLACEHOLDER*/[]', () => embedded);
        writeFileSync(`${OUTPUT_DIR}/report.html`, htmlData);
        console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`);
    }
    console.log(`JSON: ${OUTPUT_DIR}/results.json`);

    // Copy into the results/ folder under the run timestamp
    mkdirSync(RESULTS_DIR, { recursive: true });
    writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.json`), jsonData);
    if (htmlData) writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.html`), htmlData);
    console.log(`Arkistoitu: results/${TIMESTAMP}.json`);

    // Overall summary
    const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
    const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
    const failed = results.filter(r => r.error || r.testsTotal === 0);
    const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
    const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0);
    console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`);

    // === Round summary (when rounds > 1) ===
    if (ROUNDS > 1) {
        console.log('\n\n╔══════════════════════════════════════════════╗');
        console.log('║       KIERROSYHTEENVETO (mediaani)            ║');
        console.log('╚══════════════════════════════════════════════╝\n');

        for (const model of modelNames) {
            const mrs = results.filter(r => r.model === model);
            for (const sid of scenarioIds) {
                const runs = mrs.filter(r => r.scenario === sid);
                if (runs.length === 0) continue;
                const scores = runs.map(r => r.score);
                const med = median(scores);
                const min = Math.min(...scores);
                const max = Math.max(...scores);
                const passRates = runs.map(r => r.testsTotal > 0 ? Math.round(r.testsPassed/r.testsTotal*100) : 0);
                console.log(`${model.padEnd(30)} ${sid.padEnd(8)} ${starsForScore(med)} med:${med}p  min:${min} max:${max}  pass:[${passRates.join(',')}]%`);
            }
        }
    }
}

// Entry point: run the benchmark; log any unhandled rejection and exit with status 1.
main().catch(e => { console.error(e); process.exit(1); });