agentic-studio/network-poc/tests/model-benchmark.mjs

#!/usr/bin/env node
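/**
 * Kipinä Model Benchmark
 *
 * Generates projects with different Ollama models and tests whether they work.
 * Usage:
 *   node model-benchmark.mjs                          # all models, default scenario
 *   node model-benchmark.mjs --models qwen3:8b,qwen3:30b
 *   node model-benchmark.mjs --ollama http://host:11434
 *   node model-benchmark.mjs --hub https://kipina.studio       # go through the hub instead of Ollama
 *   node model-benchmark.mjs --scenarios all          # all scenarios
 *   node model-benchmark.mjs --output /tmp/kipina-benchmark    # results directory (wiped on start)
 */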

import { execSync } from 'child_process';
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';

const __dirname = dirname(fileURLToPath(import.meta.url));

// === CLI arguments ===
const args = process.argv.slice(2);

/**
 * Look up the value of a command-line option.
 *
 * Accepts both `--name value` and `--name=value`. A token that itself starts
 * with `--` is never consumed as a value, so `--hub --models x` leaves `hub`
 * at its fallback instead of setting it to "--models".
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} fallback - Returned when the option is absent or has no (non-empty) value.
 * @returns {string} The option value, or `fallback`.
 */
function arg(name, fallback) {
    const flag = `--${name}`;

    // Inline form: --name=value
    const inlinePrefix = `${flag}=`;
    const inline = args.find((token) => token.startsWith(inlinePrefix));
    if (inline !== undefined) {
        const inlineValue = inline.slice(inlinePrefix.length);
        return inlineValue ? inlineValue : fallback;
    }

    // Separated form: --name value
    const i = args.indexOf(flag);
    if (i < 0) return fallback;
    const value = args[i + 1];
    return value && !value.startsWith('--') ? value : fallback;
}
// Ollama base URL: --ollama flag first, then the OLLAMA_URL environment variable, then localhost.
const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434');
// Optional hub base URL; when non-empty, chat and tag requests go to the hub endpoints instead of Ollama.
const HUB_URL = arg('hub', '');  // Alternative: --hub https://kipina.studio
// Comma-separated model-name filter; empty means every model the backend lists.
const FILTER_MODELS = arg('models', '');
// Scenario selection: 'all' runs every entry of SCENARIOS, the default value runs only the first.
const SCENARIO_FILTER = arg('scenarios', 'default');
// Results directory; main() deletes and recreates it at the start of every run.
const OUTPUT_DIR = arg('output', '/tmp/kipina-benchmark');
// Upper bound on validate → LLM-fix iterations per pipeline run.
const MAX_FIX_ROUNDS = 2;

// === Reasoning-tag cleanup (gemma4, qwen3/3.5 etc.) ===
/**
 * Remove model "thinking" sections from a reply and trim the remainder.
 *
 * @param {string} text - Raw model output.
 * @returns {string} The output without closed reasoning blocks, whitespace-trimmed.
 */
function stripThinking(text) {
    const reasoningBlocks = [
        /<\|channel>thought[\s\S]*?<channel\|>/g,  // gemma4
        /<think>[\s\S]*?<\/think>/g,                // qwen3, qwen3.5
    ];
    let cleaned = text;
    for (const pattern of reasoningBlocks) {
        cleaned = cleaned.replace(pattern, '');
    }
    return cleaned.trim();
}

// === Ollama / Hub client ===
/**
 * Send one prompt to a model and return the cleaned reply with timing stats.
 *
 * When HUB_URL is set the request goes to the hub's /api/v1/chat/completions
 * endpoint; otherwise it goes straight to Ollama's /api/chat.
 *
 * @param {string} model - Model name passed through to the backend.
 * @param {string} prompt - User message.
 * @param {string} systemPrompt - System message; left out of the Ollama message list when falsy.
 * @param {number} [maxTokens=2048] - Generation cap (sent as max_tokens / num_predict).
 * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
 * @throws {Error} When the backend answers with a non-OK HTTP status.
 */
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
    const startedAt = Date.now();
    const jsonHeaders = { 'Content-Type': 'application/json' };

    if (HUB_URL) {
        // Hub route: /api/v1/chat/completions
        const stamp = Date.now();
        const suffix = Math.random().toString(36).slice(2,8);
        const payload = {
            model,
            prompt,
            task_id: `bench-${stamp}-${suffix}`,
            system_prompt: systemPrompt,
            max_tokens: maxTokens,
        };
        const hubResp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
            method: 'POST',
            headers: jsonHeaders,
            body: JSON.stringify(payload),
        });
        if (!hubResp.ok) {
            throw new Error(`Hub HTTP ${hubResp.status}: ${await hubResp.text()}`);
        }
        const hubData = await hubResp.json();
        const hubElapsedMs = Date.now() - startedAt;
        const generated = hubData.tokens_generated || 0;
        return {
            text: stripThinking((hubData.response || '').trim()),
            tokens: generated,
            durationMs: hubElapsedMs,
            // Prefer the hub's own rate; otherwise derive it from wall-clock time.
            tokPerSec: hubData.tokens_per_sec || generated / (hubElapsedMs / 1000),
        };
    }

    // Direct Ollama route: /api/chat
    const messages = [
        ...(systemPrompt ? [{ role: 'system', content: systemPrompt }] : []),
        { role: 'user', content: prompt },
    ];
    const requestBody = {
        model,
        messages,
        stream: false,
        options: { num_predict: maxTokens, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
    };
    const ollamaResp = await fetch(`${OLLAMA_URL}/api/chat`, {
        method: 'POST',
        headers: jsonHeaders,
        body: JSON.stringify(requestBody),
    });
    if (!ollamaResp.ok) {
        throw new Error(`Ollama HTTP ${ollamaResp.status}: ${await ollamaResp.text()}`);
    }
    const reply = await ollamaResp.json();
    const durationMs = Date.now() - startedAt;
    const tokens = reply.eval_count || 0;
    // eval_duration is divided by 1e9 (nanoseconds → seconds); fall back to 1 to avoid dividing by zero.
    const evalNanos = reply.eval_duration || 1;
    return {
        text: stripThinking((reply.message?.content || '').trim()),
        tokens,
        durationMs,
        tokPerSec: tokens / (evalNanos / 1e9),
    };
}

/**
 * Fetch the names of the models the backend offers.
 *
 * Uses the hub's /api/v1/ollama/tags endpoint when HUB_URL is set, otherwise
 * Ollama's /api/tags.
 *
 * @returns {Promise<string[]>} The `name` of every entry in the response's `models` list (empty if absent).
 * @throws {Error} When the backend answers with a non-OK HTTP status.
 */
async function ollamaListModels() {
    let tagsUrl;
    if (HUB_URL) {
        tagsUrl = `${HUB_URL}/api/v1/ollama/tags`;
    } else {
        tagsUrl = `${OLLAMA_URL}/api/tags`;
    }

    const resp = await fetch(tagsUrl);
    if (!resp.ok) {
        throw new Error(`Tags: HTTP ${resp.status}`);
    }

    const payload = await resp.json();
    const names = [];
    for (const entry of payload.models || []) {
        names.push(entry.name);
    }
    return names;
}

// === Prompts (copied from index.astro) ===
// System prompt for pipeline step 1: turns the short scenario description
// into a structured requirements brief (name, goal, features, data model,
// endpoints, constraints).
const CLIENT_SYSTEM = `You are a product owner who turns vague ideas into clear, actionable software requirements.

GIVEN a short project description from the user, produce a structured brief:

1. PROJECT NAME: a short, descriptive name
2. GOAL: one sentence explaining what the software does and who it's for
3. CORE FEATURES: numbered list of 3-8 concrete features (not vague wishes)
4. DATA MODEL: list the main entities and their key fields (include field types)
5. API ENDPOINTS: list the REST endpoints (method + path + purpose)
6. CONSTRAINTS: any technical constraints (e.g. "must use SQLite", "no auth needed")

RULES:
- Be specific: "User can filter todos by status" not "todo management"
- Use plain English, no code
- Maximum 400 words total`;

// System prompt for pipeline step 2: asks the model for a JSON data-model
// spec (entities, fields, relationships). The reply is parsed with
// extractJson(), and runPipeline only requires a non-empty `entities` array.
const SPEC_SYSTEM = `You are a software architect who designs database schemas for Python web applications.

THINK STEP BY STEP before outputting JSON:
1. What are the main ENTITIES (nouns) in this project?
2. What FIELDS does each entity need? (name, type, required?)
3. Which entities REFERENCE each other? (e.g. "a Book belongs to an Author" → Book has author_id)
4. Are there Date/DateTime fields? → add extra_imports

Then output ONLY valid JSON (no explanations before or after).

SCHEMA:
{"project_name":"short-name","description":"One sentence","entities":[{"name":"EntityName","table_name":"entity_names","fields":[{"name":"field_name","sa_type":"String(255)","py_type":"str","nullable":false,"default":null}]}],"relationships":[{"from":"ChildEntity","field":"parent_id","to":"ParentEntity","type":"many-to-one"}],"extra_imports":[]}

FIELD RULES:
- sa_type: String(N), Text, Integer, Date, DateTime, Boolean, Float
- py_type: str, int, float, bool, date, datetime — append " | None" if nullable
- Status fields: use String(20) with default value, NEVER Enum
- Every entity gets "id" automatically — do NOT add id or redundant ID fields
- Use snake_case for field names

RELATIONSHIP RULES:
- If entity A "belongs to" entity B → A has b_id field (Integer, nullable=false) + relationship entry
- EVERY _id field MUST have a matching relationship entry
- Parent entities must appear BEFORE children in the entities array
- If no relationships, set "relationships": []

AVOID: redundant ID fields, generic names, more than 7 fields or 3 entities, non-English entity/field names (ALWAYS English even if description is Finnish)

EXAMPLES (adapt, don't copy):
Todo app → Todo: title(str), description(Text|None), due_date(Date|None), status(String20="pending")
Blog → Author: name,email,bio(Text|None) / Post: title, content(Text), author_id→Author, published_at(DateTime|None), status(String20="draft")`;

// System prompt for the fix loop in runPipeline: one Python file plus its
// validator issues goes in, the corrected file is expected back as raw code.
const FIX_SYSTEM = 'You are a Python code fixer. Return ONLY the corrected Python file. No markdown fences, no explanations — just valid Python code.';

// === Golden example ===
const GOLDEN_DIR = join(__dirname, 'golden-examples', 'todo');
const GOLDEN_FILES = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];

/**
 * Build the reference-implementation text that is prepended to the code prompt.
 *
 * @returns {string} A header followed by one `=== name ===` section per golden
 *   file that exists on disk, or '' when the golden directory is missing.
 */
function loadGoldenExample() {
    if (!existsSync(GOLDEN_DIR)) {
        return '';
    }
    const header = '\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n';
    const sections = GOLDEN_FILES.flatMap((fileName) => {
        const fullPath = join(GOLDEN_DIR, fileName);
        // Files missing from the golden directory are simply left out.
        if (!existsSync(fullPath)) return [];
        const body = readFileSync(fullPath, 'utf-8').trim();
        return [`=== ${fileName} ===\n${body}\n\n`];
    });
    return header + sections.join('');
}
const GOLDEN_EXAMPLE = loadGoldenExample();

// System prompt for pipeline step 3: asks for four Python files separated by
// `=== <name> ===` markers, the format parseGeneratedFiles() splits on.
const CODE_SYSTEM = `You are a Python backend developer. Generate a FastAPI project with SQLAlchemy and SQLite.

Given the project requirements, JSON specification, and a REFERENCE IMPLEMENTATION, generate these 4 files:

1. models.py — SQLAlchemy 2.0: DeclarativeBase, Mapped, mapped_column (NOT legacy declarative_base)
2. schemas.py — Pydantic v2: ConfigDict(from_attributes=True) (NOT class Config)
3. main.py — FastAPI CRUD endpoints for each entity
4. test_main.py — Pytest with TestClient, separate test.db, unique test data per test

Do NOT generate pyproject.toml — it is created separately with uv.

OUTPUT FORMAT — use these exact markers to separate files:

=== models.py ===
<python code>

=== schemas.py ===
<python code>

=== main.py ===
<python code>

=== test_main.py ===
<python code>

DOCUMENTATION — every file must have a one-line module docstring. Classes get a one-line docstring. Keep it zensical: say what it IS, not what it does. No filler.

RULES:
- Follow the REFERENCE IMPLEMENTATION patterns exactly
- SQLAlchemy 2.0: DeclarativeBase + Mapped + mapped_column (not Column())
- Python type unions: str | None (not Optional[str])
- Tests: unique descriptive data per test, NOT generic "test_title" strings
- Absolute imports only (from models import ..., from schemas import ...)
- NO markdown fences inside file content — just raw code
- Only test endpoints that exist in main.py — no extra tests`;

// === File parser for the LLM reply ===
/**
 * Split an LLM reply into files using `=== <name>.py ===` / `=== <name>.toml ===` markers.
 *
 * - Any directory part in a marker name is dropped (`app/models.py` → `models.py`),
 *   so keys are always plain file names and can never point outside the
 *   project directory when they are later used as write paths.
 * - A markdown fence wrapping a section is removed whatever its language tag
 *   (```python, ```py, ``` …). Only a fence at the very start / very end of
 *   the section is touched; fence lines inside the code are left alone.
 * - Sections that end up empty are skipped; a later duplicate name overrides
 *   an earlier one.
 *
 * @param {string} text - Raw model output.
 * @returns {Object<string, string>} Map of file name → content (always ending in a single newline).
 */
function parseGeneratedFiles(text) {
    const files = {};
    const sections = text.split(/===\s*(\S+\.(?:py|toml))\s*===/);
    // sections: [preamble, filename1, content1, filename2, content2, ...]
    for (let i = 1; i < sections.length - 1; i += 2) {
        const name = sections[i].split(/[\\/]/).pop();
        const content = sections[i + 1]
            .trim()
            // Opening fence with an optional info string, only at the start.
            .replace(/^```[^\s`]*[ \t]*\r?\n?/, '')
            // Closing fence, only at the end.
            .replace(/\r?\n?```\s*$/, '')
            .trim();
        if (name && content) files[name] = content + '\n';
    }
    return files;
}

// === Validator ===

/**
 * Replace Python string literals with `""` and drop `#` comments, keeping
 * every newline so that line numbers stay aligned with the original code.
 * Lightweight scanner: string prefixes (f, r, b) are left in place as code.
 */
function maskPythonLiterals(code) {
    let out = '';
    let i = 0;
    while (i < code.length) {
        const ch = code[i];
        if (ch === '#') {
            // Comment: skip to (not past) the end of the line.
            while (i < code.length && code[i] !== '\n') i++;
            continue;
        }
        if (ch !== '"' && ch !== "'") {
            out += ch;
            i++;
            continue;
        }
        const quote = code.startsWith(ch.repeat(3), i) ? ch.repeat(3) : ch;
        i += quote.length;
        out += '""';
        while (i < code.length) {
            if (code.startsWith(quote, i)) {
                i += quote.length;
                break;
            }
            const c = code[i];
            if (c === '\\') {
                // Escaped character; an escaped newline still counts as a line.
                if (code[i + 1] === '\n') out += '\n';
                i += 2;
                continue;
            }
            if (c === '\n') {
                out += '\n';
                i++;
                // An unterminated single-line string ends at the line break.
                if (quote.length === 1) break;
                continue;
            }
            i++;
        }
    }
    return out;
}

/**
 * Names listed in the clause after `import`, with parentheses, line
 * continuations, trailing comments, `as` aliases and `*` removed.
 */
function importedNames(clause) {
    return clause
        .split('#')[0]
        .replace(/[()\\]/g, ' ')
        .split(',')
        .map((part) => part.trim().split(/\s+as\s+/)[0].trim())
        .filter((name) => /^[\p{L}_][\p{L}\p{N}_]*$/u.test(name));
}

/** True when `name` occurs in `source` as a whole identifier, not as part of a longer one. */
function mentionsIdentifier(source, name) {
    return new RegExp(`(?<![\\p{L}\\p{N}_])${name}(?![\\p{L}\\p{N}_])`, 'u').test(source);
}

/**
 * Run cheap static checks over the generated Python files.
 *
 * Checks, per `.py` file:
 *  - relative imports (`from .x import`, `from ..x import`, `from . import`);
 *  - names imported from models/schemas/main that the source file is missing
 *    or never mentions as a whole identifier;
 *  - `date` / `datetime` annotations in schemas.py without a `from datetime import`;
 *  - lowercase `true` / `false` used as code (string literals, docstrings and
 *    comments are ignored).
 *
 * @param {Object<string, string>} files - Map of file name → file content.
 * @returns {string[]} Issue messages, each starting with `ISSUE: <file>:`.
 */
function validateProjectCode(files) {
    const issues = [];
    for (const [fname, code] of Object.entries(files)) {
        if (!fname.endsWith('.py')) continue;
        const lines = code.split('\n');

        for (const line of lines) {
            if (/^from\s+\.[\w.]*\s+import/.test(line)) issues.push(`ISSUE: ${fname}: relatiivinen import`);
        }

        for (const line of lines) {
            const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
            if (!m) continue;
            const srcCode = files[`${m[1]}.py`];
            if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
            for (const name of importedNames(m[2])) {
                if (!mentionsIdentifier(srcCode, name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
            }
        }

        if (fname === 'schemas.py') {
            if (/:\s*date\b/.test(code) && !/from datetime import/.test(code))
                issues.push('ISSUE: schemas.py: date-import puuttuu');
            if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code))
                issues.push('ISSUE: schemas.py: datetime-import puuttuu');
        }

        // JSON-style booleans are only a problem in code, so look at the
        // source with string literals and comments masked out.
        const codeOnly = maskPythonLiterals(code).split('\n');
        for (let i = 0; i < codeOnly.length; i++) {
            const line = codeOnly[i];
            if (/(?<!\w)false(?!\w)/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "false" → "False"`);
            if (/(?<!\w)true(?!\w)/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "true" → "True"`);
        }
    }
    return issues;
}

/**
 * Return the first balanced `{...}` span in `text` that parses as JSON, or null.
 * Closing braces seen before any opening brace are ignored, so stray `}` in
 * surrounding prose cannot throw the depth counter off.
 */
function firstJsonObject(text) {
    let depth = 0;
    let start = -1;
    for (let i = 0; i < text.length; i++) {
        const ch = text[i];
        if (ch === '{') {
            if (depth === 0) start = i;
            depth++;
        } else if (ch === '}' && depth > 0) {
            depth--;
            if (depth === 0) {
                try {
                    return JSON.parse(text.slice(start, i + 1));
                } catch {
                    // Not valid JSON; keep scanning for the next top-level object.
                }
            }
        }
    }
    return null;
}

/**
 * Extract the first JSON object from an LLM reply.
 *
 * The content of the first markdown fence (``` or ```json) is tried first;
 * if it holds no parseable object the whole reply is scanned instead.
 *
 * @param {string} text - Raw model output.
 * @returns {Object|null} The parsed object, or null when none is found.
 */
function extractJson(text) {
    const fenced = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
    const candidates = fenced ? [fenced[1].trim(), text] : [text];
    for (const candidate of candidates) {
        const parsed = firstJsonObject(candidate);
        if (parsed !== null) return parsed;
    }
    return null;
}

// === Test scenarios ===
// Each scenario's `prompt` is the user message for pipeline step 1; `id` is
// used in the output directory name and in the results table. The first
// entry is the one run by default.
const SCENARIOS = [
    { id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
    { id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
    { id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
];

// === Pipeline: one model and one scenario ===

/**
 * Pull pass/fail counts out of pytest's console output.
 * Collection/setup errors ("N error" / "N errors") are counted as failures.
 */
function countPytestOutcomes(output) {
    const countOf = (label) => {
        const m = output.match(new RegExp(`(\\d+) ${label}`));
        return m ? Number.parseInt(m[1], 10) : 0;
    };
    return { passed: countOf('passed'), failed: countOf('failed') + countOf('error') };
}

/**
 * Run the full benchmark pipeline for one model and one scenario:
 * requirements → JSON spec → code generation → validation/fix loop → pytest.
 *
 * Never throws for pipeline failures: the failure reason is stored in
 * `result.error`. Timing and token totals are filled in for every outcome,
 * including runs that stop after an early stage.
 *
 * Intermediate artifacts (`_requirements.txt`, `_spec.json`, `_code_raw.txt`,
 * `_pytest.txt`, …) are written under `OUTPUT_DIR/<model>__<scenario>`.
 *
 * @param {string} model - Model name as reported by the backend.
 * @param {{id: string, prompt: string}} scenario - Scenario to generate.
 * @returns {Promise<Object>} The result record (see the `result` literal below).
 */
async function runPipeline(model, scenario) {
    const result = {
        model, scenario: scenario.id,
        reqOk: false, specOk: false, specEntities: 0,
        validationIssues: 0, fixRounds: 0,
        testsTotal: 0, testsPassed: 0, testsFailed: 0,
        totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
        error: null,
    };
    const timings = [];
    const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
    mkdirSync(dir, { recursive: true });

    try {
        // 1. Requirements
        console.log(`    [1/5] Vaatimukset...`);
        const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 1024);
        timings.push(req);
        if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
        result.reqOk = true;
        writeFileSync(`${dir}/_requirements.txt`, req.text);

        // 2. JSON spec
        console.log(`    [2/5] JSON-speksi...`);
        const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 2048);
        timings.push(specResp);
        const spec = extractJson(specResp.text);
        if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'JSON-speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; }
        result.specOk = true;
        result.specEntities = spec.entities.length;
        writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));

        // 3. LLM code generation
        // The prompt asks for 4 files, matching CODE_SYSTEM and the `required` list below.
        console.log(`    [3/5] Koodigenerointi (LLM)...`);
        const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all 4 files. Follow the reference implementation patterns exactly.`;
        const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, 8192);
        timings.push(codeResp);
        writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
        const files = parseGeneratedFiles(codeResp.text);
        const required = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];
        const missing = required.filter(f => !files[f]);
        if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; }

        // 4. Validation + fix loop
        let issues = validateProjectCode(files);
        let fixRound = 0;
        while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
            fixRound++;
            console.log(`    [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
            // Group issues by the file name that follows the "ISSUE:" prefix.
            const issuesByFile = {};
            for (const issue of issues) {
                const m = issue.match(/^ISSUE:\s*(\S+?):/);
                const fname = m ? m[1] : 'unknown';
                if (!issuesByFile[fname]) issuesByFile[fname] = [];
                issuesByFile[fname].push(issue);
            }
            for (const [fname, fIssues] of Object.entries(issuesByFile)) {
                if (!files[fname]) continue;
                const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
                const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
                timings.push(fixResp);
                if (fixResp.text) {
                    files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
                }
            }
            issues = validateProjectCode(files);
        }
        result.validationIssues = issues.length;
        result.fixRounds = fixRound;

        // 5. Project setup (uv init) + write files + pytest
        console.log(`    [5/5] Pytest...`);
        try {
            const uvPath = process.env.HOME + '/.local/bin/uv';
            const uv = existsSync(uvPath) ? `"${uvPath}"` : 'uv';
            // Commands run with `cwd` instead of `cd "<dir>" && …`, so the
            // directory name is never interpreted by the shell.
            execSync(`${uv} init --no-readme --python ">=3.14" 2>/dev/null`, { cwd: dir, timeout: 30000, stdio: 'pipe' });
            for (const scaffold of ['hello.py', 'main.py']) rmSync(join(dir, scaffold), { force: true });
            execSync(`${uv} add fastapi "uvicorn[standard]" sqlalchemy pytest httpx 2>/dev/null`, { cwd: dir, timeout: 60000, stdio: 'pipe' });

            // Write the LLM-generated Python files (after uv init). Names with
            // a path separator are skipped so nothing lands outside `dir`.
            for (const [fn, content] of Object.entries(files)) {
                if (fn.endsWith('.py') && !/[\\/]/.test(fn)) writeFileSync(join(dir, fn), content);
            }
            for (const db of ['app.db', 'test.db']) rmSync(join(dir, db), { force: true });
            const pytestOut = execSync(`${uv} run pytest test_main.py -v --tb=short 2>&1`, { cwd: dir, timeout: 60000, encoding: 'utf-8' });
            writeFileSync(`${dir}/_pytest.txt`, pytestOut);

            const counts = countPytestOutcomes(pytestOut);
            result.testsPassed = counts.passed;
            result.testsFailed = counts.failed;
            result.testsTotal = counts.passed + counts.failed;
        } catch (e) {
            // stdout/stderr are Buffers for the uv calls (no `encoding` set);
            // convert to text and take the first non-empty source.
            const output = [e.stdout, e.stderr, e.message]
                .map((part) => (part == null ? '' : String(part)))
                .find((part) => part.length > 0) ?? '';
            writeFileSync(`${dir}/_pytest.txt`, output);
            const counts = countPytestOutcomes(output);
            result.testsPassed = counts.passed;
            result.testsFailed = counts.failed;
            result.testsTotal = counts.passed + counts.failed;
            if (result.testsTotal === 0) result.error = 'Pytest kaatui';
        }
    } catch (e) {
        result.error = e.message;
    } finally {
        // Summary. In `finally` so early-exit runs still report the LLM time
        // and tokens they consumed.
        result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
        result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
        result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
    }

    return result;
}

// === Main ===

/**
 * Classify one pipeline result. Used for the per-run status line, the
 * results table and the final summary so that all three always agree.
 *
 * @returns {'PASS'|'PARTIAL'|'FAIL'} FAIL when the run errored or no tests
 *   ran, PASS when every test passed, PARTIAL otherwise.
 */
function verdictOf(r) {
    if (r.error || r.testsTotal === 0) return 'FAIL';
    return r.testsPassed === r.testsTotal ? 'PASS' : 'PARTIAL';
}

/**
 * Resolve the --scenarios value: 'all' → every scenario, 'default' → the
 * first one, otherwise a comma-separated list of scenario ids. A value that
 * matches no id falls back to the first scenario.
 */
function selectScenarios(filter) {
    if (filter === 'all') return SCENARIOS;
    const wanted = filter.split(',').map(s => s.trim()).filter(Boolean);
    const picked = SCENARIOS.filter(s => wanted.includes(s.id));
    return picked.length > 0 ? picked : [SCENARIOS[0]];
}

/**
 * Benchmark entry point: list models, run every model × scenario pipeline
 * sequentially, print a results table and write `results.json`.
 *
 * Side effects: deletes and recreates OUTPUT_DIR; exits the process with
 * status 1 when the model list cannot be fetched.
 */
async function main() {
    console.log('╔══════════════════════════════════════════════╗');
    console.log('║       Kipinä Model Benchmark                ║');
    console.log('╚══════════════════════════════════════════════╝');
    // Report the backend that is actually used.
    console.log(HUB_URL ? `Hub: ${HUB_URL}` : `Ollama: ${OLLAMA_URL}`);

    // Fetch the model list
    let models;
    try {
        models = await ollamaListModels();
    } catch (e) {
        const target = HUB_URL ? `Hubiin (${HUB_URL})` : `Ollamaan (${OLLAMA_URL})`;
        console.error(`Ei yhteyttä ${target}: ${e.message}`);
        process.exit(1);
    }

    if (FILTER_MODELS) {
        // Drop empty entries (e.g. a trailing comma): '' is a substring of
        // every model name and would disable the filter.
        const filter = FILTER_MODELS.split(',').map(s => s.trim()).filter(Boolean);
        models = models.filter(m => filter.some(f => m.includes(f)));
    }

    console.log(`Mallit (${models.length}): ${models.join(', ')}`);

    const scenarios = selectScenarios(SCENARIO_FILTER);
    console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
    console.log(`Tulokset: ${OUTPUT_DIR}/`);
    console.log('');

    // Clean the output directory
    rmSync(OUTPUT_DIR, { recursive: true, force: true });
    mkdirSync(OUTPUT_DIR, { recursive: true });

    const results = [];

    for (const model of models) {
        for (const scenario of scenarios) {
            console.log(`\n━━━ ${model} × ${scenario.id} ━━━`);
            const r = await runPipeline(model, scenario);
            results.push(r);

            const verdict = verdictOf(r);
            const ratio = `${r.testsPassed}/${r.testsTotal}`;
            const status = verdict === 'PASS' ? `✓ ${ratio}` :
                verdict === 'PARTIAL' ? `◐ ${ratio}` :
                `✗ ${r.error || ratio}`;
            console.log(`    → ${status} | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s`);
        }
    }

    // === Results table ===
    // Every border is derived from the same column widths so the box stays
    // aligned; the model column grows to fit the longest model name.
    const verdictLabels = { PASS: '✓ PASS', PARTIAL: '◐ PARTIAL', FAIL: '✗ FAIL' };
    const modelWidth = Math.max(40, ...results.map(r => r.model.length));
    const columns = [
        ['Malli', modelWidth],
        ['Skenaario', 10],
        ['Speksi', 8],
        ['Testit', 10],
        ['Korjaus', 8],
        ['Aika', 8],
        ['tok/s', 8],
        ['Tulos', 9],
    ];
    const formatRow = (cells) => cells.map((cell, i) => cell.padEnd(columns[i][1])).join(' │ ');
    const header = formatRow(columns.map(([title]) => title));
    const innerWidth = header.length + 2;
    const rule = '═'.repeat(innerWidth);
    const title = 'TULOKSET';
    const titlePad = Math.floor((innerWidth - title.length) / 2);

    console.log(`\n\n╔${rule}╗`);
    console.log(`║${' '.repeat(titlePad)}${title}${' '.repeat(innerWidth - title.length - titlePad)}║`);
    console.log(`╠${rule}╣`);
    console.log(`║ ${header} ║`);
    console.log(`╠${rule}╣`);

    for (const r of results) {
        const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗';
        const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
        const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
        const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
        const speed = `${r.avgTokPerSec.toFixed(0)}`;

        const row = formatRow([
            r.model,
            r.scenario,
            specStatus,
            testStatus,
            fixStatus,
            time,
            speed,
            verdictLabels[verdictOf(r)],
        ]);
        console.log(`║ ${row} ║`);
    }
    console.log(`╚${rule}╝`);

    // Save JSON
    writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
    console.log(`\nJSON: ${OUTPUT_DIR}/results.json`);

    // Summary
    const tally = { PASS: 0, PARTIAL: 0, FAIL: 0 };
    for (const r of results) tally[verdictOf(r)]++;
    console.log(`\n✓ PASS: ${tally.PASS} | ◐ PARTIAL: ${tally.PARTIAL} | ✗ FAIL: ${tally.FAIL} | Yhteensä: ${results.length}`);
}

// Entry point: run the benchmark; a rejected main() is logged and the process exits with status 1.
main().catch(e => { console.error(e); process.exit(1); });