/**
 * Kipinä Model Benchmark
 *
 * Generates projects with different Ollama models and tests whether they work.
 *
 * For every model × scenario pair the benchmark runs the full pipeline:
 *   1. Client prompt → requirements
 *   2. Manager / SPEC_SYSTEM → JSON spec
 *   3. Template generation → code
 *   4. Validation + LLM fix loop
 *   5. uv sync + pytest
 * and produces a comparison table: spec quality, test results, speed.
 *
 * Usage:
 *   node model-benchmark.mjs                          # all models, default scenario
 *   node model-benchmark.mjs --models qwen3:8b,qwen3:30b
 *   node model-benchmark.mjs --ollama http://host:11434
 *   node model-benchmark.mjs --hub https://kipina.studio
 *   node model-benchmark.mjs --scenarios all          # all scenarios
 */

import { execSync } from 'child_process';
import { writeFileSync, mkdirSync, rmSync, existsSync } from 'fs';

// === CLI arguments ===
const args = process.argv.slice(2);

/**
 * Read the value of a command-line option.
 *
 * Accepts both `--name value` and `--name=value`.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} fallback - Returned when the option is absent or has no value.
 * @param {string[]} [argv=args] - Argument vector to search (defaults to the process arguments).
 * @returns {string} The option value, or `fallback`.
 */
function arg(name, fallback, argv = args) {
  const flag = `--${name}`;
  const i = argv.indexOf(flag);
  if (i >= 0) {
    const value = argv[i + 1];
    // The next token being another flag means this option was given without a value.
    if (value && !value.startsWith('--')) return value;
  }
  const inline = argv.find((a) => a.startsWith(`${flag}=`));
  if (inline && inline.length > flag.length + 1) return inline.slice(flag.length + 1);
  return fallback;
}

/**
 * Remove trailing slashes so that `${base}/api/...` never contains `//`.
 *
 * @param {string} url - Base URL as given by the user.
 * @returns {string} The URL without trailing slashes.
 */
function stripTrailingSlashes(url) {
  return url.replace(/\/+$/, '');
}

const OLLAMA_URL = stripTrailingSlashes(arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434'));
const HUB_URL = stripTrailingSlashes(arg('hub', '')); // Alternative: --hub https://kipina.studio
const FILTER_MODELS = arg('models', '');
const SCENARIO_FILTER = arg('scenarios', 'default');
const OUTPUT_DIR = arg('output', '/tmp/kipina-benchmark');
const MAX_FIX_ROUNDS = 2;

// === Ollama / Hub client ===

/**
 * Send one prompt to a model and return the reply with timing statistics.
 *
 * Uses the hub route when HUB_URL is set, otherwise talks to Ollama directly.
 *
 * @param {string} model - Model name.
 * @param {string} prompt - User prompt.
 * @param {string} [systemPrompt] - Optional system prompt.
 * @param {number} [maxTokens=2048] - Generation limit passed to the backend.
 * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
 * @throws {Error} When the backend answers with a non-2xx status.
 */
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
  const start = Date.now();

  if (HUB_URL) {
    // Hub route: /api/v1/chat/completions
    const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
    });
    if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
    const data = await resp.json();
    const elapsed = Date.now() - start;
    const tokens = data.tokens_generated || 0;
    return {
      text: (data.response || '').trim(),
      tokens,
      durationMs: elapsed,
      // Fall back to wall-clock throughput; guard the division so a 0 ms
      // round trip yields 0 instead of NaN/Infinity.
      tokPerSec: data.tokens_per_sec || (elapsed > 0 ? tokens / (elapsed / 1000) : 0),
    };
  }

  // Direct Ollama route: /api/chat
  const messages = [];
  if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
  messages.push({ role: 'user', content: prompt });

  const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      messages,
      stream: false,
      options: { num_predict: maxTokens, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
    }),
  });
  if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
  const data = await resp.json();
  const elapsed = Date.now() - start;
  const text = (data.message?.content || '').trim();
  const evalCount = data.eval_count || 0;
  // Default of 1 keeps the division below finite when the field is missing.
  const evalDurationNs = data.eval_duration || 1;
  const tokPerSec = evalCount / (evalDurationNs / 1e9);
  return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
}

/**
 * List the model names offered by the configured backend.
 *
 * @returns {Promise<string[]>} Model names (empty when the response has no `models`).
 * @throws {Error} When the backend answers with a non-2xx status.
 */
async function ollamaListModels() {
  const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
  const data = await resp.json();
  return (data.models || []).map((m) => m.name);
}

// === Prompts (copied from index.astro) ===
const CLIENT_SYSTEM = `You are a product owner who turns vague ideas into clear, actionable software requirements.

GIVEN a short project description from the user, produce a structured brief:

1. PROJECT NAME: a short, descriptive name
2. GOAL: one sentence explaining what the software does and who it's for
3. CORE FEATURES: numbered list of 3-8 concrete features (not vague wishes)
4. DATA MODEL: list the main entities and their key fields (include field types)
5. API ENDPOINTS: list the REST endpoints (method + path + purpose)
6. CONSTRAINTS: any technical constraints (e.g. "must use SQLite", "no auth needed")

RULES:
- Be specific: "User can filter todos by status" not "todo management"
- Use plain English, no code
- Maximum 400 words total`;

const SPEC_SYSTEM = `You are a software architect who designs database schemas for Python web applications.

THINK STEP BY STEP before outputting JSON:
1. What are the main ENTITIES (nouns) in this project?
2. What FIELDS does each entity need? (name, type, required?)
3. Which entities REFERENCE each other? (e.g. "a Book belongs to an Author" → Book has author_id)
4. Are there Date/DateTime fields? → add extra_imports

Then output ONLY valid JSON (no explanations before or after).

SCHEMA:
{"project_name":"short-name","description":"One sentence","entities":[{"name":"EntityName","table_name":"entity_names","fields":[{"name":"field_name","sa_type":"String(255)","py_type":"str","nullable":false,"default":null}]}],"relationships":[{"from":"ChildEntity","field":"parent_id","to":"ParentEntity","type":"many-to-one"}],"extra_imports":[]}

FIELD RULES:
- sa_type: String(N), Text, Integer, Date, DateTime, Boolean, Float
- py_type: str, int, float, bool, date, datetime — append " | None" if nullable
- Status fields: use String(20) with default value, NEVER Enum
- Every entity gets "id" automatically — do NOT add id or redundant ID fields
- Use snake_case for field names

RELATIONSHIP RULES:
- If entity A "belongs to" entity B → A has b_id field (Integer, nullable=false) + relationship entry
- EVERY _id field MUST have a matching relationship entry
- Parent entities must appear BEFORE children in the entities array
- If no relationships, set "relationships": []

AVOID: redundant ID fields, generic names, more than 7 fields or 3 entities, non-English entity/field names (ALWAYS English even if description is Finnish)

EXAMPLES (adapt, don't copy):
Todo app → Todo: title(str), description(Text|None), due_date(Date|None), status(String20="pending")
Blog → Author: name,email,bio(Text|None) / Post: title, content(Text), author_id→Author, published_at(DateTime|None), status(String20="draft")`;

const FIX_SYSTEM = 'You are a Python code fixer. Return ONLY the corrected Python file. No markdown fences, no explanations — just valid Python code.';
// === Template functions (copied from the fixed index.astro) ===

/**
 * Render a JavaScript value as a Python literal.
 *
 * Strings go through JSON.stringify: the escapes it produces (\" \\ \n \uXXXX)
 * are also valid in a Python string literal, so quotes, backslashes and
 * newlines in a value can no longer break the generated source.
 *
 * @param {*} val - Boolean, null/undefined, string or number.
 * @returns {string} Python source text for the value.
 */
function pyLiteral(val) {
  if (val === true) return 'True';
  if (val === false) return 'False';
  if (val === null || val === undefined) return 'None';
  if (typeof val === 'string') return JSON.stringify(val);
  return String(val);
}

/**
 * Render a flat object as a compact Python dict literal.
 *
 * @param {Object} obj - Object whose values are accepted by pyLiteral.
 * @returns {string} e.g. `{"title":"x","done":False}`.
 */
function pyJsonLiteral(obj) {
  const parts = Object.entries(obj).map(([k, v]) => `${JSON.stringify(k)}:${pyLiteral(v)}`);
  return `{${parts.join(',')}}`;
}

/**
 * Generate models.py (SQLAlchemy models) from a spec.
 *
 * Every entity gets an integer primary key `id`; a field listed in
 * `spec.relationships` whose target entity exists becomes a ForeignKey to the
 * target's table.
 *
 * @param {Object} spec - Parsed JSON spec with `entities` and optional `relationships`.
 * @returns {string} Python source.
 * @throws {Error} When a field's `sa_type` does not start with a type name.
 */
function tmplModels(spec) {
  const saTypes = new Set(['Integer']);
  for (const e of spec.entities) {
    for (const f of e.fields) {
      // Only the leading identifier is imported: "String(255)" -> "String".
      const base = /^(\w+)/.exec(String(f.sa_type ?? ''));
      if (!base) {
        throw new Error(`Invalid sa_type ${JSON.stringify(f.sa_type)} for ${e.name}.${f.name}`);
      }
      saTypes.add(base[1]);
    }
  }

  // "Entity.field" -> table name of the referenced entity
  const relMap = new Map();
  for (const r of spec.relationships || []) {
    const target = spec.entities.find((e) => e.name === r.to);
    if (target) relMap.set(`${r.from}.${r.field}`, target.table_name);
  }
  if (relMap.size > 0) saTypes.add('ForeignKey');

  const imports = [...saTypes].sort().join(', ');
  let code = [
    `from sqlalchemy import create_engine, Column, ${imports}`,
    'from sqlalchemy.orm import declarative_base, sessionmaker',
    '',
    'DATABASE_URL = "sqlite:///./app.db"',
    'engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})',
    'SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)',
    'Base = declarative_base()',
    '',
    '',
  ].join('\n');

  for (const e of spec.entities) {
    code += `class ${e.name}(Base):\n`;
    code += `    __tablename__ = "${e.table_name}"\n`;
    code += '    id = Column(Integer, primary_key=True, index=True)\n';
    for (const f of e.fields) {
      const fkTarget = relMap.get(`${e.name}.${f.name}`);
      const parts = [fkTarget ? `Column(${f.sa_type}, ForeignKey("${fkTarget}.id")` : `Column(${f.sa_type}`];
      if (!f.nullable) parts.push('nullable=False');
      if (f.default !== null && f.default !== undefined) parts.push(`default=${pyLiteral(f.default)}`);
      code += `    ${f.name} = ${parts.join(', ')})\n`;
    }
    code += '\n';
  }
  code += 'Base.metadata.create_all(bind=engine)\n';
  return code;
}

/**
 * Generate schemas.py (Pydantic Create/Response models) from a spec.
 *
 * @param {Object} spec - Parsed JSON spec with `entities` and optional `extra_imports`.
 * @returns {string} Python source.
 */
function tmplSchemas(spec) {
  const dtTypes = new Set();
  for (const e of spec.entities) {
    for (const f of e.fields) {
      // \bdate\b never matches inside "datetime", so both checks are independent.
      if (/\bdate\b/i.test(f.py_type)) dtTypes.add('date');
      if (/\bdatetime\b/i.test(f.py_type)) dtTypes.add('datetime');
    }
  }

  let code = 'from pydantic import BaseModel, ConfigDict\n';
  if (dtTypes.size > 0) code += `from datetime import ${[...dtTypes].sort().join(', ')}\n`;
  for (const imp of spec.extra_imports || []) {
    if (typeof imp !== 'string') continue; // the spec comes from an LLM; ignore junk entries
    if (/^(date|datetime)$/.test(imp.trim())) continue; // already covered above
    if (/^from\s/.test(imp) || /^import\s/.test(imp)) code += `${imp}\n`;
  }
  code += '\n';

  for (const e of spec.entities) {
    code += `class ${e.name}Create(BaseModel):\n`;
    // A class with no statements is a Python syntax error.
    if (e.fields.length === 0) code += '    pass\n';
    for (const f of e.fields) {
      if (f.default !== null && f.default !== undefined) {
        code += `    ${f.name}: ${f.py_type} = ${pyLiteral(f.default)}\n`;
      } else if (f.nullable && f.py_type.includes('None')) {
        code += `    ${f.name}: ${f.py_type} = None\n`;
      } else {
        code += `    ${f.name}: ${f.py_type}\n`;
      }
    }
    code += `\nclass ${e.name}Response(${e.name}Create):\n`;
    code += '    id: int\n';
    code += '    model_config = ConfigDict(from_attributes=True)\n\n';
  }
  return code;
}

/**
 * Generate main.py: a FastAPI app with create/list/get/update/delete routes
 * for every entity in the spec.
 *
 * @param {Object} spec - Parsed JSON spec with `entities`.
 * @returns {string} Python source.
 */
function tmplMain(spec) {
  const modelNames = spec.entities.map((e) => e.name).join(', ');
  const createNames = spec.entities.map((e) => `${e.name}Create`).join(', ');
  const responseNames = spec.entities.map((e) => `${e.name}Response`).join(', ');

  let code = [
    'from fastapi import FastAPI, Depends, HTTPException',
    'from sqlalchemy.orm import Session',
    `from models import Base, engine, SessionLocal, ${modelNames}`,
    `from schemas import ${createNames}, ${responseNames}`,
    '',
    'app = FastAPI()',
    '',
    'def get_db():',
    '    db = SessionLocal()',
    '    try:',
    '        yield db',
    '    finally:',
    '        db.close()',
    '',
    '',
  ].join('\n');

  for (const e of spec.entities) {
    const lo = e.name.toLowerCase();
    const tb = e.table_name;
    const notFound = `        raise HTTPException(status_code=404, detail="${e.name} not found")`;

    code += [
      `@app.post("/${tb}/", response_model=${e.name}Response, status_code=201)`,
      `def create_${lo}(item: ${e.name}Create, db: Session = Depends(get_db)):`,
      `    db_item = ${e.name}(**item.model_dump())`,
      '    db.add(db_item)',
      '    db.commit()',
      '    db.refresh(db_item)',
      '    return db_item',
      '',
      '',
    ].join('\n');

    code += [
      `@app.get("/${tb}/", response_model=list[${e.name}Response])`,
      `def list_${lo}s(db: Session = Depends(get_db)):`,
      `    return db.query(${e.name}).all()`,
      '',
      '',
    ].join('\n');

    code += [
      `@app.get("/${tb}/{item_id}", response_model=${e.name}Response)`,
      `def get_${lo}(item_id: int, db: Session = Depends(get_db)):`,
      `    item = db.query(${e.name}).filter(${e.name}.id == item_id).first()`,
      '    if not item:',
      notFound,
      '    return item',
      '',
      '',
    ].join('\n');

    code += [
      `@app.put("/${tb}/{item_id}", response_model=${e.name}Response)`,
      `def update_${lo}(item_id: int, item: ${e.name}Create, db: Session = Depends(get_db)):`,
      `    db_item = db.query(${e.name}).filter(${e.name}.id == item_id).first()`,
      '    if not db_item:',
      notFound,
      '    for key, value in item.model_dump().items():',
      '        setattr(db_item, key, value)',
      '    db.commit()',
      '    db.refresh(db_item)',
      '    return db_item',
      '',
      '',
    ].join('\n');

    code += [
      `@app.delete("/${tb}/{item_id}", status_code=204)`,
      `def delete_${lo}(item_id: int, db: Session = Depends(get_db)):`,
      `    db_item = db.query(${e.name}).filter(${e.name}.id == item_id).first()`,
      '    if not db_item:',
      notFound,
      '    db.delete(db_item)',
      '    db.commit()',
      '',
      '',
    ].join('\n');
  }
  return code;
}

/**
 * Generate test_main.py: six pytest tests per entity that exercise the routes
 * produced by tmplMain against a separate SQLite test database.
 *
 * Sample payload values are derived from each field's default or `py_type`.
 *
 * @param {Object} spec - Parsed JSON spec with `entities`.
 * @returns {string} Python source.
 */
function tmplTests(spec) {
  let code = [
    'from fastapi.testclient import TestClient',
    'from sqlalchemy import create_engine',
    'from sqlalchemy.orm import sessionmaker',
    'from main import app, get_db',
    'from models import Base',
    '',
    'TEST_DB = "sqlite:///./test.db"',
    'test_engine = create_engine(TEST_DB, connect_args={"check_same_thread": False})',
    'TestSession = sessionmaker(autocommit=False, autoflush=False, bind=test_engine)',
    'Base.metadata.create_all(bind=test_engine)',
    '',
    'def override_get_db():',
    '    db = TestSession()',
    '    try:',
    '        yield db',
    '    finally:',
    '        db.close()',
    '',
    'app.dependency_overrides[get_db] = override_get_db',
    'client = TestClient(app)',
    '',
    '',
  ].join('\n');

  for (const e of spec.entities) {
    const lo = e.name.toLowerCase();
    const tb = e.table_name;

    const testData = {};
    for (const f of e.fields) {
      if (f.default !== null && f.default !== undefined) {
        testData[f.name] = f.default;
        continue;
      }
      if (f.py_type.includes('str')) testData[f.name] = `Test ${f.name}`;
      else if (f.py_type.includes('int')) testData[f.name] = 1;
      else if (f.py_type.includes('float')) testData[f.name] = 1.0;
      else if (f.py_type.includes('bool')) testData[f.name] = true;
      else if (f.py_type.includes('date')) testData[f.name] = '2024-01-15';
    }
    const td = pyJsonLiteral(testData);

    // The update payload changes the first free-text field (never `status`).
    const firstStr = e.fields.find((f) => f.py_type.includes('str') && f.name !== 'status');
    const updateData = { ...testData };
    if (firstStr) updateData[firstStr.name] = `Updated ${firstStr.name}`;
    const ud = pyJsonLiteral(updateData);

    code += [
      `def test_create_${lo}():`,
      `    response = client.post('/${tb}/', json=${td})`,
      '    assert response.status_code == 201',
      "    assert 'id' in response.json()",
      '',
      '',
    ].join('\n');

    code += [
      `def test_list_${lo}s():`,
      `    client.post('/${tb}/', json=${td})`,
      `    response = client.get('/${tb}/')`,
      '    assert response.status_code == 200',
      '    assert len(response.json()) >= 1',
      '',
      '',
    ].join('\n');

    code += [
      `def test_get_${lo}_by_id():`,
      `    created = client.post('/${tb}/', json=${td}).json()`,
      "    item_id = created['id']",
      `    response = client.get(f'/${tb}/{item_id}')`,
      '    assert response.status_code == 200',
      "    assert response.json()['id'] == item_id",
      '',
      '',
    ].join('\n');

    code += [
      `def test_get_${lo}_not_found():`,
      `    response = client.get('/${tb}/99999')`,
      '    assert response.status_code == 404',
      '',
      '',
    ].join('\n');

    code += [
      `def test_update_${lo}():`,
      `    created = client.post('/${tb}/', json=${td}).json()`,
      "    item_id = created['id']",
      `    response = client.put(f'/${tb}/{item_id}', json=${ud})`,
      '    assert response.status_code == 200',
      '',
      '',
    ].join('\n');

    code += [
      `def test_delete_${lo}():`,
      `    created = client.post('/${tb}/', json=${td}).json()`,
      "    item_id = created['id']",
      `    response = client.delete(f'/${tb}/{item_id}')`,
      '    assert response.status_code == 204',
      `    response = client.get(f'/${tb}/{item_id}')`,
      '    assert response.status_code == 404',
      '',
      '',
    ].join('\n');
  }
  return code;
}

/**
 * Generate pyproject.toml for the generated project.
 *
 * The project name is reduced to lowercase ASCII letters, digits, `.`, `_`
 * and `-` so that quotes or punctuation in the spec cannot break the TOML
 * string; an empty result falls back to "app".
 *
 * @param {Object} spec - Parsed JSON spec; `project_name` is optional.
 * @returns {string} TOML source.
 */
function tmplPyproject(spec) {
  const name =
    String(spec.project_name || 'app')
      .toLowerCase()
      .replace(/[^a-z0-9._-]+/g, '-')
      .replace(/^[^a-z0-9]+|[^a-z0-9]+$/g, '') || 'app';
  return [
    '[project]',
    `name = "${name}"`,
    'version = "0.1.0"',
    'requires-python = ">=3.11"',
    'dependencies = [',
    '    "fastapi",',
    '    "uvicorn[standard]",',
    '    "sqlalchemy",',
    '    "pytest",',
    '    "httpx",',
    ']',
    '',
  ].join('\n');
}
|| /^\s*$/.test(line)) continue; + if (/(? 0 && fixRound < MAX_FIX_ROUNDS) { + fixRound++; + console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`); + const issuesByFile = {}; + for (const issue of issues) { + const m = issue.match(/^ISSUE:\s*(\S+?):/); + const fname = m ? m[1] : 'unknown'; + if (!issuesByFile[fname]) issuesByFile[fname] = []; + issuesByFile[fname].push(issue); + } + for (const [fname, fIssues] of Object.entries(issuesByFile)) { + if (!files[fname]) continue; + const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``; + const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048); + timings.push(fixResp); + if (fixResp.text) { + files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n'; + } + } + issues = validateProjectCode(files); + } + result.validationIssues = issues.length; + result.fixRounds = fixRound; + + // Kirjoita tiedostot levylle + for (const [fn, content] of Object.entries(files)) writeFileSync(`${dir}/${fn}`, content); + + // 5. Pytest + console.log(` [5/5] Pytest...`); + try { + const uvPath = process.env.HOME + '/.local/bin/uv'; + const uv = existsSync(uvPath) ? uvPath : 'uv'; + execSync(`cd "${dir}" && ${uv} sync 2>/dev/null`, { timeout: 60000, stdio: 'pipe' }); + execSync(`cd "${dir}" && rm -f app.db test.db`, { stdio: 'pipe' }); + const pytestOut = execSync(`cd "${dir}" && ${uv} run pytest test_main.py -v --tb=short 2>&1`, { timeout: 60000, encoding: 'utf-8' }); + writeFileSync(`${dir}/_pytest.txt`, pytestOut); + + const passedMatch = pytestOut.match(/(\d+) passed/); + const failedMatch = pytestOut.match(/(\d+) failed/); + result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0; + result.testsFailed = failedMatch ? 
// === Main ===

const VERDICT_LABELS = { pass: '✓ PASS', partial: '◐ PARTIAL', fail: '✗ FAIL' };

/**
 * Classify one pipeline result.
 *
 * A run is a failure when it recorded an error or ran no tests at all, a pass
 * when every test passed, and partial otherwise. The progress line, the table
 * and the summary all use this single rule so they cannot disagree.
 *
 * @param {Object} r - Result object returned by runPipeline.
 * @returns {'pass'|'partial'|'fail'}
 */
function verdictOf(r) {
  if (r.error || !(r.testsTotal > 0)) return 'fail';
  return r.testsPassed === r.testsTotal ? 'pass' : 'partial';
}

/**
 * Build one box row with the text centered between the vertical borders.
 *
 * @param {string} text - Text to center.
 * @param {number} width - Inner width of the box in characters.
 * @returns {string}
 */
function boxRow(text, width) {
  const left = Math.max(0, Math.floor((width - text.length) / 2));
  const right = Math.max(0, width - left - text.length);
  return `║${' '.repeat(left)}${text}${' '.repeat(right)}║`;
}

/**
 * Entry point: list the models, run every model × scenario combination
 * sequentially, print the result table and write results.json.
 *
 * Exits the process with status 1 when the backend cannot be reached or no
 * model is left after filtering.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const bannerWidth = 46;
  console.log(`╔${'═'.repeat(bannerWidth)}╗`);
  console.log(boxRow('Kipinä Model Benchmark', bannerWidth));
  console.log(`╚${'═'.repeat(bannerWidth)}╝`);
  // Report the backend that is actually used, not always the Ollama URL.
  const backendUrl = HUB_URL || OLLAMA_URL;
  console.log(`${HUB_URL ? 'Hub' : 'Ollama'}: ${backendUrl}`);

  // Fetch the model list
  let models;
  try {
    models = await ollamaListModels();
  } catch (e) {
    console.error(`Ei yhteyttä Ollamaan (${backendUrl}): ${e.message}`);
    process.exit(1);
  }

  if (FILTER_MODELS) {
    // Drop empty entries: '' is a substring of every name and would match all models.
    const filter = FILTER_MODELS.split(',').map((s) => s.trim()).filter(Boolean);
    models = models.filter((m) => filter.some((f) => m.includes(f)));
  }

  console.log(`Mallit (${models.length}): ${models.join(', ')}`);

  if (models.length === 0) {
    // Stop before the output directory (and any earlier results) is wiped.
    console.error('Ei testattavia malleja.');
    process.exit(1);
  }

  const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS : [SCENARIOS[0]];
  console.log(`Skenaariot (${scenarios.length}): ${scenarios.map((s) => s.id).join(', ')}`);
  console.log(`Tulokset: ${OUTPUT_DIR}/`);
  console.log('');

  // Clean the output directory
  rmSync(OUTPUT_DIR, { recursive: true, force: true });
  mkdirSync(OUTPUT_DIR, { recursive: true });

  const results = [];

  for (const model of models) {
    for (const scenario of scenarios) {
      console.log(`\n━━━ ${model} × ${scenario.id} ━━━`);
      const r = await runPipeline(model, scenario);
      results.push(r);

      const verdict = verdictOf(r);
      const counts = `${r.testsPassed}/${r.testsTotal}`;
      let status;
      if (verdict === 'pass') status = `✓ ${counts}`;
      else if (verdict === 'partial') status = `◐ ${counts}`;
      else status = `✗ ${r.error || counts}`;
      console.log(`  → ${status} | ${(r.totalDurationMs / 1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s`);
    }
  }

  // === Result table ===
  // [title, width]; the last column is padded to the longest verdict label so
  // the right border stays aligned.
  const verdictWidth = Math.max(...Object.values(VERDICT_LABELS).map((l) => l.length));
  const columns = [
    ['Malli', 40],
    ['Skenaario', 10],
    ['Speksi', 8],
    ['Testit', 10],
    ['Korjaus', 8],
    ['Aika', 8],
    ['tok/s', 8],
    ['Tulos', verdictWidth],
  ];
  const formatRow = (cells) => cells.map((cell, i) => String(cell).padEnd(columns[i][1])).join(' │ ');

  const header = formatRow(columns.map(([title]) => title));
  const width = header.length + 2;

  console.log(`\n\n╔${'═'.repeat(width)}╗`);
  console.log(boxRow('TULOKSET', width));
  console.log(`╠${'═'.repeat(width)}╣`);
  console.log(`║ ${header} ║`);
  console.log(`╠${'═'.repeat(width)}╣`);

  for (const r of results) {
    const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗';
    const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
    const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
    const time = `${(r.totalDurationMs / 1000).toFixed(0)}s`;
    const speed = `${r.avgTokPerSec.toFixed(0)}`;

    const row = formatRow([
      r.model,
      r.scenario,
      specStatus,
      testStatus,
      fixStatus,
      time,
      speed,
      VERDICT_LABELS[verdictOf(r)],
    ]);
    console.log(`║ ${row} ║`);
  }
  console.log(`╚${'═'.repeat(width)}╝`);

  // Save JSON
  writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
  console.log(`\nJSON: ${OUTPUT_DIR}/results.json`);

  // Summary
  const tally = { pass: 0, partial: 0, fail: 0 };
  for (const r of results) tally[verdictOf(r)] += 1;
  console.log(`\n✓ PASS: ${tally.pass} | ◐ PARTIAL: ${tally.partial} | ✗ FAIL: ${tally.fail} | Yhteensä: ${results.length}`);
}

main().catch((e) => {
  console.error(e);
  process.exit(1);
});