1 Commits

Author SHA1 Message Date
20cea8f268 Model benchmark: testaa kaikki Ollama-mallit järjestelmällisesti
Ajaa täyden pipeline-kierroksen per malli × skenaario:
1. Client-prompti → vaatimukset
2. Manager/SPEC_SYSTEM → JSON-speksi
3. Template-generointi → koodi
4. Validointi + LLM-korjaussilmukka
5. uv sync + pytest

Tuottaa vertailutaulukon: speksin laatu, testien tulos, nopeus.
Tukee suoraa Ollamaa (--ollama) ja hub-reittiä (--hub).
2026-04-13 22:08:47 +03:00

View File

@@ -0,0 +1,513 @@
#!/usr/bin/env node
/**
* Kipinä Model Benchmark
*
* Generoi projekteja eri Ollama-malleilla ja testaa niiden toimivuus.
* Käyttö:
* node model-benchmark.mjs # kaikki mallit, oletusskenaario
* node model-benchmark.mjs --models qwen3:8b,qwen3:30b
* node model-benchmark.mjs --ollama http://host:11434
* node model-benchmark.mjs --scenarios all # kaikki skenaariot
*/
import { execSync } from 'child_process';
import { writeFileSync, mkdirSync, rmSync, existsSync } from 'fs';
// === CLI-argumentit ===
const args = process.argv.slice(2);
function arg(name, fallback) {
const i = args.indexOf(`--${name}`);
return i >= 0 && args[i + 1] ? args[i + 1] : fallback;
}
const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434');
const HUB_URL = arg('hub', ''); // Vaihtoehto: --hub https://kipina.studio
const FILTER_MODELS = arg('models', '');
const SCENARIO_FILTER = arg('scenarios', 'default');
const OUTPUT_DIR = arg('output', '/tmp/kipina-benchmark');
const MAX_FIX_ROUNDS = 2;
// === Ollama / Hub -client ===
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
const start = Date.now();
if (HUB_URL) {
// Hub-reitti: /api/v1/chat/completions
const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
});
if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
const data = await resp.json();
const elapsed = Date.now() - start;
return {
text: (data.response || '').trim(),
tokens: data.tokens_generated || 0,
durationMs: elapsed,
tokPerSec: data.tokens_per_sec || (data.tokens_generated || 0) / (elapsed / 1000),
};
}
// Suora Ollama-reitti: /api/chat
const messages = [];
if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
messages.push({ role: 'user', content: prompt });
const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model,
messages,
stream: false,
options: { num_predict: maxTokens, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
}),
});
if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
const data = await resp.json();
const elapsed = Date.now() - start;
const text = (data.message?.content || '').trim();
const evalCount = data.eval_count || 0;
const evalDurationNs = data.eval_duration || 1;
const tokPerSec = evalCount / (evalDurationNs / 1e9);
return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
}
async function ollamaListModels() {
const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
const resp = await fetch(url);
if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
const data = await resp.json();
return (data.models || []).map(m => m.name);
}
// === Promptit (kopioitu index.astrosta) ===
const CLIENT_SYSTEM = `You are a product owner who turns vague ideas into clear, actionable software requirements.
GIVEN a short project description from the user, produce a structured brief:
1. PROJECT NAME: a short, descriptive name
2. GOAL: one sentence explaining what the software does and who it's for
3. CORE FEATURES: numbered list of 3-8 concrete features (not vague wishes)
4. DATA MODEL: list the main entities and their key fields (include field types)
5. API ENDPOINTS: list the REST endpoints (method + path + purpose)
6. CONSTRAINTS: any technical constraints (e.g. "must use SQLite", "no auth needed")
RULES:
- Be specific: "User can filter todos by status" not "todo management"
- Use plain English, no code
- Maximum 400 words total`;
const SPEC_SYSTEM = `You are a software architect who designs database schemas for Python web applications.
THINK STEP BY STEP before outputting JSON:
1. What are the main ENTITIES (nouns) in this project?
2. What FIELDS does each entity need? (name, type, required?)
3. Which entities REFERENCE each other? (e.g. "a Book belongs to an Author" → Book has author_id)
4. Are there Date/DateTime fields? → add extra_imports
Then output ONLY valid JSON (no explanations before or after).
SCHEMA:
{"project_name":"short-name","description":"One sentence","entities":[{"name":"EntityName","table_name":"entity_names","fields":[{"name":"field_name","sa_type":"String(255)","py_type":"str","nullable":false,"default":null}]}],"relationships":[{"from":"ChildEntity","field":"parent_id","to":"ParentEntity","type":"many-to-one"}],"extra_imports":[]}
FIELD RULES:
- sa_type: String(N), Text, Integer, Date, DateTime, Boolean, Float
- py_type: str, int, float, bool, date, datetime — append " | None" if nullable
- Status fields: use String(20) with default value, NEVER Enum
- Every entity gets "id" automatically — do NOT add id or redundant ID fields
- Use snake_case for field names
RELATIONSHIP RULES:
- If entity A "belongs to" entity B → A has b_id field (Integer, nullable=false) + relationship entry
- EVERY _id field MUST have a matching relationship entry
- Parent entities must appear BEFORE children in the entities array
- If no relationships, set "relationships": []
AVOID: redundant ID fields, generic names, more than 7 fields or 3 entities, non-English entity/field names (ALWAYS English even if description is Finnish)
EXAMPLES (adapt, don't copy):
Todo app → Todo: title(str), description(Text|None), due_date(Date|None), status(String20="pending")
Blog → Author: name,email,bio(Text|None) / Post: title, content(Text), author_id→Author, published_at(DateTime|None), status(String20="draft")`;
const FIX_SYSTEM = 'You are a Python code fixer. Return ONLY the corrected Python file. No markdown fences, no explanations — just valid Python code.';
// === Template-funktiot (kopioitu korjatusta index.astrosta) ===
function pyLiteral(val) {
if (val === true) return 'True';
if (val === false) return 'False';
if (val === null || val === undefined) return 'None';
if (typeof val === 'string') return `"${val}"`;
return String(val);
}
function pyJsonLiteral(obj) {
const parts = Object.entries(obj).map(([k, v]) => {
let pyVal;
if (v === true) pyVal = 'True'; else if (v === false) pyVal = 'False';
else if (v === null) pyVal = 'None'; else if (typeof v === 'string') pyVal = `"${v}"`;
else pyVal = String(v);
return `"${k}":${pyVal}`;
});
return '{' + parts.join(',') + '}';
}
function tmplModels(spec) {
const saTypes = new Set(['Integer']);
for (const e of spec.entities) for (const f of e.fields) saTypes.add(f.sa_type.match(/^(\w+)/)[1]);
const relMap = {};
for (const r of (spec.relationships || [])) {
const target = spec.entities.find(e => e.name === r.to);
if (target) relMap[`${r.from}.${r.field}`] = target.table_name;
}
if (Object.keys(relMap).length > 0) saTypes.add('ForeignKey');
const imports = [...saTypes].sort().join(', ');
let code = `from sqlalchemy import create_engine, Column, ${imports}\nfrom sqlalchemy.orm import declarative_base, sessionmaker\n\nDATABASE_URL = "sqlite:///./app.db"\nengine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})\nSessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)\nBase = declarative_base()\n\n`;
for (const e of spec.entities) {
code += `class ${e.name}(Base):\n __tablename__ = "${e.table_name}"\n id = Column(Integer, primary_key=True, index=True)\n`;
for (const f of e.fields) {
const fkTarget = relMap[`${e.name}.${f.name}`];
let parts = fkTarget ? [`Column(${f.sa_type}, ForeignKey("${fkTarget}.id")`] : [`Column(${f.sa_type}`];
if (!f.nullable) parts.push('nullable=False');
if (f.default !== null && f.default !== undefined) parts.push(`default=${pyLiteral(f.default)}`);
code += ` ${f.name} = ${parts.join(', ')})\n`;
}
code += '\n';
}
code += 'Base.metadata.create_all(bind=engine)\n';
return code;
}
function tmplSchemas(spec) {
const dtTypes = new Set();
for (const e of spec.entities) for (const f of e.fields) {
if (/\bdate\b/i.test(f.py_type) && !/datetime/.test(f.py_type)) dtTypes.add('date');
if (/\bdatetime\b/i.test(f.py_type)) dtTypes.add('datetime');
}
let code = 'from pydantic import BaseModel, ConfigDict\n';
if (dtTypes.size > 0) code += `from datetime import ${[...dtTypes].sort().join(', ')}\n`;
for (const imp of (spec.extra_imports || [])) {
if (/^(date|datetime)$/.test(imp.trim())) continue;
if (/^from\s/.test(imp) || /^import\s/.test(imp)) code += imp + '\n';
}
code += '\n';
for (const e of spec.entities) {
code += `class ${e.name}Create(BaseModel):\n`;
for (const f of e.fields) {
if (f.default !== null && f.default !== undefined) code += ` ${f.name}: ${f.py_type} = ${pyLiteral(f.default)}\n`;
else if (f.nullable && f.py_type.includes('None')) code += ` ${f.name}: ${f.py_type} = None\n`;
else code += ` ${f.name}: ${f.py_type}\n`;
}
code += `\nclass ${e.name}Response(${e.name}Create):\n id: int\n model_config = ConfigDict(from_attributes=True)\n\n`;
}
return code;
}
function tmplMain(spec) {
const modelNames = spec.entities.map(e => e.name).join(', ');
const createNames = spec.entities.map(e => e.name+'Create').join(', ');
const responseNames = spec.entities.map(e => e.name+'Response').join(', ');
let code = `from fastapi import FastAPI, Depends, HTTPException\nfrom sqlalchemy.orm import Session\nfrom models import Base, engine, SessionLocal, ${modelNames}\nfrom schemas import ${createNames}, ${responseNames}\n\napp = FastAPI()\n\ndef get_db():\n db = SessionLocal()\n try:\n yield db\n finally:\n db.close()\n\n`;
for (const e of spec.entities) {
const lo = e.name.toLowerCase(), tb = e.table_name;
code += `@app.post("/${tb}/", response_model=${e.name}Response, status_code=201)\ndef create_${lo}(item: ${e.name}Create, db: Session = Depends(get_db)):\n db_item = ${e.name}(**item.model_dump())\n db.add(db_item)\n db.commit()\n db.refresh(db_item)\n return db_item\n\n`;
code += `@app.get("/${tb}/", response_model=list[${e.name}Response])\ndef list_${lo}s(db: Session = Depends(get_db)):\n return db.query(${e.name}).all()\n\n`;
code += `@app.get("/${tb}/{item_id}", response_model=${e.name}Response)\ndef get_${lo}(item_id: int, db: Session = Depends(get_db)):\n item = db.query(${e.name}).filter(${e.name}.id == item_id).first()\n if not item:\n raise HTTPException(status_code=404, detail="${e.name} not found")\n return item\n\n`;
code += `@app.put("/${tb}/{item_id}", response_model=${e.name}Response)\ndef update_${lo}(item_id: int, item: ${e.name}Create, db: Session = Depends(get_db)):\n db_item = db.query(${e.name}).filter(${e.name}.id == item_id).first()\n if not db_item:\n raise HTTPException(status_code=404, detail="${e.name} not found")\n for key, value in item.model_dump().items():\n setattr(db_item, key, value)\n db.commit()\n db.refresh(db_item)\n return db_item\n\n`;
code += `@app.delete("/${tb}/{item_id}", status_code=204)\ndef delete_${lo}(item_id: int, db: Session = Depends(get_db)):\n db_item = db.query(${e.name}).filter(${e.name}.id == item_id).first()\n if not db_item:\n raise HTTPException(status_code=404, detail="${e.name} not found")\n db.delete(db_item)\n db.commit()\n\n`;
}
return code;
}
function tmplTests(spec) {
let code = `from fastapi.testclient import TestClient\nfrom sqlalchemy import create_engine\nfrom sqlalchemy.orm import sessionmaker\nfrom main import app, get_db\nfrom models import Base\n\nTEST_DB = "sqlite:///./test.db"\ntest_engine = create_engine(TEST_DB, connect_args={"check_same_thread": False})\nTestSession = sessionmaker(autocommit=False, autoflush=False, bind=test_engine)\nBase.metadata.create_all(bind=test_engine)\n\ndef override_get_db():\n db = TestSession()\n try:\n yield db\n finally:\n db.close()\n\napp.dependency_overrides[get_db] = override_get_db\nclient = TestClient(app)\n\n`;
for (const e of spec.entities) {
const lo = e.name.toLowerCase(), tb = e.table_name;
const testData = {};
for (const f of e.fields) {
if (f.default !== null && f.default !== undefined) { testData[f.name] = f.default; continue; }
if (f.py_type.includes('str')) testData[f.name] = `Test ${f.name}`;
else if (f.py_type.includes('int')) testData[f.name] = 1;
else if (f.py_type.includes('float')) testData[f.name] = 1.0;
else if (f.py_type.includes('bool')) testData[f.name] = true;
else if (f.py_type.includes('date')) testData[f.name] = '2024-01-15';
}
const td = pyJsonLiteral(testData);
const firstStr = e.fields.find(f => f.py_type.includes('str') && f.name !== 'status');
const updateData = {...testData};
if (firstStr) updateData[firstStr.name] = `Updated ${firstStr.name}`;
const ud = pyJsonLiteral(updateData);
code += `def test_create_${lo}():\n response = client.post('/${tb}/', json=${td})\n assert response.status_code == 201\n assert 'id' in response.json()\n\n`;
code += `def test_list_${lo}s():\n client.post('/${tb}/', json=${td})\n response = client.get('/${tb}/')\n assert response.status_code == 200\n assert len(response.json()) >= 1\n\n`;
code += `def test_get_${lo}_by_id():\n created = client.post('/${tb}/', json=${td}).json()\n item_id = created['id']\n response = client.get(f'/${tb}/{item_id}')\n assert response.status_code == 200\n assert response.json()['id'] == item_id\n\n`;
code += `def test_get_${lo}_not_found():\n response = client.get('/${tb}/99999')\n assert response.status_code == 404\n\n`;
code += `def test_update_${lo}():\n created = client.post('/${tb}/', json=${td}).json()\n item_id = created['id']\n response = client.put(f'/${tb}/{item_id}', json=${ud})\n assert response.status_code == 200\n\n`;
code += `def test_delete_${lo}():\n created = client.post('/${tb}/', json=${td}).json()\n item_id = created['id']\n response = client.delete(f'/${tb}/{item_id}')\n assert response.status_code == 204\n response = client.get(f'/${tb}/{item_id}')\n assert response.status_code == 404\n\n`;
}
return code;
}
function tmplPyproject(spec) {
const name = (spec.project_name || 'app').toLowerCase().replace(/\s+/g, '-');
return `[project]\nname = "${name}"\nversion = "0.1.0"\nrequires-python = ">=3.11"\ndependencies = [\n "fastapi",\n "uvicorn[standard]",\n "sqlalchemy",\n "pytest",\n "httpx",\n]\n`;
}
// === Validaattori ===
function validateProjectCode(files) {
const issues = [];
for (const [fname, code] of Object.entries(files)) {
if (!fname.endsWith('.py')) continue;
const lines = code.split('\n');
for (const line of lines) {
const m = line.match(/^from\s+\.(\w*)\s+import/);
if (m) issues.push(`ISSUE: ${fname}: relatiivinen import`);
}
for (const line of lines) {
const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
if (!m) continue;
const srcCode = files[m[1] + '.py'];
if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
const names = m[2].split(',').map(n => n.trim().split(/\s+as\s+/)[0].trim());
for (const name of names) {
if (name && !srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
}
}
if (fname === 'schemas.py') {
if (/:\s*date\b/.test(code) && !/from datetime import/.test(code))
issues.push('ISSUE: schemas.py: date-import puuttuu');
if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code))
issues.push('ISSUE: schemas.py: datetime-import puuttuu');
}
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue;
if (/(?<!["\w])false(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "false" → "False"`);
if (/(?<!["\w])true(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "true" → "True"`);
}
}
return issues;
}
function extractJson(text) {
const m = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
if (m) text = m[1].trim();
let depth = 0, start = null;
for (let i = 0; i < text.length; i++) {
if (text[i] === '{') { if (depth === 0) start = i; depth++; }
else if (text[i] === '}') { depth--; if (depth === 0 && start !== null) { try { return JSON.parse(text.slice(start, i+1)); } catch(e) { continue; } } }
}
return null;
}
// === Testiskenaariot ===
const SCENARIOS = [
{ id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
{ id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
{ id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
];
// === Pipeline: yhdelle mallille ja skenaariolle ===
async function runPipeline(model, scenario) {
const result = {
model, scenario: scenario.id,
reqOk: false, specOk: false, specEntities: 0,
validationIssues: 0, fixRounds: 0,
testsTotal: 0, testsPassed: 0, testsFailed: 0,
totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
error: null,
};
const timings = [];
const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
mkdirSync(dir, { recursive: true });
try {
// 1. Vaatimukset
console.log(` [1/5] Vaatimukset...`);
const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 1024);
timings.push(req);
if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
result.reqOk = true;
writeFileSync(`${dir}/_requirements.txt`, req.text);
// 2. JSON-speksi
console.log(` [2/5] JSON-speksi...`);
const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 2048);
timings.push(specResp);
const spec = extractJson(specResp.text);
if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'JSON-speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; }
result.specOk = true;
result.specEntities = spec.entities.length;
writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));
// 3. Template-generointi
console.log(` [3/5] Koodigenerointi...`);
const files = {
'models.py': tmplModels(spec),
'schemas.py': tmplSchemas(spec),
'main.py': tmplMain(spec),
'test_main.py': tmplTests(spec),
'pyproject.toml': tmplPyproject(spec),
};
// 4. Validointi + korjaussilmukka
let issues = validateProjectCode(files);
let fixRound = 0;
while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
fixRound++;
console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
const issuesByFile = {};
for (const issue of issues) {
const m = issue.match(/^ISSUE:\s*(\S+?):/);
const fname = m ? m[1] : 'unknown';
if (!issuesByFile[fname]) issuesByFile[fname] = [];
issuesByFile[fname].push(issue);
}
for (const [fname, fIssues] of Object.entries(issuesByFile)) {
if (!files[fname]) continue;
const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
timings.push(fixResp);
if (fixResp.text) {
files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
}
}
issues = validateProjectCode(files);
}
result.validationIssues = issues.length;
result.fixRounds = fixRound;
// Kirjoita tiedostot levylle
for (const [fn, content] of Object.entries(files)) writeFileSync(`${dir}/${fn}`, content);
// 5. Pytest
console.log(` [5/5] Pytest...`);
try {
const uvPath = process.env.HOME + '/.local/bin/uv';
const uv = existsSync(uvPath) ? uvPath : 'uv';
execSync(`cd "${dir}" && ${uv} sync 2>/dev/null`, { timeout: 60000, stdio: 'pipe' });
execSync(`cd "${dir}" && rm -f app.db test.db`, { stdio: 'pipe' });
const pytestOut = execSync(`cd "${dir}" && ${uv} run pytest test_main.py -v --tb=short 2>&1`, { timeout: 60000, encoding: 'utf-8' });
writeFileSync(`${dir}/_pytest.txt`, pytestOut);
const passedMatch = pytestOut.match(/(\d+) passed/);
const failedMatch = pytestOut.match(/(\d+) failed/);
result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0;
result.testsFailed = failedMatch ? parseInt(failedMatch[1]) : 0;
result.testsTotal = result.testsPassed + result.testsFailed;
} catch (e) {
const output = e.stdout || e.stderr || e.message || '';
writeFileSync(`${dir}/_pytest.txt`, output);
const passedMatch = output.match(/(\d+) passed/);
const failedMatch = output.match(/(\d+) failed/);
const errorMatch = output.match(/(\d+) error/);
result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0;
result.testsFailed = (failedMatch ? parseInt(failedMatch[1]) : 0) + (errorMatch ? parseInt(errorMatch[1]) : 0);
result.testsTotal = result.testsPassed + result.testsFailed;
if (result.testsTotal === 0) result.error = 'Pytest kaatui';
}
} catch (e) {
result.error = e.message;
}
// Yhteenveto
result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
return result;
}
// === Main ===
async function main() {
console.log('╔══════════════════════════════════════════════╗');
console.log('║ Kipinä Model Benchmark ║');
console.log('╚══════════════════════════════════════════════╝');
console.log(`Ollama: ${OLLAMA_URL}`);
// Haetaan mallit
let models;
try {
models = await ollamaListModels();
} catch (e) {
console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`);
process.exit(1);
}
if (FILTER_MODELS) {
const filter = FILTER_MODELS.split(',').map(s => s.trim());
models = models.filter(m => filter.some(f => m.includes(f)));
}
console.log(`Mallit (${models.length}): ${models.join(', ')}`);
const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS : [SCENARIOS[0]];
console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
console.log(`Tulokset: ${OUTPUT_DIR}/`);
console.log('');
// Puhdista output
rmSync(OUTPUT_DIR, { recursive: true, force: true });
mkdirSync(OUTPUT_DIR, { recursive: true });
const results = [];
for (const model of models) {
for (const scenario of scenarios) {
console.log(`\n━━━ ${model} × ${scenario.id} ━━━`);
const r = await runPipeline(model, scenario);
results.push(r);
const status = r.error ? `${r.error}` :
r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` :
`${r.testsPassed}/${r.testsTotal}`;
console.log(`${status} | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s`);
}
}
// === Tulostaulu ===
console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
console.log('║ TULOKSET ║');
console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣');
const header = [
'Malli'.padEnd(40),
'Skenaario'.padEnd(10),
'Speksi'.padEnd(8),
'Testit'.padEnd(10),
'Korjaus'.padEnd(8),
'Aika'.padEnd(8),
'tok/s'.padEnd(8),
'Tulos',
].join(' │ ');
console.log(`${header}`);
console.log('╠' + '═'.repeat(header.length + 2) + '╣');
for (const r of results) {
const specStatus = r.specOk ? `${r.specEntities}e` : '✗';
const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
const speed = `${r.avgTokPerSec.toFixed(0)}`;
const verdict = r.error ? '✗ FAIL' : r.testsPassed === r.testsTotal && r.testsTotal > 0 ? '✓ PASS' : '◐ PARTIAL';
const row = [
r.model.padEnd(40),
r.scenario.padEnd(10),
specStatus.padEnd(8),
testStatus.padEnd(10),
fixStatus.padEnd(8),
time.padEnd(8),
speed.padEnd(8),
verdict,
].join(' │ ');
console.log(`${row}`);
}
console.log('╚' + '═'.repeat(header.length + 2) + '╝');
// Tallenna JSON
writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
console.log(`\nJSON: ${OUTPUT_DIR}/results.json`);
// Yhteenveto
const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
const failed = results.filter(r => r.error || r.testsTotal === 0);
console.log(`\n✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length}`);
}
main().catch(e => { console.error(e); process.exit(1); });