Standalone HTML-tiedosto, joka sisältää:
- Yhteenvetokortit (keskiarvo, paras malli, nopein, testit)
- Mallikohtainen taulukko palkkikaavioilla
- Yksittäiset tulokset sortattavassa taulussa
- Dark mode, ei ulkoisia dependencyjä
568 lines
26 KiB
JavaScript
568 lines
26 KiB
JavaScript
#!/usr/bin/env node
|
||
/**
|
||
* Kipinä Model Benchmark
|
||
*
|
||
* Generoi projekteja eri Ollama-malleilla ja testaa niiden toimivuus.
|
||
* Käyttö:
|
||
* node model-benchmark.mjs # kaikki mallit, oletusskenaario
|
||
* node model-benchmark.mjs --models qwen3:8b,qwen3:30b
|
||
* node model-benchmark.mjs --ollama http://host:11434
|
||
* node model-benchmark.mjs --scenarios all # kaikki skenaariot
|
||
*/
|
||
|
||
import { execSync } from 'child_process';
|
||
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs';
|
||
import { dirname, join } from 'path';
|
||
import { fileURLToPath } from 'url';
|
||
|
||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||
|
||
// === CLI arguments ===
const args = process.argv.slice(2);

/**
 * Looks up the value that follows a `--name` flag.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @param {string} fallback - Returned when the flag is absent or has no value.
 * @param {string[]} [argv=args] - Argument list to search (defaults to the process arguments).
 * @returns {string} The flag value, or `fallback`.
 */
function arg(name, fallback, argv = args) {
  const flagIndex = argv.indexOf(`--${name}`);
  if (flagIndex < 0) return fallback;
  const value = argv[flagIndex + 1];
  // A following token that is itself a flag means the value was omitted;
  // do not swallow the next flag as this flag's value.
  if (!value || value.startsWith('--')) return fallback;
  return value;
}

const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434');
const HUB_URL = arg('hub', ''); // Alternative route: --hub https://kipina.studio
const FILTER_MODELS = arg('models', '');
const SCENARIO_FILTER = arg('scenarios', 'default');
const OUTPUT_DIR = arg('output', '/tmp/kipina-benchmark');
const MAX_FIX_ROUNDS = 2;
|
||
|
||
// === Strip reasoning tags (gemma4, qwen3/3.5, etc.) ===
/**
 * Removes model "thinking" sections from a response and trims the result.
 *
 * @param {string} text - Raw model output; null/undefined is treated as empty.
 * @returns {string} The text without reasoning blocks, trimmed.
 */
function stripThinking(text) {
  return String(text ?? '')
    .replace(/<\|channel>thought[\s\S]*?<channel\|>/g, '') // gemma4
    .replace(/<think>[\s\S]*?<\/think>/g, '') // qwen3, qwen3.5
    .trim();
}
|
||
|
||
// === Ollama / Hub client ===
/**
 * Sends one prompt to a model and returns the cleaned answer with timing data.
 *
 * When HUB_URL is set the request goes to the hub's /api/v1/chat/completions
 * endpoint; otherwise it goes directly to Ollama's /api/chat.
 *
 * @param {string} model - Model name.
 * @param {string} prompt - User prompt.
 * @param {string} [systemPrompt] - Optional system prompt.
 * @param {number} [maxTokens=2048] - Generation limit.
 * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
 * @throws {Error} When the HTTP response is not OK.
 */
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
  const start = Date.now();

  if (HUB_URL) {
    // Hub route: /api/v1/chat/completions
    const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
    });
    if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
    const data = await resp.json();
    const elapsed = Date.now() - start;
    const tokens = data.tokens_generated || 0;
    // Fall back to wall-clock throughput; guard against a zero-length interval.
    const measuredTokPerSec = elapsed > 0 ? tokens / (elapsed / 1000) : 0;
    return {
      text: stripThinking((data.response || '').trim()),
      tokens,
      durationMs: elapsed,
      tokPerSec: data.tokens_per_sec || measuredTokPerSec,
    };
  }

  // Direct Ollama route: /api/chat
  const messages = [];
  if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
  messages.push({ role: 'user', content: prompt });

  const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      messages,
      stream: false,
      options: { num_predict: maxTokens, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
    }),
  });
  if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
  const data = await resp.json();
  const elapsed = Date.now() - start;
  const text = stripThinking((data.message?.content || '').trim());
  const evalCount = data.eval_count || 0;
  const evalDurationNs = data.eval_duration || 0;
  // Without a reported duration the throughput is unknown; report 0 instead of
  // dividing by a 1 ns placeholder, which would skew every average.
  const tokPerSec = evalDurationNs > 0 ? evalCount / (evalDurationNs / 1e9) : 0;
  return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
}
|
||
|
||
/**
 * Fetches the names of the available models.
 *
 * Uses the hub's /api/v1/ollama/tags endpoint when HUB_URL is set, otherwise
 * Ollama's /api/tags.
 *
 * @returns {Promise<string[]>} Model names; entries without a usable name are dropped.
 * @throws {Error} When the HTTP response is not OK.
 */
async function ollamaListModels() {
  const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
  const data = await resp.json();
  const entries = Array.isArray(data?.models) ? data.models : [];
  // Callers run string operations on the names, so only keep non-empty strings.
  return entries
    .map((entry) => entry?.name)
    .filter((name) => typeof name === 'string' && name.length > 0);
}
|
||
|
||
// === Prompts (copied from index.astro) ===
// System prompt for step 1: turns the short scenario description into a structured brief.
const CLIENT_SYSTEM = `You are a product owner who turns vague ideas into clear, actionable software requirements.

GIVEN a short project description from the user, produce a structured brief:

1. PROJECT NAME: a short, descriptive name
2. GOAL: one sentence explaining what the software does and who it's for
3. CORE FEATURES: numbered list of 3-8 concrete features (not vague wishes)
4. DATA MODEL: list the main entities and their key fields (include field types)
5. API ENDPOINTS: list the REST endpoints (method + path + purpose)
6. CONSTRAINTS: any technical constraints (e.g. "must use SQLite", "no auth needed")

RULES:
- Be specific: "User can filter todos by status" not "todo management"
- Use plain English, no code
- Maximum 400 words total`;
|
||
|
||
// System prompt for step 2: asks for a JSON schema specification of the project's entities.
const SPEC_SYSTEM = `You are a software architect who designs database schemas for Python web applications.

THINK STEP BY STEP before outputting JSON:
1. What are the main ENTITIES (nouns) in this project?
2. What FIELDS does each entity need? (name, type, required?)
3. Which entities REFERENCE each other? (e.g. "a Book belongs to an Author" → Book has author_id)
4. Are there Date/DateTime fields? → add extra_imports

Then output ONLY valid JSON (no explanations before or after).

SCHEMA:
{"project_name":"short-name","description":"One sentence","entities":[{"name":"EntityName","table_name":"entity_names","fields":[{"name":"field_name","sa_type":"String(255)","py_type":"str","nullable":false,"default":null}]}],"relationships":[{"from":"ChildEntity","field":"parent_id","to":"ParentEntity","type":"many-to-one"}],"extra_imports":[]}

FIELD RULES:
- sa_type: String(N), Text, Integer, Date, DateTime, Boolean, Float
- py_type: str, int, float, bool, date, datetime — append " | None" if nullable
- Status fields: use String(20) with default value, NEVER Enum
- Every entity gets "id" automatically — do NOT add id or redundant ID fields
- Use snake_case for field names

RELATIONSHIP RULES:
- If entity A "belongs to" entity B → A has b_id field (Integer, nullable=false) + relationship entry
- EVERY _id field MUST have a matching relationship entry
- Parent entities must appear BEFORE children in the entities array
- If no relationships, set "relationships": []

AVOID: redundant ID fields, generic names, more than 7 fields or 3 entities, non-English entity/field names (ALWAYS English even if description is Finnish)

EXAMPLES (adapt, don't copy):
Todo app → Todo: title(str), description(Text|None), due_date(Date|None), status(String20="pending")
Blog → Author: name,email,bio(Text|None) / Post: title, content(Text), author_id→Author, published_at(DateTime|None), status(String20="draft")`;
|
||
|
||
// System prompt for the repair step: instructs the model to return only the corrected Python file.
const FIX_SYSTEM = 'You are a Python code fixer. Return ONLY the corrected Python file. No markdown fences, no explanations — just valid Python code.';
|
||
|
||
// === Golden example ===
// Directory (next to this script) holding the reference "todo" project.
const GOLDEN_DIR = join(__dirname, 'golden-examples', 'todo');
// Reference files, in the order they are embedded into the prompt.
const GOLDEN_FILES = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];
|
||
/**
 * Builds the "REFERENCE IMPLEMENTATION" prompt section from the golden example files.
 *
 * @param {string} [dir=GOLDEN_DIR] - Directory containing the reference files.
 * @param {string[]} [fileNames=GOLDEN_FILES] - File names to embed, in order.
 * @returns {string} The prompt section, or '' when the directory or all files are missing.
 */
function loadGoldenExample(dir = GOLDEN_DIR, fileNames = GOLDEN_FILES) {
  if (!existsSync(dir)) return '';
  const sections = [];
  for (const fileName of fileNames) {
    const filePath = join(dir, fileName);
    if (!existsSync(filePath)) continue;
    sections.push(`=== ${fileName} ===\n${readFileSync(filePath, 'utf-8').trim()}\n\n`);
  }
  // A header with no files behind it would promise a reference the prompt does not contain.
  if (sections.length === 0) return '';
  return `\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n${sections.join('')}`;
}
|
||
// Reference-implementation prompt section, loaded once at startup.
const GOLDEN_EXAMPLE = loadGoldenExample();
|
||
|
||
// System prompt for step 3: asks for the four project files separated by "=== name ===" markers.
const CODE_SYSTEM = `You are a Python backend developer. Generate a FastAPI project with SQLAlchemy and SQLite.

Given the project requirements, JSON specification, and a REFERENCE IMPLEMENTATION, generate these 4 files:

1. models.py — SQLAlchemy 2.0: DeclarativeBase, Mapped, mapped_column (NOT legacy declarative_base)
2. schemas.py — Pydantic v2: ConfigDict(from_attributes=True) (NOT class Config)
3. main.py — FastAPI CRUD endpoints for each entity
4. test_main.py — Pytest with TestClient, separate test.db, unique test data per test

Do NOT generate pyproject.toml — it is created separately with uv.

OUTPUT FORMAT — use these exact markers to separate files:

=== models.py ===
<python code>

=== schemas.py ===
<python code>

=== main.py ===
<python code>

=== test_main.py ===
<python code>

DOCUMENTATION — every file must have a one-line module docstring. Classes get a one-line docstring. Keep it zensical: say what it IS, not what it does. No filler.

RULES:
- Follow the REFERENCE IMPLEMENTATION patterns exactly
- SQLAlchemy 2.0: DeclarativeBase + Mapped + mapped_column (not Column())
- Python type unions: str | None (not Optional[str])
- Tests: unique descriptive data per test, NOT generic "test_title" strings
- Absolute imports only (from models import ..., from schemas import ...)
- NO markdown fences inside file content — just raw code
- Only test endpoints that exist in main.py — no extra tests`;
|
||
|
||
// === File parser for the LLM response ===

/**
 * Removes a markdown code fence from one file section.
 *
 * When the section starts with a fenced block, only the fenced content is
 * kept, so commentary the model adds after the closing fence does not end up
 * in the source file. Otherwise stray fence lines are removed as before.
 */
function stripMarkdownFence(raw) {
  const content = raw.trim();
  const fenced = content.match(/^```[\w+-]*[ \t]*\r?\n([\s\S]*?)\r?\n?```/);
  if (fenced) return fenced[1].trim();
  return content.replace(/^```(?:python|toml)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim();
}

/**
 * Splits an LLM response into files using "=== name.py ===" markers.
 *
 * @param {string} text - Raw model output.
 * @returns {Object<string, string>} Map of file name to content (newline-terminated);
 *   sections that are empty after cleaning are omitted.
 */
function parseGeneratedFiles(text) {
  const files = {};
  // sections: [preamble, filename1, content1, filename2, content2, ...]
  const sections = String(text ?? '').split(/===\s*(\S+\.(?:py|toml))\s*===/);
  for (let i = 1; i + 1 < sections.length; i += 2) {
    const name = sections[i];
    const content = stripMarkdownFence(sections[i + 1]);
    if (content) files[name] = `${content}\n`;
  }
  return files;
}
|
||
|
||
// === Validator ===
/**
 * Runs cheap static checks on the generated Python files.
 *
 * Checks: relative imports, names imported from models/schemas/main that do
 * not occur in the source file, missing datetime imports in schemas.py, and
 * JavaScript-style `true`/`false` literals.
 *
 * @param {Object<string, string>} files - Map of file name to source code.
 * @returns {string[]} Issue messages, each starting with "ISSUE: <file>:".
 */
function validateProjectCode(files) {
  const issues = [];
  for (const [fname, code] of Object.entries(files)) {
    if (!fname.endsWith('.py')) continue;
    const lines = code.split('\n');

    for (const line of lines) {
      if (/^from\s+\.(\w*)\s+import/.test(line)) issues.push(`ISSUE: ${fname}: relatiivinen import`);
    }

    for (const line of lines) {
      const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
      if (!m) continue;
      const srcCode = files[`${m[1]}.py`];
      if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
      const names = m[2]
        .replace(/#.*$/, '') // trailing comment is not an imported name
        .replace(/[()\\]/g, ' ') // parentheses / line continuation of multi-line imports
        .split(',')
        .map((n) => n.trim().split(/\s+as\s+/)[0].trim());
      for (const name of names) {
        // A wildcard import names nothing specific to look up.
        if (!name || name === '*') continue;
        if (!srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
      }
    }

    if (fname === 'schemas.py') {
      if (/:\s*date\b/.test(code) && !/from datetime import/.test(code))
        issues.push('ISSUE: schemas.py: date-import puuttuu');
      if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code))
        issues.push('ISSUE: schemas.py: datetime-import puuttuu');
    }

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue;
      // Quoted occurrences (either quote style) are string data, not literals.
      if (/(?<!["'\w])false(?![\w"'])/.test(line)) issues.push(`ISSUE: ${fname}:${i + 1}: "false" → "False"`);
      if (/(?<!["'\w])true(?![\w"'])/.test(line)) issues.push(`ISSUE: ${fname}:${i + 1}: "true" → "True"`);
    }
  }
  return issues;
}
|
||
|
||
/**
 * Extracts the first parseable top-level JSON object from model output.
 *
 * If the text contains a fenced code block, only its content is searched.
 * Candidates are found by balancing braces; a candidate that fails to parse
 * is skipped and the scan continues.
 *
 * @param {string} text - Raw model output.
 * @returns {object|null} The parsed object, or null when none is found.
 */
function extractJson(text) {
  const fenced = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
  const haystack = fenced ? fenced[1].trim() : text;
  let depth = 0;
  let start = -1;
  for (let i = 0; i < haystack.length; i++) {
    const ch = haystack[i];
    if (ch === '{') {
      if (depth === 0) start = i;
      depth++;
    } else if (ch === '}' && depth > 0) {
      // Closing braces with nothing open are ignored so that a stray "}" in
      // the surrounding prose cannot push the depth negative for good.
      depth--;
      if (depth === 0) {
        try {
          return JSON.parse(haystack.slice(start, i + 1));
        } catch {
          start = -1; // not JSON; keep scanning for the next candidate
        }
      }
    }
  }
  return null;
}
|
||
|
||
// === Test scenarios ===
// Each scenario is a short project description handed to the requirements step.
// The first entry is the default scenario.
const SCENARIOS = [
  { id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
  { id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
  { id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
];
|
||
|
||
// === Pipeline: one model × one scenario ===

/**
 * Reads the pass/fail counters from pytest's textual summary.
 * With `includeErrors`, "N error(s)" is added to the failed count.
 */
function parsePytestCounts(output, includeErrors) {
  const count = (pattern) => {
    const match = output.match(pattern);
    return match ? Number.parseInt(match[1], 10) : 0;
  };
  const passed = count(/(\d+) passed/);
  let failed = count(/(\d+) failed/);
  if (includeErrors) failed += count(/(\d+) error/);
  return { passed, failed, total: passed + failed };
}

/**
 * Runs the five pipeline steps, recording progress in `result` and every LLM
 * call in `timings`. Returns early (with `result.error` set) when a step fails.
 */
async function executePipelineSteps(model, scenario, dir, result, timings) {
  // 1. Requirements
  console.log(` [1/5] Vaatimukset...`);
  const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 1024);
  timings.push(req);
  if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return; }
  result.reqOk = true;
  writeFileSync(`${dir}/_requirements.txt`, req.text);

  // 2. JSON spec
  console.log(` [2/5] JSON-speksi...`);
  const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 2048);
  timings.push(specResp);
  const spec = extractJson(specResp.text);
  if (!spec || !spec.entities || spec.entities.length === 0) {
    result.error = 'JSON-speksi epäonnistui';
    writeFileSync(`${dir}/_spec_raw.txt`, specResp.text);
    return;
  }
  result.specOk = true;
  result.specEntities = spec.entities.length;
  writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));

  // 3. LLM code generation
  console.log(` [3/5] Koodigenerointi (LLM)...`);
  // The file count must agree with CODE_SYSTEM, which asks for exactly 4 files
  // and forbids pyproject.toml.
  const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all 4 files. Follow the reference implementation patterns exactly.`;
  result.promptChars = CODE_SYSTEM.length + codePrompt.length;
  result.promptTokensEst = Math.round(result.promptChars / 4); // rough estimate: ~4 chars per token
  const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, 8192);
  timings.push(codeResp);
  writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
  const files = parseGeneratedFiles(codeResp.text);
  const required = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];
  const missing = required.filter((f) => !files[f]);
  if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return; }

  // 4. Validation + fix loop
  let issues = validateProjectCode(files);
  let fixRound = 0;
  while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
    fixRound++;
    console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
    // Group issues by the file name embedded in "ISSUE: <file>: ...".
    const issuesByFile = {};
    for (const issue of issues) {
      const m = issue.match(/^ISSUE:\s*(\S+?):/);
      const fname = m ? m[1] : 'unknown';
      if (!issuesByFile[fname]) issuesByFile[fname] = [];
      issuesByFile[fname].push(issue);
    }
    for (const [fname, fIssues] of Object.entries(issuesByFile)) {
      if (!files[fname]) continue;
      const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
      const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
      timings.push(fixResp);
      if (fixResp.text) {
        files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
      }
    }
    issues = validateProjectCode(files);
  }
  result.validationIssues = issues.length;
  result.fixRounds = fixRound;

  // Write the Python files generated by the LLM
  for (const [fn, content] of Object.entries(files)) {
    if (fn.endsWith('.py')) writeFileSync(`${dir}/${fn}`, content);
  }

  // 5. Pytest in a Docker container (kipina-pytest image)
  console.log(` [5/5] Pytest (Docker)...`);
  try {
    const pytestOut = execSync(
      `docker run --rm -v "${dir}:/src:ro" kipina-pytest 2>&1`,
      { timeout: 120000, encoding: 'utf-8' }
    );
    writeFileSync(`${dir}/_pytest.txt`, pytestOut);
    const counts = parsePytestCounts(pytestOut, false);
    result.testsPassed = counts.passed;
    result.testsFailed = counts.failed;
    result.testsTotal = counts.total;
  } catch (e) {
    // Non-zero exit (test failures, collection errors, timeout): parse whatever was printed.
    const output = e.stdout || e.stderr || e.message || '';
    writeFileSync(`${dir}/_pytest.txt`, output);
    const counts = parsePytestCounts(output, true);
    result.testsPassed = counts.passed;
    result.testsFailed = counts.failed;
    result.testsTotal = counts.total;
    if (result.testsTotal === 0) result.error = 'Pytest kaatui';
  }
}

/**
 * Runs the full benchmark pipeline for one model and one scenario.
 *
 * Artifacts are written to `<OUTPUT_DIR>/<model>__<scenario>/`. The summary
 * fields (duration, tokens, speed, score, stars) are always filled in, also
 * when a step fails early, so that failed runs report the time and tokens
 * they consumed and render with a consistent star rating.
 *
 * @param {string} model - Model name.
 * @param {{id: string, prompt: string}} scenario - Scenario to run.
 * @returns {Promise<object>} The result record; `error` is null on success.
 */
async function runPipeline(model, scenario) {
  const result = {
    model, scenario: scenario.id,
    reqOk: false, specOk: false, specEntities: 0,
    validationIssues: 0, fixRounds: 0,
    testsTotal: 0, testsPassed: 0, testsFailed: 0,
    totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
    promptChars: 0, promptTokensEst: 0,
    score: 0, stars: '',
    error: null,
  };
  const timings = [];
  const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
  mkdirSync(dir, { recursive: true });

  try {
    await executePipelineSteps(model, scenario, dir, result, timings);
  } catch (e) {
    result.error = e.message;
  }

  // Summary
  result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
  result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
  result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
  result.score = scoreResult(result);
  result.stars = starsForScore(result.score);

  return result;
}
|
||
|
||
// === Scoring (0–100) and star rating ===
/**
 * Computes the 0–100 score of one pipeline result.
 *
 * Breakdown: spec parsed 10p, code generated 10p, test pass ratio up to 60p,
 * fix rounds 20p (0 rounds = 20, 1 = 10, 2 or more = 0). A run that ended in
 * an error without any test being counted scores 0.
 *
 * @param {object} r - Result record (reads error, testsTotal, testsPassed, specOk, fixRounds).
 * @returns {number} Score, capped at 100.
 */
function scoreResult(r) {
  if (r.error && r.testsTotal === 0) return 0;
  let score = 0;
  // Spec succeeded (10p)
  if (r.specOk) score += 10;
  // Code generated (10p)
  if (!r.error || r.testsTotal > 0) score += 10;
  // Test pass ratio (60p)
  if (r.testsTotal > 0) score += Math.round((r.testsPassed / r.testsTotal) * 60);
  // Fix rounds (20p: 0× = 20, 1× = 10, 2× = 0)
  score += Math.max(0, 20 - r.fixRounds * 10);
  return Math.min(100, score);
}
|
||
/**
 * Maps a 0–100 score to a five-star rating string.
 *
 * Thresholds: ≥90 five stars, ≥70 four, ≥50 three, ≥25 two, >0 one, otherwise none.
 *
 * @param {number} score - Score to rate.
 * @returns {string} Five characters of filled/empty stars.
 */
function starsForScore(score) {
  if (score >= 90) return '★★★★★';
  if (score >= 70) return '★★★★☆';
  if (score >= 50) return '★★★☆☆';
  if (score >= 25) return '★★☆☆☆';
  if (score > 0) return '★☆☆☆☆';
  return '☆☆☆☆☆';
}
|
||
|
||
// === Main ===
/**
 * Benchmark entry point: lists the models, runs every selected scenario on
 * every selected model, prints the result tables and writes results.json and,
 * if report-template.html exists next to the script, report.html.
 *
 * Exits with status 1 when the model list cannot be fetched; sets exit code 1
 * and leaves OUTPUT_DIR untouched when no model matches.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const bannerWidth = 46;
  console.log(`╔${'═'.repeat(bannerWidth)}╗`);
  console.log(`║${' Kipinä Model Benchmark'.padEnd(bannerWidth)}║`);
  console.log(`╚${'═'.repeat(bannerWidth)}╝`);
  console.log(`Ollama: ${OLLAMA_URL}`);
  if (HUB_URL) console.log(`Hub: ${HUB_URL}`);

  // Fetch the models
  let models;
  try {
    models = await ollamaListModels();
  } catch (e) {
    // Report the endpoint that was actually contacted.
    console.error(`Ei yhteyttä Ollamaan (${HUB_URL || OLLAMA_URL}): ${e.message}`);
    process.exit(1);
  }

  if (FILTER_MODELS) {
    // Drop empty tokens: an empty string is a substring of every model name.
    const filter = FILTER_MODELS.split(',').map((s) => s.trim()).filter(Boolean);
    models = models.filter((m) => filter.some((f) => m.includes(f)));
  }

  console.log(`Mallit (${models.length}): ${models.join(', ')}`);

  if (models.length === 0) {
    // Nothing to run: keep the previous results instead of wiping OUTPUT_DIR.
    console.error('Ei yhtään mallia ajettavaksi.');
    process.exitCode = 1;
    return;
  }

  // 'all' → every scenario, 'default' → the first one, otherwise a
  // comma-separated list of scenario ids (falls back to the default if none match).
  let scenarios;
  if (SCENARIO_FILTER === 'all') {
    scenarios = SCENARIOS;
  } else if (SCENARIO_FILTER === 'default') {
    scenarios = [SCENARIOS[0]];
  } else {
    const wanted = new Set(SCENARIO_FILTER.split(',').map((s) => s.trim()).filter(Boolean));
    scenarios = SCENARIOS.filter((s) => wanted.has(s.id));
    if (scenarios.length === 0) {
      console.warn(`Tuntematon skenaario "${SCENARIO_FILTER}" – käytetään oletusta (${SCENARIOS[0].id})`);
      scenarios = [SCENARIOS[0]];
    }
  }
  console.log(`Skenaariot (${scenarios.length}): ${scenarios.map((s) => s.id).join(', ')}`);
  console.log(`Tulokset: ${OUTPUT_DIR}/`);
  console.log('');

  // Clean the output directory
  rmSync(OUTPUT_DIR, { recursive: true, force: true });
  mkdirSync(OUTPUT_DIR, { recursive: true });

  const results = [];

  for (const model of models) {
    for (const scenario of scenarios) {
      console.log(`\n━━━ ${model} × ${scenario.id} ━━━`);
      const r = await runPipeline(model, scenario);
      results.push(r);

      const status = r.error ? `✗ ${r.error}` :
        r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` :
        `◐ ${r.testsPassed}/${r.testsTotal}`;
      const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst / 1000).toFixed(1)}K` : '';
      console.log(` → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs / 1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
    }
  }

  // === Result table ===
  // The score column holds five stars, a space and up to three digits.
  const scoreColWidth = 9;
  const header = [
    'Malli'.padEnd(40),
    'Skenaario'.padEnd(10),
    'Speksi'.padEnd(8),
    'Testit'.padEnd(10),
    'Korjaus'.padEnd(8),
    'Ctx'.padEnd(7),
    'Aika'.padEnd(8),
    'tok/s'.padEnd(8),
    'Pisteet'.padEnd(scoreColWidth),
  ].join(' │ ');
  // All frame lines are derived from the header width so the box lines up.
  const frameWidth = header.length + 2;
  console.log(`\n\n╔${'═'.repeat(frameWidth)}╗`);
  console.log(`║${' TULOKSET'.padEnd(frameWidth)}║`);
  console.log(`╠${'═'.repeat(frameWidth)}╣`);
  console.log(`║ ${header} ║`);
  console.log(`╠${'═'.repeat(frameWidth)}╣`);

  for (const r of results) {
    const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗';
    const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
    const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
    const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst / 1000).toFixed(1)}K` : '-';
    const time = `${(r.totalDurationMs / 1000).toFixed(0)}s`;
    const speed = `${r.avgTokPerSec.toFixed(0)}`;
    const row = [
      r.model.padEnd(40),
      r.scenario.padEnd(10),
      specStatus.padEnd(8),
      testStatus.padEnd(10),
      fixStatus.padEnd(8),
      ctx.padEnd(7),
      time.padEnd(8),
      speed.padEnd(8),
      `${r.stars} ${r.score}`.padEnd(scoreColWidth),
    ].join(' │ ');
    console.log(`║ ${row} ║`);
  }
  console.log(`╚${'═'.repeat(frameWidth)}╝`);

  // === Per-model summary ===
  const modelNames = [...new Set(results.map((r) => r.model))];
  const scenarioIds = scenarios.map((s) => s.id);

  console.log('\n');
  const mHeader = [
    'Malli'.padEnd(35),
    ...scenarioIds.map((s) => s.padEnd(22)),
    'Yht.'.padEnd(8),
    'Out'.padEnd(7),
    'Aika'.padEnd(8),
    'tok/s'.padEnd(7),
    'Pisteet',
  ].join(' │ ');
  console.log(mHeader);
  console.log('─'.repeat(mHeader.length));

  for (const model of modelNames) {
    const mrs = results.filter((r) => r.model === model);
    const cols = scenarioIds.map((sid) => {
      const r = mrs.find((candidate) => candidate.scenario === sid);
      if (!r) return '-'.padEnd(22);
      const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
      const s = `${(r.totalDurationMs / 1000).toFixed(0)}s`;
      const tok = r.totalTokens > 1000 ? `${(r.totalTokens / 1000).toFixed(1)}K` : `${r.totalTokens}`;
      return `${t} ${s} ${tok}`.padEnd(22);
    });
    const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0);
    const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0);
    const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0);
    const modelTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0);
    const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0;
    const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0;
    const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0;
    const tokStr = totalTokens > 1000 ? `${(totalTokens / 1000).toFixed(1)}K` : `${totalTokens}`;
    const row = [
      model.padEnd(35),
      ...cols,
      `${totalPassed}/${totalTests}`.padEnd(8),
      tokStr.padEnd(7),
      `${(modelTime / 1000).toFixed(0)}s`.padEnd(8),
      `${avgSpeed}`.padEnd(7),
      `${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`,
    ].join(' │ ');
    console.log(row);
  }

  // Save JSON + HTML report
  writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
  const templatePath = join(__dirname, 'report-template.html');
  if (existsSync(templatePath)) {
    // Escape "<" so text such as "</script>" inside results cannot terminate
    // the embedding script element.
    const payload = JSON.stringify(results).replace(/</g, '\\u003c');
    // A replacer function inserts the payload verbatim; a string replacement
    // would expand "$&", "$'" etc. occurring in the data.
    const html = readFileSync(templatePath, 'utf-8').replace('/*DATA_PLACEHOLDER*/[]', () => payload);
    writeFileSync(`${OUTPUT_DIR}/report.html`, html);
    console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`);
  }
  console.log(`JSON: ${OUTPUT_DIR}/results.json`);

  // Summary
  const passed = results.filter((r) => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
  const partial = results.filter((r) => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
  const failed = results.filter((r) => r.error || r.testsTotal === 0);
  const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
  const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0);
  console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime / 1000 / 60).toFixed(1)} min`);
}
|
||
|
||
// Run the benchmark; any unhandled failure is logged and ends the process with exit status 1.
main().catch(e => { console.error(e); process.exit(1); });
|