#!/usr/bin/env node
/**
 * Kipinä CodeBench — LLM code-generation benchmark
 *
 * Generates FastAPI projects with Ollama models and tests them with pytest in a Docker container.
 *
 * Usage:
 *   node benchmark.mjs                                 # all models, default scenario
 *   node benchmark.mjs --models qwen3-coder:30b        # a single model
 *   node benchmark.mjs --ollama http://host:11434      # a different Ollama host
 *   node benchmark.mjs --scenarios all                 # all scenarios
 *   node benchmark.mjs --output ./results/run-001      # custom output directory
 */

import { execSync } from 'child_process';
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs';
import { dirname, join, resolve, sep } from 'path';
import { fileURLToPath } from 'url';

// Directory containing this script; prompts, golden examples and archived
// results are resolved against it (ES modules have no built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));

// === CLI arguments ===
// Raw command-line tokens with the node binary and script path removed.
const args = process.argv.slice(2);
/**
 * Reads the value that follows `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} fallback - Returned when the option is absent or has no value.
 * @returns {string} The option value, or `fallback`.
 */
function arg(name, fallback) {
  const i = args.indexOf(`--${name}`);
  if (i < 0) return fallback;
  const value = args[i + 1];
  // A following `--flag` is the next option, not this option's value
  // (e.g. `--models --think` must not select a model called "--think").
  if (!value || value.startsWith('--')) return fallback;
  return value;
}
// Ollama base URL: --ollama wins, then $OLLAMA_URL, then the local default.
const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434');
// Optional hub base URL (--hub); when non-empty, chat and model-list requests use the hub instead.
const HUB_URL = arg('hub', '');
// Comma-separated model-name filter (--models); empty means every listed model.
const FILTER_MODELS = arg('models', '');
// Scenario selection (--scenarios); 'all' runs every scenario.
const SCENARIO_FILTER = arg('scenarios', 'default');
// Run timestamp at minute precision with ':' and '.' replaced, e.g. 2025-01-31T12-34.
const TIMESTAMP = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 16);
// Working directory for generated projects and reports (--output).
const OUTPUT_DIR = arg('output', `/tmp/kipina-benchmark/${TIMESTAMP}`);
// Archive directory next to this script; each run's JSON/HTML is copied here.
const RESULTS_DIR = join(__dirname, 'results');
const THINK_MODE = args.includes('--think');
const COMPACT_MODE = args.includes('--compact');
const LANG = arg('lang', 'python'); // python | rust
// Number of repetitions (intended range 1-10). A non-numeric or non-positive
// value falls back to 1 instead of producing NaN/0, which would skip every run.
const ROUNDS = Math.max(1, Number.parseInt(arg('rounds', '1'), 10) || 1);
// Upper bound on validate-and-fix iterations per generated project.
const MAX_FIX_ROUNDS = 2;
// === Prompt loading from files ===
/**
 * Loads a prompt from `prompts/<name>.md` next to this script.
 *
 * @param {string} name - Prompt file name without the `.md` extension.
 * @returns {string} The file content with surrounding whitespace trimmed.
 * @throws {Error} When the prompt file does not exist.
 */
function loadPrompt(name) {
  const path = join(__dirname, 'prompts', `${name}.md`);
  if (!existsSync(path)) throw new Error(`Prompti puuttuu: ${path}`);
  return readFileSync(path, 'utf-8').trim();
}
// System prompts for the pipeline stages, read once at startup.
const CLIENT_SYSTEM = loadPrompt('client');
const SPEC_SYSTEM = loadPrompt('spec');
// The code-generation prompt depends on the target language.
const CODE_SYSTEM = loadPrompt(LANG === 'rust' ? 'code-rs' : 'code');
const FIX_SYSTEM = loadPrompt('fix');
// === Golden example loading (per language) ===
const GOLDEN_DIR = join(__dirname, 'golden-examples');

// Per-language settings:
//   goldenDir   - sub-directory of GOLDEN_DIR holding the reference project
//   files       - reference files included in the code-generation prompt
//   required    - files the model must produce for the run to continue
//   dockerImage - image used to run the generated project's tests
const LANG_CONFIG = {
  python: {
    goldenDir: 'todo',
    files: ['models.py', 'schemas.py', 'main.py', 'test_main.py'],
    required: ['models.py', 'schemas.py', 'main.py', 'test_main.py'],
    dockerImage: 'kipina-pytest',
  },
  rust: {
    goldenDir: 'todo-rs',
    files: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'],
    required: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'],
    dockerImage: 'kipina-cargo-test',
  },
};
// Unknown languages fall back to the Python configuration. The own-property
// check keeps inherited keys (e.g. --lang constructor) from being treated as a config.
const LCONF = Object.hasOwn(LANG_CONFIG, LANG) ? LANG_CONFIG[LANG] : LANG_CONFIG.python;
/**
 * Builds the reference-implementation text that is prepended to the
 * code-generation prompt.
 *
 * With --compact, the condensed template `prompts/golden-compact-{rs,py}.md`
 * is used when it exists; otherwise (or when the template is missing) the
 * full golden project files listed in LCONF.files are concatenated.
 *
 * @returns {string} The example text, or '' when the golden directory is missing.
 */
function loadGoldenExample() {
  // --compact: use the condensed template instead of the full code
  if (COMPACT_MODE) {
    const compactFile = LANG === 'rust' ? 'golden-compact-rs.md' : 'golden-compact-py.md';
    const compactPath = join(__dirname, 'prompts', compactFile);
    if (existsSync(compactPath)) return '\n' + readFileSync(compactPath, 'utf-8').trim() + '\n';
  }

  // Full golden example
  const todoDir = join(GOLDEN_DIR, LCONF.goldenDir);
  if (!existsSync(todoDir)) return '';

  let example = `\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n`;
  for (const f of LCONF.files) {
    const path = join(todoDir, f);
    // Files missing from the golden project are skipped silently.
    if (existsSync(path)) example += `=== ${f} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`;
  }
  return example;
}
// Reference material placed at the start of every code-generation prompt; built once at startup.
const GOLDEN_EXAMPLE = loadGoldenExample();
// === Stripping reasoning tags (gemma4, qwen3/3.5, etc.) ===
/**
 * Removes inline reasoning sections from a model reply.
 *
 * @param {string} text - Raw model output.
 * @returns {string} The text without `<|channel>thought…<channel|>` and
 *   `<think>…</think>` sections, trimmed.
 */
function stripThinking(text) {
  return text
    .replace(/<\|channel>thought[\s\S]*?<channel\|>/g, '') // gemma4
    .replace(/<think>[\s\S]*?<\/think>/g, '') // qwen3, qwen3.5
    .trim();
}
// === Ollama / hub client ===
/**
 * Sends one prompt to a model and returns the reply with timing statistics.
 *
 * When HUB_URL is set the request goes to the hub's completion endpoint;
 * otherwise it goes to Ollama's `/api/chat` (non-streaming).
 *
 * @param {string} model - Model name.
 * @param {string} prompt - User message.
 * @param {string} systemPrompt - System message; omitted from the Ollama request when empty.
 * @param {number} [maxTokens=2048] - Generation limit; tripled on the Ollama route when THINK_MODE is on.
 * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
 * @throws {Error} When the server answers with a non-2xx status.
 */
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
  const start = Date.now();

  if (HUB_URL) {
    const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
    const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
    });
    if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
    const data = await resp.json();
    const elapsed = Date.now() - start;
    const tokens = data.tokens_generated || 0;
    // Fallback rate from wall-clock time; guarded so a 0 ms round trip cannot yield NaN/Infinity.
    const measuredRate = elapsed > 0 ? tokens / (elapsed / 1000) : 0;
    return {
      text: stripThinking((data.response || '').trim()),
      tokens,
      durationMs: elapsed,
      tokPerSec: data.tokens_per_sec || measuredRate,
    };
  }

  // Direct Ollama route
  const messages = [];
  if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
  messages.push({ role: 'user', content: prompt });

  const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      messages,
      stream: false,
      think: THINK_MODE,
      options: { num_predict: THINK_MODE ? maxTokens * 3 : maxTokens, num_ctx: 16384, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
    }),
  });
  if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
  const data = await resp.json();
  const elapsed = Date.now() - start;

  const rawContent = (data.message?.content || '').trim();
  const thinking = (data.message?.thinking || '').trim();
  // When the model produced only reasoning, that reasoning is used as the reply text.
  const text = stripThinking(rawContent || thinking);
  if (!rawContent && thinking) console.log(` ⚠ thinking-malli: ${thinking.length} merkkiä ajattelua, content tyhjä`);

  const evalCount = data.eval_count || 0;
  const evalDurationNs = data.eval_duration || 0;
  // eval_duration is in nanoseconds; without it no meaningful rate exists, so
  // report 0 rather than dividing by a placeholder duration.
  const tokPerSec = evalDurationNs > 0 ? evalCount / (evalDurationNs / 1e9) : 0;
  return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
}
/**
 * Lists the model names offered by the hub (when HUB_URL is set) or by Ollama.
 *
 * @returns {Promise<string[]>} Model names; empty when the response has no `models` field.
 * @throws {Error} When the server answers with a non-2xx status.
 */
async function ollamaListModels() {
  const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
  const data = await resp.json();
  return (data.models || []).map(m => m.name);
}
// === Test result parsing (pytest + cargo test) ===
/**
 * Extracts pass/fail counts from pytest or `cargo test` output.
 *
 * @param {string} output - Combined stdout/stderr of the test container.
 * @returns {{testsPassed: number, testsFailed: number, testsTotal: number}}
 *   All zeros when no recognisable summary is found.
 */
function parseTestOutput(output) {
  // Cargo test: "test result: ok. 10 passed; 0 failed;"
  // Checked first because these lines also match the pytest patterns below.
  // Cargo prints one such line per test binary (unit tests, integration tests,
  // doc-tests), so the counts are summed; using only the first line would
  // report an empty "0 passed; 0 failed" unit-test binary and hide the rest.
  const cargoResults = [...output.matchAll(/test result: \w+\.\s*(\d+) passed;\s*(\d+) failed/g)];
  if (cargoResults.length > 0) {
    let passed = 0;
    let failed = 0;
    for (const [, p, f] of cargoResults) {
      passed += Number.parseInt(p, 10);
      failed += Number.parseInt(f, 10);
    }
    return { testsPassed: passed, testsFailed: failed, testsTotal: passed + failed };
  }

  // Pytest: "6 passed", "2 failed", "1 error"
  const pyPassed = output.match(/(\d+) passed/);
  const pyFailed = output.match(/(\d+) failed/);
  const pyError = output.match(/(\d+) error/);
  // NOTE(review): a pytest run that reports only errors (no passed/failed
  // count) is not counted here and ends up as zero tests — confirm this is intended.
  if (pyPassed || pyFailed) {
    const passed = pyPassed ? Number.parseInt(pyPassed[1], 10) : 0;
    const failed = (pyFailed ? Number.parseInt(pyFailed[1], 10) : 0) + (pyError ? Number.parseInt(pyError[1], 10) : 0);
    return { testsPassed: passed, testsFailed: failed, testsTotal: passed + failed };
  }

  // Cargo compilation error: each "error[E…]" diagnostic counts as one failed test
  const compileErrors = (output.match(/error\[E\d+\]/g) || []).length;
  if (compileErrors > 0) {
    return { testsPassed: 0, testsFailed: compileErrors, testsTotal: compileErrors };
  }
  return { testsPassed: 0, testsFailed: 0, testsTotal: 0 };
}
// === File parser for LLM responses ===
/**
 * Removes Markdown code fencing from one file section of a model reply.
 *
 * When the section contains a complete fenced block, only the fenced content
 * is kept, so explanatory prose before or after the fence does not end up in
 * the file. Otherwise any stray fence-only lines (e.g. the opening fence of a
 * truncated reply) are dropped. Fences must start at the beginning of a line,
 * which leaves fences inside doc comments untouched.
 *
 * @param {string} content - Text between two `=== name ===` markers.
 * @returns {string} The file content, trimmed.
 */
function stripCodeFence(content) {
  const fenced = content.match(/^```[^\n`]*\n([\s\S]*?)\n?^```[ \t\r]*$/m);
  if (fenced) return fenced[1].trim();
  return content.replace(/^```[^\n`]*$/gm, '').trim();
}

/**
 * Splits a model reply into files using `=== <path> ===` markers.
 *
 * Only paths ending in `.py`, `.toml` or `.rs` are recognised. Text before the
 * first marker is ignored and sections that are empty after clean-up are skipped.
 *
 * @param {string} text - Full model reply.
 * @returns {Object<string, string>} Map from file path to content (newline-terminated).
 */
function parseGeneratedFiles(text) {
  const files = {};
  // split() with a capture group yields [preamble, name1, body1, name2, body2, …]
  const sections = text.split(/===\s*(\S+\.(?:py|toml|rs))\s*===/);
  for (let i = 1; i < sections.length - 1; i += 2) {
    const name = sections[i];
    const content = stripCodeFence(sections[i + 1]);
    if (content) files[name] = content + '\n';
  }
  return files;
}
// === Validator ===
/**
 * Extracts the imported names from the part of a `from x import …` line that
 * follows `import`: trailing comments, parentheses and line-continuation
 * backslashes are removed, `as` aliases are reduced to the original name and
 * wildcard imports are ignored.
 */
function parseImportedNames(clause) {
  return clause
    .replace(/#.*$/, '')
    .replace(/[()\\]/g, ' ')
    .split(',')
    .map(part => part.trim().split(/\s+as\s+/)[0].trim())
    .filter(name => name && name !== '*');
}

/**
 * Tells whether `name` occurs in `source` as a whole identifier, so that
 * importing `Todo` is not satisfied by a module that only defines `TodoCreate`.
 * Names that are not plain identifiers fall back to a substring check.
 */
function moduleMentionsName(source, name) {
  if (!/^[A-Za-z_]\w*$/.test(name)) return source.includes(name);
  return new RegExp(`(?<!\\w)${name}(?!\\w)`).test(source);
}

/**
 * Static sanity checks on the generated Python files.
 *
 * Reports relative imports, imports from models/schemas/main whose source file
 * or imported name is missing, `date`/`datetime` annotations in schemas.py
 * without a `from datetime import`, and lowercase `true`/`false` literals.
 *
 * @param {Object<string, string>} files - Map from file name to file content.
 * @returns {string[]} Issue messages, each starting with `ISSUE: <file>:`.
 */
function validateProjectCode(files) {
  const issues = [];
  for (const [fname, code] of Object.entries(files)) {
    if (!fname.endsWith('.py')) continue;
    const lines = code.split('\n');

    // Relative imports (the generated project is a flat directory of modules)
    for (const line of lines) {
      if (/^from\s+\.(\w*)\s+import/.test(line)) issues.push(`ISSUE: ${fname}: relatiivinen import`);
    }

    // Cross-file imports must refer to an existing file and a name present in it
    for (const line of lines) {
      const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
      if (!m) continue;
      const srcCode = files[m[1] + '.py'];
      if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
      for (const name of parseImportedNames(m[2])) {
        if (!moduleMentionsName(srcCode, name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
      }
    }

    if (fname === 'schemas.py') {
      if (/:\s*date\b/.test(code) && !/from datetime import/.test(code))
        issues.push('ISSUE: schemas.py: date-import puuttuu');
      if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code))
        issues.push('ISSUE: schemas.py: datetime-import puuttuu');
    }

    // JSON/JavaScript-style booleans; comment-only and blank lines are skipped
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue;
      if (/(?<!["\w])false(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "false" → "False"`);
      if (/(?<!["\w])true(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "true" → "True"`);
    }
  }
  return issues;
}
/**
 * Finds the index of the `}` that closes the object opening at `start`.
 * Braces inside double-quoted strings are ignored.
 *
 * @returns {number} Index of the closing brace, or -1 when the object is not closed.
 */
function findJsonObjectEnd(text, start) {
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === '{') depth++;
    else if (ch === '}') {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/**
 * Extracts the first parseable JSON object from a model reply.
 *
 * If the reply contains a fenced code block, only its content is searched.
 * Each `{` is tried in turn as the start of an object, so stray braces in
 * surrounding prose and braces inside string values do not prevent a match.
 *
 * @param {string} text - Model reply.
 * @returns {object|null} The parsed object, or null when none is found.
 */
function extractJson(text) {
  const fenced = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
  const source = fenced ? fenced[1].trim() : text;

  let from = source.indexOf('{');
  while (from !== -1) {
    const end = findJsonObjectEnd(source, from);
    if (end !== -1) {
      try {
        return JSON.parse(source.slice(from, end + 1));
      } catch {
        // Not valid JSON from this brace; try the next candidate.
      }
    }
    from = source.indexOf('{', from + 1);
  }
  return null;
}
// === Test scenarios ===
// Each scenario is a short project description (in Finnish) that is sent to the
// model as the initial request. The first entry is the default scenario.
const SCENARIOS = [
  { id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
  { id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
  { id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
];
// === Scoring (0–100) and star rating ===
/**
 * Computes the score of one pipeline run.
 *
 * Breakdown: 10 points for a valid spec, 10 for reaching the test stage,
 * up to 60 in proportion to passed tests, and a 20-point bonus that shrinks
 * by 10 for every fix round used.
 *
 * @param {{error: ?string, specOk: boolean, testsTotal: number, testsPassed: number, fixRounds: number}} r
 * @returns {number} Integer score between 0 and 100.
 */
function scoreResult(r) {
  // A run that failed before producing any test result scores nothing.
  if (r.error && r.testsTotal === 0) return 0;
  let score = 0;
  if (r.specOk) score += 10;
  if (!r.error || r.testsTotal > 0) score += 10;
  if (r.testsTotal > 0) score += Math.round((r.testsPassed / r.testsTotal) * 60);
  score += Math.max(0, 20 - r.fixRounds * 10);
  return Math.min(100, score);
}
/**
 * Maps a 0–100 score to a five-star rating string.
 *
 * @param {number} score
 * @returns {string} Five characters of filled/empty stars.
 */
function starsForScore(score) {
  if (score >= 90) return '★★★★★';
  if (score >= 70) return '★★★★☆';
  if (score >= 50) return '★★★☆☆';
  if (score >= 25) return '★★☆☆☆';
  if (score > 0) return '★☆☆☆☆';
  return '☆☆☆☆☆';
}
// === Pipeline: one model and one scenario ===
/**
 * Runs the whole benchmark pipeline for one model and one scenario:
 * requirements → JSON spec → code generation → validation/fix loop (Python
 * only) → tests in a Docker container.
 *
 * Intermediate artefacts (`_requirements.txt`, `_spec.json`, `_code_raw.txt`,
 * `_testout.txt`, the generated files) are written to a per-run directory
 * under OUTPUT_DIR. Failures are recorded in `result.error` rather than thrown.
 *
 * @param {string} model - Model name.
 * @param {{id: string, prompt: string}} scenario - Scenario to run.
 * @returns {Promise<object>} Result record with stage flags, test counts,
 *   timing/token totals, score and stars; `error` is null on success.
 */
async function runPipeline(model, scenario) {
  const result = {
    model, scenario: scenario.id,
    reqOk: false, specOk: false, specEntities: 0,
    validationIssues: 0, fixRounds: 0,
    testsTotal: 0, testsPassed: 0, testsFailed: 0,
    totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
    promptChars: 0, promptTokensEst: 0,
    score: 0, stars: '',
    error: null,
  };
  const timings = [];
  // Absolute path: the directory is bind-mounted into Docker below, and
  // --output may be given as a relative path.
  const dir = resolve(OUTPUT_DIR, `${model.replace(/[/:]/g, '_')}__${scenario.id}`);
  // Start from an empty directory so files left behind by an earlier round of
  // the same model/scenario cannot leak into this run's tests.
  rmSync(dir, { recursive: true, force: true });
  mkdirSync(dir, { recursive: true });

  try {
    // 1. Requirements
    console.log(` [1/5] Vaatimukset...`);
    const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 2048);
    timings.push(req);
    if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
    result.reqOk = true;
    writeFileSync(`${dir}/_requirements.txt`, req.text);

    // 2. JSON spec
    console.log(` [2/5] JSON-speksi...`);
    const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 4096);
    timings.push(specResp);
    const spec = extractJson(specResp.text);
    if (!spec || !spec.entities || spec.entities.length === 0) {
      result.error = 'JSON-speksi epäonnistui';
      writeFileSync(`${dir}/_spec_raw.txt`, specResp.text);
      return result;
    }
    result.specOk = true;
    result.specEntities = spec.entities.length;
    writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));

    // 3. LLM code generation
    console.log(` [3/5] Koodigenerointi (LLM)...`);
    const fileCount = LCONF.required.length;
    const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`;
    result.promptChars = CODE_SYSTEM.length + codePrompt.length;
    // Rough estimate: about four characters per token.
    result.promptTokensEst = Math.round(result.promptChars / 4);
    const codeTokens = LANG === 'rust' ? 12288 : 8192;
    const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, codeTokens);
    timings.push(codeResp);
    writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
    const files = parseGeneratedFiles(codeResp.text);
    const missing = LCONF.required.filter(f => !files[f]);
    if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; }

    // 4. Validation + fix loop (Python-specific)
    let fixRound = 0;
    if (LANG === 'python') {
      let issues = validateProjectCode(files);
      while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
        fixRound++;
        console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
        // Group the issues by the file named in the "ISSUE: <file>:" prefix
        const issuesByFile = {};
        for (const issue of issues) {
          const m = issue.match(/^ISSUE:\s*(\S+?):/);
          const fname = m ? m[1] : 'unknown';
          if (!issuesByFile[fname]) issuesByFile[fname] = [];
          issuesByFile[fname].push(issue);
        }
        // Ask the model to rewrite each affected file
        for (const [fname, fIssues] of Object.entries(issuesByFile)) {
          if (!files[fname]) continue;
          const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
          const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
          timings.push(fixResp);
          if (fixResp.text) {
            files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
          }
        }
        issues = validateProjectCode(files);
      }
      result.validationIssues = issues.length;
    }
    result.fixRounds = fixRound;

    // Write the generated files (creating src/ and tests/ sub-directories as needed)
    for (const [fn, content] of Object.entries(files)) {
      const filePath = resolve(dir, fn);
      // File names come from model output: never write outside the run directory.
      if (!filePath.startsWith(dir + sep)) {
        console.log(` ⚠ ohitetaan hakemiston ulkopuolinen tiedosto: ${fn}`);
        continue;
      }
      mkdirSync(dirname(filePath), { recursive: true });
      writeFileSync(filePath, content);
    }

    // 5. Tests in a Docker container
    const testLabel = LANG === 'rust' ? 'Cargo test (Docker)' : 'Pytest (Docker)';
    console.log(` [5/5] ${testLabel}...`);
    const dockerTimeout = LANG === 'rust' ? 300000 : 120000;
    try {
      const testOut = execSync(
        `docker run --rm -v "${dir}:/src:ro" ${LCONF.dockerImage} 2>&1`,
        { timeout: dockerTimeout, encoding: 'utf-8' }
      );
      writeFileSync(`${dir}/_testout.txt`, testOut);
      Object.assign(result, parseTestOutput(testOut));
    } catch (e) {
      // A non-zero exit (failing tests) or a timeout ends up here; the captured
      // output is still parsed so partial results are kept.
      const output = e.stdout || e.stderr || e.message || '';
      writeFileSync(`${dir}/_testout.txt`, output);
      Object.assign(result, parseTestOutput(output));
      if (result.testsTotal === 0) result.error = 'Testit kaatuivat';
    }
  } catch (e) {
    result.error = e.message;
  } finally {
    // Summary — in `finally` so that runs which stop early still report the
    // time and tokens they consumed and get a score and star rating.
    result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
    result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
    result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
    result.score = scoreResult(result);
    result.stars = starsForScore(result.score);
  }

  return result;
}
// === Main ===
/**
 * CLI entry point: lists and filters the models, runs every selected
 * model × scenario combination for ROUNDS rounds, prints result tables and
 * writes `results.json` (plus `report.html` when the template exists) to
 * OUTPUT_DIR and a timestamped copy to RESULTS_DIR.
 *
 * Exits the process with status 1 when the model list cannot be fetched or is empty.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Median of a list of scores; for an even count the mean of the two middle
  // values is rounded to an integer.
  const median = (arr) => {
    const s = [...arr].sort((a, b) => a - b);
    const m = Math.floor(s.length / 2);
    return s.length % 2 ? s[m] : Math.round((s[m - 1] + s[m]) / 2);
  };

  console.log('╔══════════════════════════════════════════════╗');
  console.log('║ Kipinä CodeBench ║');
  console.log('╚══════════════════════════════════════════════╝');
  console.log(`Ollama: ${OLLAMA_URL} 📝 ${LANG}${COMPACT_MODE ? ' (compact)' : ''}${THINK_MODE ? ' 🧠 thinking ON' : ''}`);

  // Fetch the model list
  let models;
  try {
    models = await ollamaListModels();
  } catch (e) {
    // Report the endpoint that was actually queried (the hub, when configured).
    console.error(`Ei yhteyttä Ollamaan (${HUB_URL || OLLAMA_URL}): ${e.message}`);
    process.exit(1);
  }

  if (FILTER_MODELS) {
    const filter = FILTER_MODELS.split(',').map(s => s.trim());
    models = models.filter(m => filter.some(f => m.includes(f)));
  }

  // Nothing to benchmark: stop before the output directory is wiped and an
  // empty result set is archived.
  if (models.length === 0) {
    console.error(`Ei yhtään mallia ajettavaksi${FILTER_MODELS ? ` (suodatin: ${FILTER_MODELS})` : ''}`);
    process.exit(1);
  }

  console.log(`Mallit (${models.length}): ${models.join(', ')}`);

  // 'all' selects every scenario; a comma-separated list of scenario ids
  // selects those; anything else (including the default) runs the first one.
  let scenarios;
  if (SCENARIO_FILTER === 'all') {
    scenarios = SCENARIOS;
  } else {
    const wanted = SCENARIO_FILTER.split(',').map(s => s.trim());
    const picked = SCENARIOS.filter(s => wanted.includes(s.id));
    scenarios = picked.length > 0 ? picked : [SCENARIOS[0]];
  }
  console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
  if (ROUNDS > 1) console.log(`Toistoja: ${ROUNDS}`);
  console.log(`Tulokset: ${OUTPUT_DIR}/`);
  console.log('');

  // Clean the output directory.
  // NOTE(review): this recursively deletes whatever --output points at;
  // consider refusing paths that are not dedicated benchmark directories.
  rmSync(OUTPUT_DIR, { recursive: true, force: true });
  mkdirSync(OUTPUT_DIR, { recursive: true });

  const results = [];

  for (let round = 1; round <= ROUNDS; round++) {
    if (ROUNDS > 1) console.log(`\n╔═══ Kierros ${round}/${ROUNDS} ═══╗`);
    for (const model of models) {
      for (const scenario of scenarios) {
        const roundLabel = ROUNDS > 1 ? ` [${round}/${ROUNDS}]` : '';
        console.log(`\n━━━ ${model} × ${scenario.id}${roundLabel} ━━━`);
        const r = await runPipeline(model, scenario);
        if (ROUNDS > 1) r.round = round;
        results.push(r);

        const status = r.error ? `✗ ${r.error}` :
          r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` :
          `◐ ${r.testsPassed}/${r.testsTotal}`;
        const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
        console.log(` → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
      }
    }

    // Interim report after each round
    if (ROUNDS > 1) {
      console.log(`\n┌─── Tilanne kierroksen ${round}/${ROUNDS} jälkeen ───┐`);
      for (const model of [...new Set(results.map(r => r.model))]) {
        const mrs = results.filter(r => r.model === model);
        for (const sid of scenarios.map(s => s.id)) {
          const runs = mrs.filter(r => r.scenario === sid);
          if (runs.length === 0) continue;
          const scores = runs.map(r => r.score);
          const med = median(scores);
          const last = scores[scores.length - 1];
          // Trend of the latest round compared with the one before it
          const trend = scores.length > 1 ? (last > scores[scores.length - 2] ? '▲' : last < scores[scores.length - 2] ? '▼' : '─') : '';
          console.log(`│ ${model.padEnd(28)} ${sid.padEnd(7)} ${starsForScore(med)} med:${String(med).padStart(3)}p [${scores.join(',')}] ${trend}`);
        }
      }
      console.log(`└${'─'.repeat(45)}┘`);
    }
  } // rounds

  // === Results table ===
  console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
  console.log('║ TULOKSET ║');
  console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣');

  const header = [
    'Malli'.padEnd(40),
    'Skenaario'.padEnd(10),
    'Speksi'.padEnd(8),
    'Testit'.padEnd(10),
    'Korjaus'.padEnd(8),
    'Ctx'.padEnd(7),
    'Aika'.padEnd(8),
    'tok/s'.padEnd(8),
    'Pisteet',
  ].join(' │ ');
  console.log(`║ ${header} ║`);
  console.log('╠' + '═'.repeat(header.length + 2) + '╣');

  for (const r of results) {
    const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗';
    const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
    const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
    const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
    const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
    const speed = `${r.avgTokPerSec.toFixed(0)}`;
    const row = [
      r.model.padEnd(40),
      r.scenario.padEnd(10),
      specStatus.padEnd(8),
      testStatus.padEnd(10),
      fixStatus.padEnd(8),
      ctx.padEnd(7),
      time.padEnd(8),
      speed.padEnd(8),
      `${r.stars} ${r.score}`,
    ].join(' │ ');
    console.log(`║ ${row} ║`);
  }
  console.log('╚' + '═'.repeat(header.length + 2) + '╝');

  // === Per-model summary ===
  const modelNames = [...new Set(results.map(r => r.model))];
  const scenarioIds = scenarios.map(s => s.id);

  console.log('\n');
  const mHeader = [
    'Malli'.padEnd(35),
    ...scenarioIds.map(s => s.padEnd(22)),
    'Yht.'.padEnd(8),
    'Out'.padEnd(7),
    'Aika'.padEnd(8),
    'tok/s'.padEnd(7),
    'Pisteet',
  ].join(' │ ');
  console.log(mHeader);
  console.log('─'.repeat(mHeader.length));

  for (const model of modelNames) {
    const mrs = results.filter(r => r.model === model);
    // One column per scenario; with several rounds the first run is shown.
    const cols = scenarioIds.map(sid => {
      const r = mrs.find(r => r.scenario === sid);
      if (!r) return '-'.padEnd(22);
      const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
      const s = `${(r.totalDurationMs/1000).toFixed(0)}s`;
      const tok = r.totalTokens > 1000 ? `${(r.totalTokens/1000).toFixed(1)}K` : `${r.totalTokens}`;
      return `${t} ${s} ${tok}`.padEnd(22);
    });
    const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0);
    const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0);
    const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0);
    const totalTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0);
    const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0;
    const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0;
    const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0;
    const tokStr = totalTokens > 1000 ? `${(totalTokens/1000).toFixed(1)}K` : `${totalTokens}`;
    const row = [
      model.padEnd(35),
      ...cols,
      `${totalPassed}/${totalTests}`.padEnd(8),
      tokStr.padEnd(7),
      `${(totalTime/1000).toFixed(0)}s`.padEnd(8),
      `${avgSpeed}`.padEnd(7),
      `${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`,
    ].join(' │ ');
    console.log(row);
  }

  // Save the JSON + HTML report
  const jsonData = JSON.stringify(results, null, 2);
  writeFileSync(`${OUTPUT_DIR}/results.json`, jsonData);
  const templatePath = join(__dirname, 'report-template.html');
  let htmlData = '';
  if (existsSync(templatePath)) {
    // '<' is escaped so text such as "</script>" inside a result cannot end
    // the surrounding script element. A replacer function is used because a
    // replacement *string* would interpret "$&", "$'" etc. occurring in the data.
    const embedded = JSON.stringify(results).replace(/</g, '\\u003c');
    htmlData = readFileSync(templatePath, 'utf-8').replace('/*DATA_PLACEHOLDER*/[]', () => embedded);
    writeFileSync(`${OUTPUT_DIR}/report.html`, htmlData);
    console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`);
  }
  console.log(`JSON: ${OUTPUT_DIR}/results.json`);

  // Copy into the results/ directory under the run timestamp
  mkdirSync(RESULTS_DIR, { recursive: true });
  writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.json`), jsonData);
  if (htmlData) writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.html`), htmlData);
  console.log(`Arkistoitu: results/${TIMESTAMP}.json`);

  // Overall summary
  const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
  const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
  const failed = results.filter(r => r.error || r.testsTotal === 0);
  const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
  const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0);
  console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`);

  // === Round summary (when rounds > 1) ===
  if (ROUNDS > 1) {
    console.log('\n\n╔══════════════════════════════════════════════╗');
    console.log('║ KIERROSYHTEENVETO (mediaani) ║');
    console.log('╚══════════════════════════════════════════════╝\n');

    for (const model of modelNames) {
      const mrs = results.filter(r => r.model === model);
      for (const sid of scenarioIds) {
        const runs = mrs.filter(r => r.scenario === sid);
        if (runs.length === 0) continue;
        const scores = runs.map(r => r.score);
        const med = median(scores);
        const min = Math.min(...scores);
        const max = Math.max(...scores);
        const passRates = runs.map(r => r.testsTotal > 0 ? Math.round(r.testsPassed/r.testsTotal*100) : 0);
        console.log(`${model.padEnd(30)} ${sid.padEnd(8)} ${starsForScore(med)} med:${med}p min:${min} max:${max} pass:[${passRates.join(',')}]%`);
      }
    }
  }
}
// Run the benchmark; any unhandled failure is printed and turns into a non-zero exit status.
main().catch(e => { console.error(e); process.exit(1); });