Files
agentic-studio/kipina-codebench/benchmark.mjs
jaakko a32c4787f8 CodeBench: plaintext-speksi pienille malleille
- spec-plain.md: "entity Author (authors): name string, email string"
- extractPlainSpec() parseri plaintext → {entities, relationships}
- Small-profiili käyttää plain-formaattia, large JSON
- specText muuttuja: plaintext tai JSON prompteihin
- Ei voi mennä syntaktisesti rikki kuten JSON
2026-04-15 00:37:34 +03:00

1013 lines
52 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* Kipinä CodeBench — LLM-koodingenerointibenchmark
*
* Generoi FastAPI-projekteja Ollama-malleilla ja testaa pytest:llä Docker-kontissa.
*
* Käyttö:
* node benchmark.mjs # kaikki mallit, oletusskenaario
* node benchmark.mjs --models qwen3-coder:30b # yksi malli
* node benchmark.mjs --ollama http://host:11434 # eri Ollama
* node benchmark.mjs --scenarios all # kaikki skenaariot
* node benchmark.mjs --output ./results/run-001 # custom output-hakemisto
*/
import { execSync } from 'child_process';
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
const __dirname = dirname(fileURLToPath(import.meta.url));
// === CLI arguments ===
// Everything after the node binary and the script path.
const args = process.argv.slice(2);

/**
 * Look up the value that follows a `--name` flag on the command line.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @param {*} fallback - Returned when the flag is absent or has no (truthy) value after it.
 * @returns {*} The token following the flag, or `fallback`.
 */
function arg(name, fallback) {
  const flagIndex = args.indexOf(`--${name}`);
  if (flagIndex < 0) return fallback;
  const value = args[flagIndex + 1];
  return value ? value : fallback;
}
// Base URL of the Ollama server: --ollama flag, then the OLLAMA_URL env var, then localhost.
const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://127.0.0.1:11434');
// When non-empty, ollamaChat() and ollamaListModels() call this hub instead of Ollama directly.
const HUB_URL = arg('hub', '');
// Raw value of --models (empty string when not given).
const FILTER_MODELS = arg('models', '');
// Raw value of --scenarios ('default' when not given).
const SCENARIO_FILTER = arg('scenarios', 'default');
// ISO timestamp with ':' and '.' replaced by '-', cut to 19 characters (second precision).
const TIMESTAMP = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
// Directory for per-run artifacts; defaults to a timestamped folder under /tmp.
const OUTPUT_DIR = arg('output', `/tmp/kipina-benchmark/${TIMESTAMP}`);
// 'results' directory next to this script.
const RESULTS_DIR = join(__dirname, 'results');
// --think: sent as the `think` field to Ollama and triples num_predict (see ollamaChat).
const THINK_MODE = args.includes('--think');
// --compact: loadGoldenExample() prefers the compact golden template when it exists.
const COMPACT_MODE = args.includes('--compact');
// --no-orchestrate: disables the entity-by-entity generation path in runPipeline.
const NO_ORCHESTRATE = args.includes('--no-orchestrate');
// --file-by-file: generate one file at a time for languages whose config defines `fileByFile`.
const FILE_BY_FILE = args.includes('--file-by-file');
const SPEC_MODEL = arg('spec-model', ''); // Different model for the spec phases (1-2)
const SPEC_OLLAMA = arg('spec-ollama', ''); // Different Ollama for the spec model
const CONVERT_MODEL = arg('convert-model', ''); // Model for the Python→Go/Rust conversion
const CONVERT_OLLAMA = arg('convert-ollama', ''); // Different Ollama for the conversion model
const LANG = arg('lang', 'python'); // python | rust | go
// NOTE(review): the 1-10 range is not enforced here, and a non-numeric value yields NaN.
const ROUNDS = parseInt(arg('rounds', '1')); // 1-10 repetitions
// Upper bound for the cargo-check and Python validation fix loops in runPipeline.
const MAX_FIX_ROUNDS = 2;
// === Promptien lataus tiedostoista ===
/**
 * Read a prompt template from the `prompts` directory next to this script.
 *
 * @param {string} name - Prompt file name without the `.md` extension.
 * @returns {string} Trimmed file contents.
 * @throws {Error} When the prompt file does not exist.
 */
function loadPrompt(name) {
  const promptFile = join(__dirname, 'prompts', `${name}.md`);
  if (existsSync(promptFile)) {
    return readFileSync(promptFile, 'utf-8').trim();
  }
  throw new Error(`Prompti puuttuu: ${promptFile}`);
}
// System prompts, loaded once at startup. loadPrompt() throws if client/spec/fix are missing.
const CLIENT_SYSTEM = loadPrompt('client');
const SPEC_SYSTEM = loadPrompt('spec');
// Optional variants: spec-simple falls back to spec, spec-plain falls back to spec-simple.
const SPEC_SIMPLE_SYSTEM = existsSync(join(__dirname, 'prompts', 'spec-simple.md')) ? loadPrompt('spec-simple') : SPEC_SYSTEM;
const SPEC_PLAIN_SYSTEM = existsSync(join(__dirname, 'prompts', 'spec-plain.md')) ? loadPrompt('spec-plain') : SPEC_SIMPLE_SYSTEM;
const FIX_SYSTEM = loadPrompt('fix');
// === Model-specific profiles ===
// Parsed profiles.json; the code below reads `.models`, `.profiles` and `.default_profile` from it.
const PROFILES = JSON.parse(readFileSync(join(__dirname, 'profiles.json'), 'utf-8'));
/**
 * Pick the golden-example markdown file for a model.
 *
 * Starts from the model's `golden` entry in PROFILES (default `todo.md`). For Rust and Go
 * the language-specific variant (`todo.md` → `todo-rs.md` / `todo-go.md`) is preferred
 * when that file exists in GOLDEN_DIR.
 *
 * @param {string} model - Model name used as key into `PROFILES.models`.
 * @returns {string} File name relative to GOLDEN_DIR.
 */
function getGoldenForModel(model) {
  const baseFile = PROFILES.models[model]?.golden || 'todo.md';
  const langSuffix = { rust: '-rs', go: '-go' }[LANG];
  if (!langSuffix) return baseFile;

  const variantFile = baseFile.replace(/\.md$/, `${langSuffix}.md`);
  return existsSync(join(GOLDEN_DIR, variantFile)) ? variantFile : baseFile;
}
/**
 * Resolve the code-generation system prompt for a model.
 *
 * The prompt name comes from the model entry, then from the model's profile, then `code`.
 * The language suffix takes priority: code-small-go > code-go > code-small > code.
 *
 * @param {string} model - Model name used as key into `PROFILES.models`.
 * @returns {{system: string, promptName: string, profile: string}} Prompt text, the name of
 *   the prompt file that was used, and the profile name.
 */
function getCodePromptForModel(model) {
  const conf = PROFILES.models[model];
  const profile = conf?.profile || PROFILES.default_profile;
  const promptName = conf?.prompt || PROFILES.profiles[profile]?.prompt || 'code';
  const suffix = { rust: '-rs', go: '-go' }[LANG] || '';
  const promptFileFor = (name) => join(__dirname, 'prompts', `${name}.md`);

  const candidates = [`${promptName}${suffix}`, `code${suffix}`, promptName, 'code'].filter(Boolean);
  const chosen = candidates.find((name) => existsSync(promptFileFor(name)));
  if (chosen === undefined) {
    // No candidate file exists; loadPrompt reports the missing default prompt.
    return { system: loadPrompt('code'), promptName: 'code', profile: 'large' };
  }
  return { system: readFileSync(promptFileFor(chosen), 'utf-8').trim(), promptName: chosen, profile };
}
// === Loading golden examples (by language) ===
// Directory holding the golden (reference) examples, next to this script.
const GOLDEN_DIR = join(__dirname, 'golden-examples');
// Per-language settings:
//   goldenDir   - subdirectory of GOLDEN_DIR used by loadGoldenExample()'s per-file fallback
//   files       - reference files included, in order, by that fallback
//   required    - files a generated project must contain (checked in runPipeline)
//   dockerImage - image name used in the `docker run` commands
const LANG_CONFIG = {
python: {
goldenDir: 'todo',
files: ['models.py', 'schemas.py', 'main.py', 'test_main.py'],
required: ['models.py', 'schemas.py', 'main.py', 'test_main.py'],
dockerImage: 'kipina-pytest',
},
rust: {
goldenDir: 'todo-rs',
files: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'],
required: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'],
dockerImage: 'kipina-cargo-test',
},
go: {
goldenDir: 'todo-go',
files: ['go.mod', 'models.go', 'handlers.go', 'main.go', 'handlers_test.go'],
required: ['go.mod', 'models.go', 'handlers.go', 'main.go', 'handlers_test.go'],
dockerImage: 'kipina-go-test',
// File-at-a-time generation (--file-by-file): generation order and per-file descriptions
fileByFile: [
{ name: 'models.go', desc: 'Go structs for all entities + Create/Update request types. Use json tags.' },
{ name: 'handlers.go', desc: 'Chi HTTP handlers as closures taking *sql.DB. Use RETURNING in INSERT/UPDATE. sql.ErrNoRows for 404.' },
{ name: 'main.go', desc: 'Chi router setup, InitDB with CREATE TABLE, main() entry point on port 3000.' },
{ name: 'handlers_test.go', desc: 'Tests using httptest.NewServer + :memory: SQLite. setupTestServer helper. CRUD tests per entity.' },
],
},
};
// Config for the selected language; an unknown --lang value falls back to the Python config.
const LCONF = LANG_CONFIG[LANG] || LANG_CONFIG.python;
/**
 * Build the reference-implementation text that is prepended to code-generation prompts.
 *
 * Lookup order:
 *   1. with --compact, the compact template for the current language (if the file exists);
 *   2. the golden markdown file (model-specific when `model` is given);
 *   3. the individual reference files of the language's golden directory, concatenated
 *      with `=== name ===` markers.
 *
 * @param {string} [model] - Model name; when falsy the language default golden file is used.
 * @returns {string} Example text, or '' when no golden material is found.
 */
function loadGoldenExample(model) {
  const readWrapped = (file) => '\n' + readFileSync(file, 'utf-8').trim() + '\n';

  if (COMPACT_MODE) {
    const compactName = { rust: 'golden-compact-rs.md', go: 'golden-compact-go.md' }[LANG] || 'golden-compact-py.md';
    const compactFile = join(__dirname, 'prompts', compactName);
    if (existsSync(compactFile)) return readWrapped(compactFile);
  }

  // Model-specific golden example from the profile, else the per-language default.
  let goldenName;
  if (model) {
    goldenName = getGoldenForModel(model);
  } else {
    goldenName = { rust: 'todo-rs.md', go: 'todo-go.md' }[LANG] || 'todo.md';
  }
  const goldenFile = join(GOLDEN_DIR, goldenName);
  if (existsSync(goldenFile)) return readWrapped(goldenFile);

  // Fallback: assemble the example from separate files.
  const exampleDir = join(GOLDEN_DIR, LCONF.goldenDir);
  if (!existsSync(exampleDir)) return '';

  const header = `\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n`;
  const sections = LCONF.files
    .map((name) => [name, join(exampleDir, name)])
    .filter(([, file]) => existsSync(file))
    .map(([name, file]) => `=== ${name} ===\n${readFileSync(file, 'utf-8').trim()}\n\n`);
  return header + sections.join('');
}
// === Ajattelutagien siivous (gemma4, qwen3/3.5 ym.) ===
/**
 * Remove inline "thinking" sections that some models embed in their answer.
 *
 * @param {string} text - Raw model output.
 * @returns {string} Output without the thinking sections, trimmed.
 */
function stripThinking(text) {
  const thinkingSections = [
    /<\|channel>thought[\s\S]*?<channel\|>/g, // gemma4
    /<think>[\s\S]*?<\/think>/g, // qwen3, qwen3.5
  ];
  let cleaned = text;
  for (const section of thinkingSections) {
    cleaned = cleaned.replace(section, '');
  }
  return cleaned.trim();
}
// === Ollama / Hub -client ===
/**
 * Send one chat request and return the answer with basic throughput statistics.
 *
 * When HUB_URL is set the request goes to the hub's chat-completions endpoint; otherwise
 * it goes straight to the Ollama `/api/chat` endpoint (non-streaming).
 *
 * @param {string} model - Model name.
 * @param {string} prompt - User message.
 * @param {string} systemPrompt - System message; omitted from the Ollama request when falsy.
 * @param {number} [maxTokens=2048] - Generation limit (tripled on the Ollama route when THINK_MODE is on).
 * @param {?string} [ollamaUrl=null] - Overrides OLLAMA_URL on the direct Ollama route only.
 * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
 * @throws {Error} When the server answers with a non-OK HTTP status.
 */
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048, ollamaUrl = null) {
  const startedAt = Date.now();
  const jsonHeaders = { 'Content-Type': 'application/json' };

  if (HUB_URL) {
    const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
    const hubBody = { model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens };
    const hubResp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
      method: 'POST',
      headers: jsonHeaders,
      body: JSON.stringify(hubBody),
    });
    if (!hubResp.ok) {
      throw new Error(`Hub HTTP ${hubResp.status}: ${await hubResp.text()}`);
    }
    const hubData = await hubResp.json();
    const durationMs = Date.now() - startedAt;
    const tokens = hubData.tokens_generated || 0;
    return {
      text: stripThinking((hubData.response || '').trim()),
      tokens,
      durationMs,
      // Prefer the rate reported by the hub; otherwise derive it from wall-clock time.
      tokPerSec: hubData.tokens_per_sec || tokens / (durationMs / 1000),
    };
  }

  // Direct Ollama route
  const userMessage = { role: 'user', content: prompt };
  const messages = systemPrompt
    ? [{ role: 'system', content: systemPrompt }, userMessage]
    : [userMessage];
  const options = {
    num_predict: THINK_MODE ? maxTokens * 3 : maxTokens,
    num_ctx: 16384,
    temperature: 0.7,
    top_k: 40,
    repeat_penalty: 1.15,
  };
  const ollamaResp = await fetch(`${ollamaUrl || OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: jsonHeaders,
    body: JSON.stringify({ model, messages, stream: false, think: THINK_MODE, options }),
  });
  if (!ollamaResp.ok) {
    throw new Error(`Ollama HTTP ${ollamaResp.status}: ${await ollamaResp.text()}`);
  }
  const reply = await ollamaResp.json();
  const durationMs = Date.now() - startedAt;

  const content = (reply.message?.content || '').trim();
  const thinking = (reply.message?.thinking || '').trim();
  // When the content is empty, fall back to the separate thinking field.
  const text = stripThinking(content || thinking);
  const tokens = reply.eval_count || 0;
  if (!content && thinking) {
    console.log(` ⚠ thinking-malli: ${thinking.length} merkkiä ajattelua, content tyhjä`);
  }
  // eval_duration is divided by 1e9 to get seconds; default to 1 to avoid dividing by zero.
  const evalDurationNs = reply.eval_duration || 1;
  const tokPerSec = tokens / (evalDurationNs / 1e9);
  return { text, tokens, durationMs, tokPerSec };
}
/**
 * Fetch the names of the available models, from the hub when HUB_URL is set and from
 * Ollama's `/api/tags` endpoint otherwise.
 *
 * @returns {Promise<string[]>} Model names (empty when the response has no `models` list).
 * @throws {Error} When the server answers with a non-OK HTTP status.
 */
async function ollamaListModels() {
  const tagsUrl = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
  const resp = await fetch(tagsUrl);
  if (!resp.ok) {
    throw new Error(`Tags: HTTP ${resp.status}`);
  }
  const payload = await resp.json();
  const names = [];
  for (const entry of payload.models || []) {
    names.push(entry.name);
  }
  return names;
}
// === Testitulosten parsinta (pytest + cargo test) ===
/**
 * Extract pass/fail counts from the combined output of a test run.
 *
 * Formats are tried from most to least specific:
 *   1. cargo test  - every "test result: ok. 10 passed; 0 failed;" line, summed
 *   2. pytest      - "6 passed", "2 failed", "1 error" (errors count as failures)
 *   3. go test     - number of "--- PASS:" / "--- FAIL:" lines
 *   4. rustc       - number of "error[E....]" diagnostics, reported as failures
 *
 * Cargo must be checked before pytest: the pytest patterns also match cargo's summary
 * line, and cargo prints one summary per test binary, so reading only the first one
 * (often "0 passed; 0 failed" for a crate without unit tests) under-reports the run.
 *
 * @param {string} output - Captured stdout/stderr of the test command.
 * @returns {{testsPassed: number, testsFailed: number, testsTotal: number}} All zeros when
 *   nothing recognisable is found.
 */
function parseTestOutput(output) {
  const tally = (passed, failed) => ({ testsPassed: passed, testsFailed: failed, testsTotal: passed + failed });
  const firstCount = (pattern) => {
    const hit = output.match(pattern);
    return hit ? Number.parseInt(hit[1], 10) : null;
  };

  // Cargo test: "test result: ok. 10 passed; 0 failed;" - one line per test binary.
  const cargoSummaries = [...output.matchAll(/test result: \w+\.\s*(\d+) passed;\s*(\d+) failed/g)];
  if (cargoSummaries.length > 0) {
    let passed = 0;
    let failed = 0;
    for (const [, passedText, failedText] of cargoSummaries) {
      passed += Number.parseInt(passedText, 10);
      failed += Number.parseInt(failedText, 10);
    }
    return tally(passed, failed);
  }

  // Pytest: "6 passed", "2 failed", "1 error"
  const pyPassed = firstCount(/(\d+) passed/);
  const pyFailed = firstCount(/(\d+) failed/);
  if (pyPassed !== null || pyFailed !== null) {
    const pyErrors = firstCount(/(\d+) error/) ?? 0;
    return tally(pyPassed ?? 0, (pyFailed ?? 0) + pyErrors);
  }

  // Go test: "--- PASS:" / "--- FAIL:"
  const goPassed = (output.match(/--- PASS:/g) || []).length;
  const goFailed = (output.match(/--- FAIL:/g) || []).length;
  if (goPassed + goFailed > 0) {
    return tally(goPassed, goFailed);
  }

  // Compilation failure: count rustc "error[E....]" diagnostics as failed tests.
  const compileErrors = (output.match(/error\[E\d+\]/g) || []).length;
  if (compileErrors > 0) {
    return tally(0, compileErrors);
  }

  return tally(0, 0);
}
// === Tiedostoparseri LLM-vastauksesta ===
/**
 * Split an LLM answer into files using `=== path/name.ext ===` markers.
 *
 * Only names ending in .py, .toml, .rs, .go or .mod are recognised. A surrounding
 * markdown code fence is stripped from each section, and sections that end up empty are
 * dropped. When a name occurs more than once, the last section wins.
 *
 * The names come from model output and callers join them onto the output directory
 * before writing, so absolute paths and paths containing a `..` segment are ignored
 * instead of being returned.
 *
 * @param {string} text - Raw model output.
 * @returns {Object<string, string>} Map of relative file name to content (newline-terminated).
 */
function parseGeneratedFiles(text) {
  // A name is safe when it is relative and never steps out of its base directory.
  const isSafeRelativeName = (name) =>
    !/^(?:[\\/]|[A-Za-z]:)/.test(name) && !name.split(/[\\/]/).includes('..');

  const files = {};
  // split() with a capture group yields [preamble, name1, body1, name2, body2, ...].
  const sections = text.split(/===\s*(\S+\.(?:py|toml|rs|go|mod))\s*===/);
  for (let i = 1; i < sections.length - 1; i += 2) {
    const name = sections[i];
    if (!isSafeRelativeName(name)) continue;
    const content = sections[i + 1]
      .trim()
      .replace(/^```(?:python|toml|rust|go|gomod)?\s*\n?/m, '')
      .replace(/\n?```\s*$/m, '')
      .trim();
    if (content) files[name] = content + '\n';
  }
  return files;
}
// === Validaattori ===
/**
 * Run cheap static checks on the generated Python files.
 *
 * For every `.py` file, in this order:
 *   1. relative imports (`from .x import ...`);
 *   2. `from models|schemas|main import ...` where the module is missing/empty or an
 *      imported name does not occur anywhere in that module's text;
 *   3. (schemas.py only) `date` / `datetime` annotations without a `from datetime import`;
 *   4. lowercase `true` / `false` outside comments and double-quoted words.
 *
 * @param {Object<string, string>} files - Map of file name to source text.
 * @returns {string[]} Human-readable issues, each starting with "ISSUE: ".
 */
function validateProjectCode(files) {
  const relativeImport = /^from\s+\.(\w*)\s+import/;
  const localImport = /^from\s+(models|schemas|main)\s+import\s+(.+)/;
  const found = [];

  Object.entries(files)
    .filter(([fileName]) => fileName.endsWith('.py'))
    .forEach(([fileName, source]) => {
      const rows = source.split('\n');

      rows.forEach((row) => {
        if (relativeImport.test(row)) found.push(`ISSUE: ${fileName}: relatiivinen import`);
      });

      rows.forEach((row) => {
        const hit = row.match(localImport);
        if (!hit) return;
        const [, moduleName, importList] = hit;
        const moduleSource = files[moduleName + '.py'];
        if (!moduleSource) {
          found.push(`ISSUE: ${fileName}: ${moduleName}.py puuttuu`);
          return;
        }
        importList
          .split(',')
          .map((entry) => entry.trim().split(/\s+as\s+/)[0].trim())
          .filter((symbol) => symbol && !moduleSource.includes(symbol))
          .forEach((symbol) => found.push(`ISSUE: ${fileName}: "${symbol}" puuttuu ${moduleName}.py:stä`));
      });

      if (fileName === 'schemas.py') {
        const lacksDatetimeImport = !/from datetime import/.test(source);
        if (/:\s*date\b/.test(source) && lacksDatetimeImport) {
          found.push('ISSUE: schemas.py: date-import puuttuu');
        }
        if (/:\s*datetime\b/.test(source) && lacksDatetimeImport) {
          found.push('ISSUE: schemas.py: datetime-import puuttuu');
        }
      }

      rows.forEach((row, index) => {
        const isCommentOrBlank = /^\s*#/.test(row) || /^\s*$/.test(row);
        if (isCommentOrBlank) return;
        if (/(?<!["\w])false(?![\w"])/.test(row)) found.push(`ISSUE: ${fileName}:${index + 1}: "false" → "False"`);
        if (/(?<!["\w])true(?![\w"])/.test(row)) found.push(`ISSUE: ${fileName}:${index + 1}: "true" → "True"`);
      });
    });

  return found;
}
/**
 * Pull the first parseable top-level JSON object out of free-form model output.
 *
 * If the text contains a markdown code fence (optionally tagged `json`), only the fence
 * body is searched. Braces are balanced by simple counting; each balanced `{...}` span is
 * handed to JSON.parse and the first one that parses is returned.
 *
 * Unmatched closing braces are ignored. Without that, a stray `}` ahead of the real
 * object would push the depth counter below zero and the object would never be seen.
 *
 * @param {string} text - Raw model output.
 * @returns {?Object} The parsed object, or null when no span parses.
 */
function extractJson(text) {
  const fenced = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
  const candidate = fenced ? fenced[1].trim() : text;

  let depth = 0;
  let start = null;
  for (let i = 0; i < candidate.length; i++) {
    const ch = candidate[i];
    if (ch === '{') {
      if (depth === 0) start = i;
      depth++;
    } else if (ch === '}') {
      if (depth === 0) continue; // stray closing brace - nothing to close
      depth--;
      if (depth === 0 && start !== null) {
        try {
          return JSON.parse(candidate.slice(start, i + 1));
        } catch {
          // Balanced but not valid JSON (e.g. prose in braces): keep scanning.
        }
      }
    }
  }
  return null;
}
// Plaintext spec parseri: "entity Author (authors): name string, email string"
/**
 * Parse the plaintext spec format into the same shape as the JSON spec.
 *
 * Recognised lines (case-insensitive keywords):
 *   project: <name>
 *   entity Author (authors): name string, email string, owner_id int->User, status string=draft
 *
 * Fields that do not match `<name> <type>[->Target][=default]` are skipped. Every parsed
 * field is marked `nullable: false`; a `->Target` suffix also records a relationship.
 *
 * @param {string} text - Raw model output.
 * @returns {?{project_name: string, entities: Object[], relationships: Object[]}}
 *   The spec, or null when no entity line was found.
 */
function extractPlainSpec(text) {
  const entityPattern = /^entity\s+(\w+)\s*\((\w+)\):\s*(.+)/i;
  const fieldPattern = /^(\w+)\s+(string|text|int|float|bool|date|datetime)(?:->(\w+))?(?:=(.+))?$/i;

  const rows = text
    .split('\n')
    .map((row) => row.trim())
    .filter(Boolean);

  const projectRow = rows.find((row) => /^project:/i.test(row));
  const projectName = projectRow ? projectRow.replace(/^project:\s*/i, '').trim() : 'api';

  const entities = [];
  const relationships = [];
  rows.forEach((row) => {
    const entityMatch = row.match(entityPattern);
    if (!entityMatch) return;
    const name = entityMatch[1];
    const tableName = entityMatch[2];
    const fieldList = entityMatch[3];

    const fields = [];
    fieldList.split(',').forEach((chunk) => {
      const fieldMatch = chunk.trim().match(fieldPattern);
      if (!fieldMatch) return;
      const fieldName = fieldMatch[1];
      const target = fieldMatch[3];
      fields.push({
        name: fieldName,
        type: fieldMatch[2].toLowerCase(),
        nullable: false,
        default: fieldMatch[4] || null,
      });
      if (target) {
        relationships.push({ from: name, field: fieldName, to: target });
      }
    });
    entities.push({ name, table_name: tableName, fields });
  });

  return entities.length === 0
    ? null
    : { project_name: projectName, entities, relationships };
}
// === Test scenarios ===
// Each scenario has an `id` (used in output directory names) and a `prompt` that is sent
// to the model as the user message of the requirements step.
const SCENARIOS = [
{ id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
{ id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
{ id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
];
// === Pisteytys (0-100) ja tähtiluokitus ===
/**
 * Turn a pipeline result into a 0-100 score.
 *
 * Breakdown: 10 points for a usable spec, 10 for getting as far as producing code/tests,
 * up to 60 in proportion to passed tests, and 20 minus 10 per fix round (never negative).
 * A run that failed without executing any test scores 0.
 *
 * @param {{error: *, specOk: boolean, testsTotal: number, testsPassed: number, fixRounds: number}} r
 * @returns {number} Score capped at 100.
 */
function scoreResult(r) {
  const { error, specOk, testsTotal, testsPassed, fixRounds } = r;
  if (error && testsTotal === 0) return 0;

  const specPoints = specOk ? 10 : 0;
  const progressPoints = !error || testsTotal > 0 ? 10 : 0;
  const testPoints = testsTotal > 0 ? Math.round((testsPassed / testsTotal) * 60) : 0;
  const fixPoints = Math.max(0, 20 - fixRounds * 10);

  return Math.min(100, specPoints + progressPoints + testPoints + fixPoints);
}
/**
 * Map a numeric score to a five-star rating string.
 *
 * Thresholds: >=90 five stars, >=70 four, >=50 three, >=25 two, any positive score one,
 * otherwise none.
 *
 * @param {number} score - Score as produced by scoreResult().
 * @returns {string} Five characters of filled/empty stars.
 */
function starsForScore(score) {
  const tiers = [
    [90, '★★★★★'],
    [70, '★★★★☆'],
    [50, '★★★☆☆'],
    [25, '★★☆☆☆'],
  ];
  const tier = tiers.find(([minimum]) => score >= minimum);
  if (tier) return tier[1];
  return score > 0 ? '★☆☆☆☆' : '☆☆☆☆☆';
}
// === Pipeline: yhdelle mallille ja skenaariolle ===
async function runPipeline(model, scenario, round = 1) {
const result = {
model, scenario: scenario.id,
reqOk: false, specOk: false, specEntities: 0,
validationIssues: 0, fixRounds: 0,
testsTotal: 0, testsPassed: 0, testsFailed: 0,
totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
promptChars: 0, promptTokensEst: 0,
score: 0, stars: '',
error: null,
};
const timings = [];
// Konvertointi-moodissa generoidaan Python-koodia ensin, sitten konvertoidaan
const { system: CODE_SYSTEM, promptName, profile } = (() => {
if (CONVERT_MODEL) {
// Python-prompti + golden example koodigenerointiin
const pyPrompt = loadPrompt('code');
return { system: pyPrompt, promptName: 'code (→convert)', profile: PROFILES.models[model]?.profile || PROFILES.default_profile };
}
return getCodePromptForModel(model);
})();
const roundSuffix = ROUNDS > 1 ? `__r${round}` : '';
const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}${roundSuffix}`;
mkdirSync(dir, { recursive: true });
try {
// 1. Vaatimukset
const specModel = SPEC_MODEL || model;
console.log(` [1/5] Vaatimukset${SPEC_MODEL ? ` (${SPEC_MODEL})` : ''}...`);
const specUrl = SPEC_OLLAMA || null;
const req = await ollamaChat(specModel, scenario.prompt, CLIENT_SYSTEM, 2048, specUrl);
timings.push(req);
if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
result.reqOk = true;
writeFileSync(`${dir}/_requirements.txt`, req.text);
// 2. Speksi (small → plaintext, large → JSON)
const specProfile = PROFILES.models[specModel]?.profile || PROFILES.default_profile;
const usePlainSpec = specProfile === 'small';
const specPrompt = usePlainSpec ? SPEC_PLAIN_SYSTEM : SPEC_SYSTEM;
const specLabel = usePlainSpec ? 'plain' : 'JSON';
console.log(` [2/5] Speksi (${specLabel})...`);
const specMsg = usePlainSpec
? `${req.text}\n\nOutput the database schema for this project.`
: `${req.text}\n\nOutput a JSON spec for this project.`;
const specResp = await ollamaChat(specModel, specMsg, specPrompt, 4096, specUrl);
timings.push(specResp);
const spec = usePlainSpec ? extractPlainSpec(specResp.text) : extractJson(specResp.text);
if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'Speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; }
result.specOk = true;
result.specEntities = spec.entities.length;
writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));
// Spec tekstimuodossa prompteihin
const specText = usePlainSpec
? spec.entities.map(e => `entity ${e.name} (${e.table_name}): ${e.fields.map(f => `${f.name} ${f.type}${f.default ? '=' + f.default : ''}`).join(', ')}`).join('\n')
+ (spec.relationships.length > 0 ? '\nrelationships: ' + spec.relationships.map(r => `${r.from}.${r.field} -> ${r.to}`).join(', ') : '')
: JSON.stringify(spec, null, 2);
// 3. LLM-koodigenerointi
// Konvertointi-moodissa: generoi Python ensin, golden+files Pythonista
const isConvert = !!CONVERT_MODEL;
const genConfig = isConvert ? LANG_CONFIG.python : LCONF;
const fileCount = genConfig.required.length;
const goldenExample = isConvert ? ('\n' + readFileSync(join(GOLDEN_DIR, 'todo.md'), 'utf-8').trim() + '\n') : loadGoldenExample(model);
const codeTokens = isConvert ? 8192 : (LANG === 'rust' ? 12288 : LANG === 'go' ? 10240 : 8192);
let files;
// File-by-file: generoi yksi tiedosto kerrallaan, build-validointi kun kaikki valmiina
if (FILE_BY_FILE && LCONF.fileByFile) {
const fbf = LCONF.fileByFile;
const MAX_BUILD_FIX = 2;
console.log(` [3/5] Koodigenerointi (file-by-file, ${fbf.length} tiedostoa)...`);
files = {};
let context = '';
// Vaihe 1: generoi jokainen tiedosto
for (const fileDef of fbf) {
const contextBlock = context ? `\nEXISTING CODE:\n${context}\n` : '';
const filePrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nSPECIFICATION:\n${specText}\n${contextBlock}\nWrite ONLY the file "${fileDef.name}": ${fileDef.desc}\nOutput raw code, no markdown fences, no explanations. Start with "package main".`;
console.log(` [3/5] → ${fileDef.name}...`);
const fileResp = await ollamaChat(model, filePrompt, CODE_SYSTEM, 2048);
timings.push(fileResp);
let code = fileResp.text
.replace(/^```(?:go|golang)?\s*\n?/m, '').replace(/\n?```\s*$/m, '')
.replace(/^===\s*\S+\s*===\s*\n?/m, '')
.replace(/^(?:Here|Sure|Below|This|The|I )[\s\S]*?(?=package\s)/m, '')
.trim();
if (code) {
files[fileDef.name] = code + '\n';
context += `=== ${fileDef.name} ===\n${code}\n\n`;
const loc = code.split('\n').length;
console.log(` [3/5] ${fileResp.tokens} tok, ${loc} lines, ${fileResp.tokPerSec.toFixed(0)} tok/s`);
}
}
// Vaihe 2: go build -validointi + per-tiedosto korjaus
if (LANG === 'go') {
for (let buildRound = 0; buildRound < MAX_BUILD_FIX; buildRound++) {
// Kirjoita kaikki tiedostot levylle
const goldenMod = readFileSync(join(GOLDEN_DIR, 'todo-go', 'go.mod'), 'utf-8');
const modName = spec.project_name?.replace(/[^a-z0-9-]/gi, '-') || 'generated-api';
writeFileSync(join(dir, 'go.mod'), goldenMod.replace(/^module\s+\S+/m, `module ${modName}`));
for (const [fn, c] of Object.entries(files)) {
writeFileSync(join(dir, fn), c);
}
try {
execSync(
`docker run --rm --entrypoint sh -v "${dir}:/src:ro" ${LCONF.dockerImage} -c "cp -r /src/* . && go mod tidy 2>&1 && go build ./... 2>&1"`,
{ timeout: 60000, encoding: 'utf-8', stdio: 'pipe' }
);
console.log(` [3/5] ✓ kääntyy`);
break;
} catch (e) {
const allErrors = (e.stdout || e.stderr || '').split('\n').filter(l => /\.go:\d+/.test(l));
if (allErrors.length === 0) { console.log(` [3/5] ⚠ build failed`); break; }
// Ryhmittele virheet tiedostoittain
const errorsByFile = {};
for (const line of allErrors) {
const m = line.match(/\.\/(\S+\.go):\d+/);
if (m) { (errorsByFile[m[1]] = errorsByFile[m[1]] || []).push(line); }
}
const filesToFix = Object.keys(errorsByFile).filter(f => !f.endsWith('_test.go'));
if (filesToFix.length === 0) break;
console.log(` [3/5] ✗ ${allErrors.length} errors in ${filesToFix.join(', ')} → fixing`);
for (const fname of filesToFix) {
const errors = errorsByFile[fname].slice(0, 10).join('\n');
const fixPrompt = `Fix the following Go compilation errors in "${fname}". Return ONLY the corrected file, no explanations.\n\nERRORS:\n${errors}\n\nCURRENT FILE:\n${files[fname]}\n\nOTHER FILES:\n${Object.entries(files).filter(([f]) => f !== fname).map(([f, c]) => `=== ${f} ===\n${c}`).join('\n\n')}`;
console.log(` [3/5] → ${fname} (fix)...`);
const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, 2048);
timings.push(fixResp);
let fixed = fixResp.text
.replace(/^```(?:go|golang)?\s*\n?/m, '').replace(/\n?```\s*$/m, '')
.replace(/^===\s*\S+\s*===\s*\n?/m, '')
.replace(/^(?:Here|Sure|Below|This|The|I )[\s\S]*?(?=package\s)/m, '')
.trim();
if (fixed) {
files[fname] = fixed + '\n';
console.log(` [3/5] ${fixResp.tokens} tok, ${fixed.split('\n').length} lines`);
}
result.fixRounds++;
}
}
}
}
// Päivitä context lopullisilla tiedostoilla
context = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
writeFileSync(`${dir}/_code_raw.txt`, context);
result.promptChars = CODE_SYSTEM.length + (context.length || 0);
result.promptTokensEst = Math.round(result.promptChars / 4);
}
// Orkestrointi: pilko entiteetti kerrallaan
else if (spec.entities.length > 1 && !NO_ORCHESTRATE) {
console.log(` [3/5] Koodigenerointi (orkestroitu, ${spec.entities.length} entiteettiä)...`);
files = {};
let cumulativeCode = '';
for (let ei = 0; ei < spec.entities.length; ei++) {
const entity = spec.entities[ei];
const isFirst = ei === 0;
const entitySpec = {
...spec,
entities: spec.entities.slice(0, ei + 1),
relationships: (spec.relationships || []).filter(r =>
spec.entities.slice(0, ei + 1).some(e => e.name === r.from)
),
};
let entityPrompt;
if (isFirst) {
entityPrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(entitySpec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files for the entity "${entity.name}". Follow the reference implementation patterns exactly.`;
} else {
entityPrompt = `${goldenExample}\n---\n\nEXISTING CODE (do not regenerate, only add to it):\n${cumulativeCode}\n\n---\n\nJSON SPECIFICATION (add entity "${entity.name}"):\n${JSON.stringify(entitySpec, null, 2)}\n\nAdd the entity "${entity.name}" to the existing code. Return ALL ${fileCount} files with === markers, including the existing entities. Follow the same patterns.`;
}
console.log(` [3/5] → ${entity.name}${isFirst ? '' : ' (+ ' + spec.entities.slice(0, ei).map(e => e.name).join(', ') + ')'}...`);
const entityResp = await ollamaChat(model, entityPrompt, CODE_SYSTEM, codeTokens);
timings.push(entityResp);
const entityFiles = parseGeneratedFiles(entityResp.text);
// Yhdistä — uudempi korvaa edellisen
for (const [fn, content] of Object.entries(entityFiles)) {
files[fn] = content;
}
cumulativeCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
}
writeFileSync(`${dir}/_code_raw.txt`, cumulativeCode);
result.promptChars = CODE_SYSTEM.length + cumulativeCode.length;
result.promptTokensEst = Math.round(result.promptChars / 4);
} else {
// Normaali: kaikki kerralla
console.log(` [3/5] Koodigenerointi (LLM)...`);
const codePrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nSPECIFICATION:\n${specText}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`;
result.promptChars = CODE_SYSTEM.length + codePrompt.length;
result.promptTokensEst = Math.round(result.promptChars / 4);
const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, codeTokens);
timings.push(codeResp);
writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
files = parseGeneratedFiles(codeResp.text);
}
// Konvertointi: Python→Go/Rust isommalla mallilla
if (CONVERT_MODEL && files) {
const convertUrl = CONVERT_OLLAMA || null;
const convertPromptFile = `convert-${LANG}`;
const convertSystem = existsSync(join(__dirname, 'prompts', `${convertPromptFile}.md`))
? readFileSync(join(__dirname, 'prompts', `${convertPromptFile}.md`), 'utf-8').trim()
: `Convert this Python code to ${LANG}. Return all files with === markers.`;
const pyCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
console.log(` [3.5/5] Konvertointi Python→${LANG} (${CONVERT_MODEL})...`);
const convertResp = await ollamaChat(CONVERT_MODEL, pyCode, convertSystem, 12288, convertUrl);
timings.push(convertResp);
writeFileSync(`${dir}/_convert_raw.txt`, convertResp.text);
const convertedFiles = parseGeneratedFiles(convertResp.text);
if (Object.keys(convertedFiles).length > 0) {
files = convertedFiles;
const loc = Object.values(files).reduce((s, c) => s + c.split('\n').length, 0);
console.log(` [3.5/5] ${convertResp.tokens} tok, ${loc} lines, ${convertResp.tokPerSec.toFixed(0)} tok/s`);
} else {
console.log(` [3.5/5] ⚠ Konvertointi ei tuottanut tiedostoja`);
}
}
// Go: generoi go.mod golden examplen versiolla (ennen missing-tarkistusta)
if (LANG === 'go') {
const goldenMod = readFileSync(join(GOLDEN_DIR, 'todo-go', 'go.mod'), 'utf-8');
const modName = (files['go.mod']?.match(/^module\s+(\S+)/m)?.[1]) || spec.project_name?.replace(/[^a-z0-9-]/gi, '-') || 'generated-api';
files['go.mod'] = goldenMod.replace(/^module\s+\S+/m, `module ${modName}`);
}
const missing = LCONF.required.filter(f => !files[f]);
if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; }
// 4. Validointi + korjaussilmukka
let fixRound = 0;
if (LANG === 'rust') {
// Rust: cargo check Docker-kontissa ennen testejä
for (let checkRound = 0; checkRound < MAX_FIX_ROUNDS; checkRound++) {
// Kirjoita tiedostot levylle
for (const [fn, content] of Object.entries(files)) {
const filePath = join(dir, fn);
mkdirSync(dirname(filePath), { recursive: true });
writeFileSync(filePath, content);
}
console.log(` [4/5] Cargo check${checkRound > 0 ? ` (korjaus ${checkRound})` : ''}...`);
let checkOut = '';
try {
checkOut = execSync(
`docker run --rm --entrypoint sh -v "${dir}:/src:ro" -v kipina-cargo-registry:/usr/local/cargo/registry -v kipina-cargo-target:/work/target ${LCONF.dockerImage} -c "cp -r /src/* . && cargo check 2>&1"`,
{ timeout: 300000, encoding: 'utf-8' }
);
} catch (e) {
checkOut = e.stdout || e.stderr || e.message || '';
}
const compileErrors = checkOut.split('\n').filter(l => /^error/.test(l));
if (compileErrors.length === 0) break; // Kääntyy — jatka testeihin
console.log(` [4/5] ${compileErrors.length} käännösvirhettä — korjataan...`);
fixRound++;
const errorLines = checkOut.split('\n').filter(l => /^error|^\s+-->/.test(l)).slice(0, 30).join('\n');
const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
const fixPrompt = `Fix the following Rust compilation errors. Return ALL files with === markers.\n\nERRORS:\n${errorLines}\n\nCURRENT CODE:\n${allCode}`;
const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, 12288);
timings.push(fixResp);
const fixedFiles = parseGeneratedFiles(fixResp.text);
for (const [fn, content] of Object.entries(fixedFiles)) {
if (LCONF.required.includes(fn)) files[fn] = content;
}
}
}
if (LANG === 'python') {
let issues = validateProjectCode(files);
while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
fixRound++;
console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
const issuesByFile = {};
for (const issue of issues) {
const m = issue.match(/^ISSUE:\s*(\S+?):/);
const fname = m ? m[1] : 'unknown';
if (!issuesByFile[fname]) issuesByFile[fname] = [];
issuesByFile[fname].push(issue);
}
for (const [fname, fIssues] of Object.entries(issuesByFile)) {
if (!files[fname]) continue;
const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
timings.push(fixResp);
if (fixResp.text) {
files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
}
}
issues = validateProjectCode(files);
}
result.validationIssues = issues.length;
}
result.fixRounds = fixRound;
// 5. Testit Docker-kontissa + itsekorjaava looppi (Taso 4)
const testLabel = { rust: 'Cargo test', go: 'Go test', python: 'Pytest' }[LANG] || 'Test';
const dockerTimeout = LANG === 'rust' ? 300000 : 120000;
const MAX_TEST_FIX = 3;
let bestFiles = { ...files }; // Paras versio tiedostoista
let bestPassed = -1; // Paras testitulos
let testFixRounds = 0; // Erillinen laskuri testikorjauksille
for (let testRound = 0; testRound <= MAX_TEST_FIX; testRound++) {
// Kirjoita tiedostot levylle
for (const [fn, content] of Object.entries(files)) {
const filePath = join(dir, fn);
mkdirSync(dirname(filePath), { recursive: true });
writeFileSync(filePath, content);
}
// Nopea staattinen analyysi ennen Docker-ajoa
const pyFiles = Object.keys(files).filter(f => f.endsWith('.py'));
if (LANG === 'python' && pyFiles.length > 0) {
let syntaxErrors = '';
for (const f of pyFiles) {
try {
execSync(`python3 -c "import py_compile; py_compile.compile('${join(dir, f)}', doraise=True)"`, { timeout: 5000, encoding: 'utf-8', stdio: 'pipe' });
} catch (e) {
syntaxErrors += `${f}: ${(e.stderr || e.message || '').split('\n').filter(l => l.includes('Error')).join('; ')}\n`;
}
}
if (syntaxErrors) {
console.log(` [5/5] ⚠ Syntaksivirhe — ohitetaan Docker`);
writeFileSync(`${dir}/_testout_${testRound}.txt`, `SYNTAX ERRORS:\n${syntaxErrors}`);
Object.assign(result, { testsPassed: 0, testsFailed: 1, testsTotal: 1 });
if (testRound >= MAX_TEST_FIX) { result.error = 'Syntaksivirhe'; break; }
console.log(` [5/5] Itsekorjaus: syntaksi...`);
const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
const fixPrompt = `Fix the following syntax errors. Return ALL files with === markers.\n\nERRORS:\n${syntaxErrors}\n\nCURRENT CODE:\n${allCode}`;
const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 8192);
timings.push(fixResp);
const fixedFiles = parseGeneratedFiles(fixResp.text);
for (const [fn, content] of Object.entries(fixedFiles)) {
if (LCONF.required.includes(fn)) files[fn] = content;
}
testFixRounds++;
continue;
}
}
const roundLabel = testRound > 0 ? ` (korjaus ${testRound}/${MAX_TEST_FIX})` : '';
console.log(` [5/5] ${testLabel}${roundLabel}...`);
let testOut = '';
try {
testOut = execSync(
`docker run --rm -v "${dir}:/src:ro" -v kipina-cargo-registry:/usr/local/cargo/registry -v kipina-cargo-target:/work/target ${LCONF.dockerImage} 2>&1`,
{ timeout: dockerTimeout, encoding: 'utf-8' }
);
} catch (e) {
testOut = e.stdout || e.stderr || e.message || '';
}
writeFileSync(`${dir}/_testout_${testRound}.txt`, testOut);
const testResult = parseTestOutput(testOut);
Object.assign(result, testResult);
// Seuraa parasta tulosta — revert jos korjaus huononsi
if (result.testsPassed > bestPassed) {
bestPassed = result.testsPassed;
bestFiles = { ...files };
} else if (testRound > 0 && result.testsPassed < bestPassed) {
console.log(` [5/5] ⚠ Korjaus huononsi (${result.testsPassed}/${result.testsTotal} < ${bestPassed}) — palautetaan paras versio`);
files = { ...bestFiles };
Object.assign(result, { testsPassed: bestPassed });
break;
}
// Kaikki testit läpi → valmis
if (result.testsTotal > 0 && result.testsPassed === result.testsTotal) break;
// Viimeinen kierros tai ei enää korjausmahdollisuutta
if (testRound >= MAX_TEST_FIX) {
if (result.testsTotal === 0) result.error = 'Testit kaatuivat';
break;
}
// Itsekorjaus: syötä virhe + koodi mallille
const errorLines = testOut.split('\n').filter(l => /^E |FAILED|ERROR|error\[E|--- FAIL|panic:|\.go:\d+/.test(l)).slice(0, 20).join('\n');
if (!errorLines) break; // Ei parsittavia virheitä
console.log(` [5/5] Itsekorjaus: ${result.testsFailed || 'virhe'}...`);
const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
const fixPrompt = `The following test errors occurred. Fix the code so ALL tests pass. Return ALL files with === markers.\n\nERRORS:\n${errorLines}\n\nCURRENT CODE:\n${allCode}`;
const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, LANG === 'rust' ? 12288 : LANG === 'go' ? 10240 : 8192);
timings.push(fixResp);
const fixedFiles = parseGeneratedFiles(fixResp.text);
for (const [fn, content] of Object.entries(fixedFiles)) {
if (LCONF.required.includes(fn)) files[fn] = content;
}
testFixRounds++;
}
// Kirjoita paras versio levylle
for (const [fn, content] of Object.entries(bestPassed >= 0 ? bestFiles : files)) {
const filePath = join(dir, fn);
mkdirSync(dirname(filePath), { recursive: true });
writeFileSync(filePath, content);
}
// fixRounds = vain testikorjaukset (cargo check -korjaukset erilliset vaihe 4:ssä)
result.fixRounds = testFixRounds;
} catch (e) {
result.error = e.message;
}
// Yhteenveto
result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
result.score = scoreResult(result);
result.stars = starsForScore(result.score);
result.profile = profile;
result.promptName = promptName;
return result;
}
// === GPU-muistin tyhjennys ===
/**
 * Best-effort VRAM cleanup: asks Ollama to unload every model it reports as loaded.
 *
 * Fetches the loaded-model list from `${OLLAMA_URL}/api/ps` and sends one
 * `keep_alive: 0` request per model to `${OLLAMA_URL}/api/generate`.
 * The function never throws: a failure to list models skips the cleanup, and a
 * failure to unload one model does not prevent the remaining models from being
 * unloaded. Failures are reported as warnings instead of being silently dropped.
 *
 * @returns {Promise<void>} Resolves once every unload request has been attempted.
 */
async function clearVram() {
  let loadedModels;
  try {
    const psResp = await fetch(`${OLLAMA_URL}/api/ps`);
    const psData = await psResp.json();
    loadedModels = psData.models || [];
  } catch (e) {
    // Not critical: the benchmark can run without the cleanup.
    console.warn(` ⚠ VRAM-tyhjennys ohitettu: ${e.message}`);
    return;
  }

  for (const m of loadedModels) {
    // Each model gets its own try/catch so one failed request cannot abort the rest.
    try {
      const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ model: m.name, keep_alive: 0 }),
      });
      // Only report the model as released when the server accepted the request.
      if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
      console.log(` ♻ Vapautettu: ${m.name}`);
    } catch (e) {
      console.warn(` ⚠ Vapautus epäonnistui (${m.name}): ${e.message}`);
    }
  }
}
// === Main ===
/**
 * CLI entry point: runs every selected model against every selected scenario
 * for ROUNDS rounds, prints progress and summary tables, and writes the results.
 *
 * Steps, in order:
 *  1. Print the banner and list models via `ollamaListModels()`; on failure the
 *     process exits with code 1.
 *  2. Narrow the models by FILTER_MODELS (comma-separated substrings) and pick
 *     scenarios by SCENARIO_FILTER ('all', a scenario id, or else the first scenario).
 *  3. Delete and recreate OUTPUT_DIR (NOTE: the directory is removed recursively).
 *  4. For each round/model/scenario call `runPipeline()` and rewrite
 *     `results.json` after every run so partial results survive a crash.
 *  5. Print the result table, the per-model summary and, when ROUNDS > 1,
 *     the per-round median summary.
 *  6. Write `results.json`, and `report.html` when `report-template.html` exists
 *     next to this script, then archive both under RESULTS_DIR with TIMESTAMP.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Median of a numeric list; for an even count, the rounded mean of the two middle values.
  const median = (arr) => {
    const s = [...arr].sort((a, b) => a - b);
    const m = Math.floor(s.length / 2);
    return s.length % 2 ? s[m] : Math.round((s[m - 1] + s[m]) / 2);
  };

  console.log('╔══════════════════════════════════════════════╗');
  console.log('║ Kipinä CodeBench ║');
  console.log('╚══════════════════════════════════════════════╝');
  console.log(`Ollama: ${OLLAMA_URL} 📝 ${LANG}${COMPACT_MODE ? ' (compact)' : ''}${THINK_MODE ? ' 🧠 thinking ON' : ''}`);

  // Fetch the available models
  let models;
  try {
    models = await ollamaListModels();
  } catch (e) {
    console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`);
    process.exit(1);
  }
  if (FILTER_MODELS) {
    const filter = FILTER_MODELS.split(',').map(s => s.trim());
    models = models.filter(m => filter.some(f => m.includes(f)));
  }
  console.log(`Mallit (${models.length}): ${models.join(', ')}`);

  // 'all' selects everything; an unknown id falls back to the first scenario.
  const matchingScenarios = SCENARIOS.filter(s => s.id === SCENARIO_FILTER);
  const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS :
    matchingScenarios.length > 0 ? matchingScenarios :
    [SCENARIOS[0]];
  console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
  if (ROUNDS > 1) console.log(`Toistoja: ${ROUNDS}`);
  console.log(`Tulokset: ${OUTPUT_DIR}/`);
  console.log('');

  // Start from a clean output directory
  rmSync(OUTPUT_DIR, { recursive: true, force: true });
  mkdirSync(OUTPUT_DIR, { recursive: true });

  const results = [];
  for (let round = 1; round <= ROUNDS; round++) {
    await clearVram();
    if (ROUNDS > 1) console.log(`\n╔═══ Kierros ${round}/${ROUNDS} ═══╗`);
    for (const model of models) {
      for (const scenario of scenarios) {
        const roundLabel = ROUNDS > 1 ? ` [${round}/${ROUNDS}]` : '';
        console.log(`\n━━━ ${model} × ${scenario.id}${roundLabel} ━━━`);
        const r = await runPipeline(model, scenario, round);
        if (ROUNDS > 1) r.round = round;
        results.push(r);
        // Persist after every run so an interrupted benchmark keeps its partial results
        writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
        const status = r.error ? `${r.error}` : `${r.testsPassed}/${r.testsTotal}`;
        const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
        console.log(`${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
      }
    }
    // Interim standings after each round
    if (ROUNDS > 1) {
      console.log(`\n┌─── Tilanne kierroksen ${round}/${ROUNDS} jälkeen ───┐`);
      for (const model of [...new Set(results.map(r => r.model))]) {
        const mrs = results.filter(r => r.model === model);
        for (const sid of scenarios.map(s => s.id)) {
          const runs = mrs.filter(r => r.scenario === sid);
          if (runs.length === 0) continue;
          const scores = runs.map(r => r.score);
          const med = median(scores);
          const last = scores[scores.length - 1];
          // Trend arrow compares the latest score with the one before it
          const trend = scores.length > 1 ? (last > scores[scores.length - 2] ? '▲' : last < scores[scores.length - 2] ? '▼' : '─') : '';
          console.log(`${model.padEnd(28)} ${sid.padEnd(7)} ${starsForScore(med)} med:${String(med).padStart(3)}p [${scores.join(',')}] ${trend}`);
        }
      }
      console.log(`${'─'.repeat(45)}`);
    }
  } // rounds

  // === Result table: one row per run ===
  console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
  console.log('║ TULOKSET ║');
  console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣');
  const header = [
    'Malli'.padEnd(40),
    'Skenaario'.padEnd(10),
    'Speksi'.padEnd(8),
    'Testit'.padEnd(10),
    'Korjaus'.padEnd(8),
    'Ctx'.padEnd(7),
    'Aika'.padEnd(8),
    'tok/s'.padEnd(8),
    'Pisteet',
  ].join(' │ ');
  console.log(`${header}`);
  console.log('╠' + '═'.repeat(header.length + 2) + '╣');
  for (const r of results) {
    const specStatus = r.specOk ? `${r.specEntities}e` : '✗';
    const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
    const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
    const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
    const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
    const speed = `${r.avgTokPerSec.toFixed(0)}`;
    const row = [
      r.model.padEnd(40),
      r.scenario.padEnd(10),
      specStatus.padEnd(8),
      testStatus.padEnd(10),
      fixStatus.padEnd(8),
      ctx.padEnd(7),
      time.padEnd(8),
      speed.padEnd(8),
      `${r.stars} ${r.score}`,
    ].join(' │ ');
    console.log(`${row}`);
  }
  console.log('╚' + '═'.repeat(header.length + 2) + '╝');

  // === Per-model summary ===
  const modelNames = [...new Set(results.map(r => r.model))];
  const scenarioIds = scenarios.map(s => s.id);
  console.log('\n');
  const mHeader = [
    'Malli'.padEnd(35),
    ...scenarioIds.map(s => s.padEnd(22)),
    'Yht.'.padEnd(8),
    'Out'.padEnd(7),
    'Aika'.padEnd(8),
    'tok/s'.padEnd(7),
    'Pisteet',
  ].join(' │ ');
  console.log(mHeader);
  console.log('─'.repeat(mHeader.length));
  for (const model of modelNames) {
    const mrs = results.filter(r => r.model === model);
    // Per-scenario column shows the first run found for that scenario; totals below cover all runs
    const cols = scenarioIds.map(sid => {
      const r = mrs.find(r => r.scenario === sid);
      if (!r) return '-'.padEnd(22);
      const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
      const s = `${(r.totalDurationMs/1000).toFixed(0)}s`;
      const tok = r.totalTokens > 1000 ? `${(r.totalTokens/1000).toFixed(1)}K` : `${r.totalTokens}`;
      return `${t} ${s} ${tok}`.padEnd(22);
    });
    const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0);
    const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0);
    const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0);
    const modelTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0);
    const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0;
    const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0;
    const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0;
    const tokStr = totalTokens > 1000 ? `${(totalTokens/1000).toFixed(1)}K` : `${totalTokens}`;
    const row = [
      model.padEnd(35),
      ...cols,
      `${totalPassed}/${totalTests}`.padEnd(8),
      tokStr.padEnd(7),
      `${(modelTime/1000).toFixed(0)}s`.padEnd(8),
      `${avgSpeed}`.padEnd(7),
      `${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`,
    ].join(' │ ');
    console.log(row);
  }

  // Save JSON + HTML report
  const jsonData = JSON.stringify(results, null, 2);
  writeFileSync(`${OUTPUT_DIR}/results.json`, jsonData);
  const templatePath = join(__dirname, 'report-template.html');
  let htmlData = '';
  if (existsSync(templatePath)) {
    // Escape "<" so a literal "</script>" in the data cannot terminate the
    // embedding script element early (the placeholder presumably sits inside a
    // <script> block; "\u003c" decodes back to "<" in both JSON and JavaScript).
    const embedded = JSON.stringify(results).replace(/</g, '\\u003c');
    // A function replacer inserts the text verbatim; a string replacement would
    // expand "$&", "$'", "$`" and "$$" occurring in the data and corrupt the report.
    htmlData = readFileSync(templatePath, 'utf-8').replace('/*DATA_PLACEHOLDER*/[]', () => embedded);
    writeFileSync(`${OUTPUT_DIR}/report.html`, htmlData);
    console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`);
  }
  console.log(`JSON: ${OUTPUT_DIR}/results.json`);

  // Archive a timestamped copy under RESULTS_DIR
  mkdirSync(RESULTS_DIR, { recursive: true });
  writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.json`), jsonData);
  if (htmlData) writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.html`), htmlData);
  console.log(`Arkistoitu: results/${TIMESTAMP}.json`);

  // Overall summary: a run with an error or with zero tests counts as failed
  const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
  const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
  const failed = results.filter(r => r.error || r.testsTotal === 0);
  const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
  const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0);
  console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`);

  // === Round summary (only when ROUNDS > 1) ===
  if (ROUNDS > 1) {
    console.log('\n\n╔══════════════════════════════════════════════╗');
    console.log('║ KIERROSYHTEENVETO (mediaani) ║');
    console.log('╚══════════════════════════════════════════════╝\n');
    for (const model of modelNames) {
      const mrs = results.filter(r => r.model === model);
      for (const sid of scenarioIds) {
        const runs = mrs.filter(r => r.scenario === sid);
        if (runs.length === 0) continue;
        const scores = runs.map(r => r.score);
        const med = median(scores);
        const min = Math.min(...scores);
        const max = Math.max(...scores);
        const passRates = runs.map(r => r.testsTotal > 0 ? Math.round(r.testsPassed/r.testsTotal*100) : 0);
        console.log(`${model.padEnd(30)} ${sid.padEnd(8)} ${starsForScore(med)} med:${med}p min:${min} max:${max} pass:[${passRates.join(',')}]%`);
      }
    }
  }
}
// Run the benchmark; any unhandled failure is printed and turns into exit code 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});