- Generoi yksi tiedosto kerrallaan: models.go → handlers.go → main.go → tests
- Edellisten tiedostojen koodi kontekstissa seuraavalle
- Max 2048 tok per tiedosto (vs 10240 kaikki kerralla)
- go.mod generoidaan aina golden examplesta (ei mallin tuotoksesta)
- Promptissa "Write ONLY the file X" + "Start with package main"
879 lines
44 KiB
JavaScript
879 lines
44 KiB
JavaScript
#!/usr/bin/env node
|
||
/**
 * Kipinä CodeBench — LLM-koodingenerointibenchmark
 *
 * Generoi FastAPI-projekteja Ollama-malleilla ja testaa pytest:llä Docker-kontissa.
 *
 * Käyttö:
 *   node benchmark.mjs                               # kaikki mallit, oletusskenaario
 *   node benchmark.mjs --models qwen3-coder:30b      # yksi malli
 *   node benchmark.mjs --ollama http://host:11434    # eri Ollama
 *   node benchmark.mjs --scenarios all               # kaikki skenaariot
 *   node benchmark.mjs --output ./results/run-001    # custom output-hakemisto
 */
|
||
|
||
import { execSync } from 'child_process';
|
||
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs';
|
||
import { dirname, join } from 'path';
|
||
import { fileURLToPath } from 'url';
|
||
|
||
// Directory containing this script; prompts, profiles and golden examples are resolved against it.
const __dirname = dirname(fileURLToPath(import.meta.url));

// === CLI arguments ===
const args = process.argv.slice(2);

/**
 * Look up the value that follows `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} fallback - Value returned when the option is absent or has no value.
 * @returns {string} The option value, or `fallback`.
 */
function arg(name, fallback) {
  const i = args.indexOf(`--${name}`);
  if (i < 0) return fallback;
  const value = args[i + 1];
  // A following token that is itself a flag (e.g. `--hub --think`) means the
  // option was given without a value; do not swallow the next flag.
  return value && !value.startsWith('--') ? value : fallback;
}

const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://127.0.0.1:11434');
const HUB_URL = arg('hub', '');
const FILTER_MODELS = arg('models', '');
const SCENARIO_FILTER = arg('scenarios', 'default');
// ISO timestamp with ':' and '.' replaced by '-', truncated to seconds.
const TIMESTAMP = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
const OUTPUT_DIR = arg('output', `/tmp/kipina-benchmark/${TIMESTAMP}`);
const RESULTS_DIR = join(__dirname, 'results');
const THINK_MODE = args.includes('--think');
const COMPACT_MODE = args.includes('--compact');
const NO_ORCHESTRATE = args.includes('--no-orchestrate');
const FILE_BY_FILE = args.includes('--file-by-file');
const SPEC_MODEL = arg('spec-model', '');    // Different model for the spec stages (1-2)
const SPEC_OLLAMA = arg('spec-ollama', '');  // Different Ollama endpoint for the spec model
const LANG = arg('lang', 'python');          // python | rust | go
// Number of repetitions (documented range 1-10; the upper bound is not enforced).
// A non-numeric or non-positive value falls back to 1 instead of producing NaN.
const parsedRounds = Number.parseInt(arg('rounds', '1'), 10);
const ROUNDS = Number.isInteger(parsedRounds) && parsedRounds >= 1 ? parsedRounds : 1;
const MAX_FIX_ROUNDS = 2;
|
||
|
||
// === Promptien lataus tiedostoista ===
|
||
/**
 * Read a prompt template from `prompts/<name>.md` next to this script.
 *
 * @param {string} name - Prompt file name without the `.md` extension.
 * @returns {string} File content with surrounding whitespace trimmed.
 * @throws {Error} When the prompt file does not exist.
 */
function loadPrompt(name) {
  const path = join(__dirname, 'prompts', `${name}.md`);
  if (!existsSync(path)) throw new Error(`Prompti puuttuu: ${path}`);
  return readFileSync(path, 'utf-8').trim();
}
|
||
// System prompts, loaded once at start-up from the prompts/ directory.
const CLIENT_SYSTEM = loadPrompt('client');
const SPEC_SYSTEM = loadPrompt('spec');
// The simplified spec prompt is optional; fall back to the full spec prompt when the file is absent.
const SPEC_SIMPLE_SYSTEM = existsSync(join(__dirname, 'prompts', 'spec-simple.md')) ? loadPrompt('spec-simple') : SPEC_SYSTEM;
const FIX_SYSTEM = loadPrompt('fix');

// === Per-model profiles ===
// Parsed content of profiles.json; read below through `models`, `profiles` and `default_profile`.
const PROFILES = JSON.parse(readFileSync(join(__dirname, 'profiles.json'), 'utf-8'));
|
||
/**
 * Pick the golden-example markdown file for a model.
 *
 * Uses the model's `golden` entry from profiles.json (default `todo.md`). For
 * Rust and Go the language-specific variant (`todo.md` → `todo-rs.md` /
 * `todo-go.md`) is preferred when that file exists in the golden directory.
 *
 * @param {string} model - Model name used as the key in `PROFILES.models`.
 * @returns {string} File name relative to the golden-examples directory.
 */
function getGoldenForModel(model) {
  // Optional chaining on `models` so a profiles.json without that key falls back to the default.
  const goldenFile = PROFILES.models?.[model]?.golden || 'todo.md';
  const langSuffix = { rust: '-rs', go: '-go' }[LANG];
  if (!langSuffix) return goldenFile;

  const langFile = goldenFile.replace(/\.md$/, `${langSuffix}.md`);
  return existsSync(join(GOLDEN_DIR, langFile)) ? langFile : goldenFile;
}
|
||
|
||
/**
 * Resolve the code-generation system prompt for a model.
 *
 * The prompt name comes from the model entry, then from the model's profile,
 * then defaults to `code`. Candidates are tried in this order and the first
 * existing file wins; the language-suffixed variants take priority:
 * `code-small-go` > `code-go` > `code-small` > `code`.
 *
 * @param {string} model - Model name used as the key in `PROFILES.models`.
 * @returns {{system: string, promptName: string, profile: string}} Prompt text,
 *   the name of the prompt file that was used, and the profile name.
 * @throws {Error} When none of the candidate prompt files exists.
 */
function getCodePromptForModel(model) {
  const modelConf = PROFILES.models?.[model];
  const profile = modelConf?.profile || PROFILES.default_profile;
  const promptName = modelConf?.prompt || PROFILES.profiles?.[profile]?.prompt || 'code';
  const suffix = { rust: '-rs', go: '-go' }[LANG] || '';

  // A Set drops the duplicates that appear when the suffix is empty or promptName is 'code'.
  const candidates = [...new Set([`${promptName}${suffix}`, `code${suffix}`, promptName, 'code'])];
  for (const name of candidates) {
    const path = join(__dirname, 'prompts', `${name}.md`);
    if (existsSync(path)) return { system: readFileSync(path, 'utf-8').trim(), promptName: name, profile };
  }
  // Only reached when prompts/code.md is missing, in which case loadPrompt throws.
  return { system: loadPrompt('code'), promptName: 'code', profile: 'large' };
}
|
||
|
||
// === Kultaisten esimerkkien lataus (kielen mukaan) ===
|
||
// Directory holding the golden (reference) example projects.
const GOLDEN_DIR = join(__dirname, 'golden-examples');

/**
 * Per-language settings:
 * - goldenDir:   sub-directory of GOLDEN_DIR with the reference project files
 * - files:       files read from goldenDir when building the fallback reference example
 * - required:    files a generated project must contain
 * - dockerImage: Docker image used to run the checks/tests
 * - fileByFile:  (Go only) generation order and per-file descriptions for --file-by-file
 */
const LANG_CONFIG = {
  python: {
    goldenDir: 'todo',
    files: ['models.py', 'schemas.py', 'main.py', 'test_main.py'],
    required: ['models.py', 'schemas.py', 'main.py', 'test_main.py'],
    dockerImage: 'kipina-pytest',
  },
  rust: {
    goldenDir: 'todo-rs',
    files: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'],
    required: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'],
    dockerImage: 'kipina-cargo-test',
  },
  go: {
    goldenDir: 'todo-go',
    files: ['go.mod', 'models.go', 'handlers.go', 'main.go', 'handlers_test.go'],
    required: ['go.mod', 'models.go', 'handlers.go', 'main.go', 'handlers_test.go'],
    dockerImage: 'kipina-go-test',
    // One-file-at-a-time generation (--file-by-file): order and descriptions
    fileByFile: [
      { name: 'models.go', desc: 'Go structs for all entities + Create/Update request types. Use json tags.' },
      { name: 'handlers.go', desc: 'Chi HTTP handlers as closures taking *sql.DB. Use RETURNING in INSERT/UPDATE. sql.ErrNoRows for 404.' },
      { name: 'main.go', desc: 'Chi router setup, InitDB with CREATE TABLE, main() entry point on port 3000.' },
      { name: 'handlers_test.go', desc: 'Tests using httptest.NewServer + :memory: SQLite. setupTestServer helper. CRUD tests per entity.' },
    ],
  },
};

// Active language configuration; an unknown --lang value falls back to the Python settings.
const LCONF = LANG_CONFIG[LANG] || LANG_CONFIG.python;
|
||
|
||
/**
 * Build the reference-implementation text that is prepended to code prompts.
 *
 * Lookup order:
 *  1. with --compact: the compact template for the active language, if present;
 *  2. the golden markdown file (model-specific when `model` is given);
 *  3. the individual files of the golden project directory, concatenated
 *     with `=== <file> ===` markers.
 *
 * @param {string} [model] - Model name; when omitted the language default is used.
 * @returns {string} Reference text, or '' when no golden material exists.
 */
function loadGoldenExample(model) {
  // --compact: use the condensed template
  if (COMPACT_MODE) {
    const compactFile = { rust: 'golden-compact-rs.md', go: 'golden-compact-go.md' }[LANG] || 'golden-compact-py.md';
    const compactPath = join(__dirname, 'prompts', compactFile);
    if (existsSync(compactPath)) return '\n' + readFileSync(compactPath, 'utf-8').trim() + '\n';
  }

  // Model-specific golden example from the profile
  const goldenFile = model ? getGoldenForModel(model) : ({ rust: 'todo-rs.md', go: 'todo-go.md' }[LANG] || 'todo.md');
  const mdPath = join(GOLDEN_DIR, goldenFile);
  if (existsSync(mdPath)) return '\n' + readFileSync(mdPath, 'utf-8').trim() + '\n';

  // Fallback: separate files from the golden project directory
  const todoDir = join(GOLDEN_DIR, LCONF.goldenDir);
  if (!existsSync(todoDir)) return '';

  let example = `\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n`;
  for (const f of LCONF.files) {
    const path = join(todoDir, f);
    if (existsSync(path)) example += `=== ${f} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`;
  }
  return example;
}
|
||
|
||
// === Ajattelutagien siivous (gemma4, qwen3/3.5 ym.) ===
|
||
/**
 * Remove inline reasoning blocks that some models embed in their output.
 *
 * Only complete (opened and closed) blocks are removed.
 *
 * @param {string} text - Raw model output.
 * @returns {string} Output without thinking blocks, trimmed.
 */
function stripThinking(text) {
  return text
    .replace(/<\|channel>thought[\s\S]*?<channel\|>/g, '') // gemma4
    .replace(/<think>[\s\S]*?<\/think>/g, '')              // qwen3, qwen3.5
    .trim();
}
|
||
|
||
// === Ollama / Hub -client ===
|
||
/**
 * Send one prompt to the model and return the cleaned response with timing data.
 *
 * When HUB_URL is set the request goes to the hub's chat-completions endpoint
 * (and `ollamaUrl` is not used); otherwise it goes to Ollama's `/api/chat`.
 *
 * @param {string} model - Model name.
 * @param {string} prompt - User prompt.
 * @param {string} systemPrompt - System prompt; omitted from the Ollama request when empty.
 * @param {number} [maxTokens=2048] - Generation budget (tripled for Ollama in --think mode).
 * @param {?string} [ollamaUrl=null] - Ollama base URL override for the direct route.
 * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
 * @throws {Error} On a non-2xx HTTP response.
 */
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048, ollamaUrl = null) {
  const start = Date.now();

  if (HUB_URL) {
    const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
    const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
    });
    if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
    const data = await resp.json();
    const elapsed = Date.now() - start;
    const tokens = data.tokens_generated || 0;
    // Guard the wall-clock fallback against a zero elapsed time (would yield NaN/Infinity).
    const wallTokPerSec = elapsed > 0 ? tokens / (elapsed / 1000) : 0;
    return {
      text: stripThinking((data.response || '').trim()),
      tokens,
      durationMs: elapsed,
      tokPerSec: data.tokens_per_sec || wallTokPerSec,
    };
  }

  // Direct Ollama route
  const messages = [];
  if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
  messages.push({ role: 'user', content: prompt });

  const resp = await fetch(`${ollamaUrl || OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      messages,
      stream: false,
      think: THINK_MODE,
      options: { num_predict: THINK_MODE ? maxTokens * 3 : maxTokens, num_ctx: 16384, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
    }),
  });
  if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
  const data = await resp.json();
  const elapsed = Date.now() - start;

  const rawContent = (data.message?.content || '').trim();
  const thinking = (data.message?.thinking || '').trim();
  // When the content is empty, fall back to the reasoning text.
  const text = stripThinking(rawContent || thinking);
  const evalCount = data.eval_count || 0;
  if (!rawContent && thinking) console.log(` ⚠ thinking-malli: ${thinking.length} merkkiä ajattelua, content tyhjä`);

  // eval_duration is in nanoseconds; when it is missing use the wall-clock time
  // rather than dividing by a 1 ns placeholder.
  const evalSeconds = data.eval_duration > 0 ? data.eval_duration / 1e9 : elapsed / 1000;
  const tokPerSec = evalSeconds > 0 ? evalCount / evalSeconds : 0;
  return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
}
|
||
|
||
/**
 * List the model names available on the hub (when HUB_URL is set) or on Ollama.
 *
 * @returns {Promise<string[]>} Model names; empty when the response has no `models`.
 * @throws {Error} On a non-2xx HTTP response.
 */
async function ollamaListModels() {
  const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
  const data = await resp.json();
  return (data.models || []).map(m => m.name);
}
|
||
|
||
// === Testitulosten parsinta (pytest + cargo test) ===
|
||
/**
 * Extract pass/fail counts from pytest, cargo test or go test output.
 *
 * Cargo is checked first because its summary lines ("N passed; M failed")
 * also satisfy the looser pytest patterns. Cargo prints one summary per test
 * binary, so all summaries are added up; reading only the first one would
 * report 0/0 whenever an empty unit-test binary runs before the integration tests.
 *
 * @param {string} output - Combined stdout/stderr of the test run.
 * @returns {{testsPassed: number, testsFailed: number, testsTotal: number}}
 *   All zeros when nothing recognisable was found.
 */
function parseTestOutput(output) {
  const toResult = (passed, failed) => ({ testsPassed: passed, testsFailed: failed, testsTotal: passed + failed });

  // Cargo test: "test result: ok. 10 passed; 0 failed;" (one line per test binary)
  let cargoSeen = false;
  let cargoPassed = 0;
  let cargoFailed = 0;
  for (const [, passed, failed] of output.matchAll(/test result: \w+\.\s*(\d+) passed;\s*(\d+) failed/g)) {
    cargoSeen = true;
    cargoPassed += Number.parseInt(passed, 10);
    cargoFailed += Number.parseInt(failed, 10);
  }
  if (cargoSeen) return toResult(cargoPassed, cargoFailed);

  // Pytest: "6 passed", "2 failed", "1 error" (errors are counted as failures)
  const pyPassed = output.match(/(\d+) passed/);
  const pyFailed = output.match(/(\d+) failed/);
  const pyError = output.match(/(\d+) error/);
  if (pyPassed || pyFailed) {
    const passed = pyPassed ? Number.parseInt(pyPassed[1], 10) : 0;
    const failed = (pyFailed ? Number.parseInt(pyFailed[1], 10) : 0) + (pyError ? Number.parseInt(pyError[1], 10) : 0);
    return toResult(passed, failed);
  }

  // Go test: "--- PASS:" / "--- FAIL:"
  const goPassed = (output.match(/--- PASS:/g) || []).length;
  const goFailed = (output.match(/--- FAIL:/g) || []).length;
  if (goPassed + goFailed > 0) return toResult(goPassed, goFailed);

  // Rust compilation failure: every "error[E…]" diagnostic counts as one failed test
  const compileErrors = (output.match(/error\[E\d+\]/g) || []).length;
  if (compileErrors > 0) return toResult(0, compileErrors);

  return toResult(0, 0);
}
|
||
|
||
// === Tiedostoparseri LLM-vastauksesta ===
|
||
/**
 * Split an LLM response into files using `=== <name> ===` markers.
 *
 * Recognised extensions: .py, .toml, .rs, .go, .mod. One surrounding markdown
 * fence is stripped from each section. When a name occurs more than once the
 * last section wins. Names that are absolute or contain a `..` segment are
 * dropped, because the result is later written to disk relative to the run
 * directory and the names come from untrusted model output.
 *
 * @param {string} text - Raw model response.
 * @returns {Object<string, string>} Map of relative file name → content (newline-terminated).
 */
function parseGeneratedFiles(text) {
  const isSafeRelative = (name) =>
    !/^[\\/]/.test(name) &&                      // POSIX/UNC absolute
    !/^[A-Za-z]:/.test(name) &&                  // Windows drive prefix
    !name.split(/[\\/]/).includes('..');         // parent-directory traversal

  const files = {};
  // split() with one capture group yields [preamble, name1, body1, name2, body2, …]
  const sections = text.split(/===\s*(\S+\.(?:py|toml|rs|go|mod))\s*===/);
  for (let i = 1; i < sections.length - 1; i += 2) {
    const name = sections[i];
    if (!isSafeRelative(name)) continue;
    const content = sections[i + 1]
      .trim()
      .replace(/^```(?:python|toml|rust|go|gomod)?\s*\n?/m, '')
      .replace(/\n?```\s*$/m, '')
      .trim();
    if (content) files[name] = content + '\n';
  }
  return files;
}
|
||
|
||
// === Validaattori ===
|
||
/**
 * Cheap static checks on generated Python files (other files are ignored).
 *
 * Reports, per .py file:
 *  - relative imports (`from .x import …`);
 *  - `from models|schemas|main import …` where the module file is missing or
 *    an imported name does not occur anywhere in it (substring check);
 *  - schemas.py using `date` / `datetime` annotations without a
 *    `from datetime import` line;
 *  - lowercase `true` / `false` used as bare words.
 *
 * @param {Object<string, string>} files - Map of file name → source code.
 * @returns {string[]} Issue descriptions, each starting with "ISSUE: <file>".
 */
function validateProjectCode(files) {
  const issues = [];
  for (const [fname, code] of Object.entries(files)) {
    if (!fname.endsWith('.py')) continue;
    const lines = code.split('\n');

    for (const line of lines) {
      if (/^from\s+\.(\w*)\s+import/.test(line)) issues.push(`ISSUE: ${fname}: relatiivinen import`);
    }

    for (const line of lines) {
      const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
      if (!m) continue;
      const srcCode = files[m[1] + '.py'];
      if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
      // Drop a trailing comment, parentheses and a line-continuation backslash
      // so `import (A, B)  # noqa` yields the names A and B, not "(A" / "B)".
      const importList = m[2].replace(/#.*$/, '').replace(/[()\\]/g, '');
      const names = importList.split(',').map(n => n.trim().split(/\s+as\s+/)[0].trim());
      for (const name of names) {
        if (!name || name === '*') continue; // nothing to look up
        if (!srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
      }
    }

    if (fname === 'schemas.py') {
      const hasDatetimeImport = /from datetime import/.test(code);
      if (/:\s*date\b/.test(code) && !hasDatetimeImport)
        issues.push('ISSUE: schemas.py: date-import puuttuu');
      if (/:\s*datetime\b/.test(code) && !hasDatetimeImport)
        issues.push('ISSUE: schemas.py: datetime-import puuttuu');
    }

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      // Skip comment-only and blank lines
      if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue;
      if (/(?<!["\w])false(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "false" → "False"`);
      if (/(?<!["\w])true(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "true" → "True"`);
    }
  }
  return issues;
}
|
||
|
||
/**
 * Extract the first parseable top-level JSON object from model output.
 *
 * If the text contains a fenced block (``` or ```json) only the first such
 * block is scanned. Brace matching is purely positional (braces inside string
 * values are counted too); a candidate that fails to parse is skipped and the
 * scan continues with the next top-level `{`.
 *
 * @param {string} text - Raw model output.
 * @returns {?Object} Parsed object, or null when none was found.
 */
function extractJson(text) {
  const fenced = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
  const source = fenced ? fenced[1].trim() : text;

  let depth = 0;
  let start = null;
  for (let i = 0; i < source.length; i++) {
    const ch = source[i];
    if (ch === '{') {
      if (depth === 0) start = i;
      depth++;
    } else if (ch === '}') {
      // A stray closer outside any object must not push depth below zero,
      // otherwise every later object would be missed.
      if (depth === 0) continue;
      depth--;
      if (depth === 0 && start !== null) {
        try {
          return JSON.parse(source.slice(start, i + 1));
        } catch {
          // Not valid JSON — keep scanning for the next candidate.
        }
      }
    }
  }
  return null;
}
|
||
|
||
// === Testiskenaariot ===
|
||
// Benchmark scenarios: `id` is used in output directory names and for --scenarios
// filtering; `prompt` is the project description sent to the model in stage 1.
const SCENARIOS = [
  { id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
  { id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
  { id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
];
|
||
|
||
// === Pisteytys (0–100) ja tähtiluokitus ===
|
||
/**
 * Compute the 0–100 score for one pipeline result.
 *
 * Breakdown: 10 for a valid spec, 10 for reaching a scorable state, up to 60
 * in proportion to the share of passing tests, and 20 minus 10 per fix round
 * (never below 0). A run that failed before any test was counted scores 0.
 *
 * @param {{error: ?string, specOk: boolean, testsTotal: number, testsPassed: number, fixRounds: number}} r
 * @returns {number} Integer score in the range 0–100.
 */
function scoreResult(r) {
  if (r.error && r.testsTotal === 0) return 0;

  let score = 0;
  if (r.specOk) score += 10;
  if (!r.error || r.testsTotal > 0) score += 10;
  if (r.testsTotal > 0) {
    // Bound the ratio so inconsistent counts (passed > total) cannot inflate the score.
    const passRatio = Math.min(1, Math.max(0, (r.testsPassed ?? 0) / r.testsTotal));
    score += Math.round(passRatio * 60);
  }
  // A missing fixRounds counts as zero rounds instead of turning the score into NaN.
  score += Math.max(0, 20 - (r.fixRounds ?? 0) * 10);
  return Math.min(100, score);
}
|
||
/**
 * Map a 0–100 score to a five-star rating string.
 *
 * @param {number} score - Score as produced by scoreResult.
 * @returns {string} Five characters of filled/empty stars.
 */
function starsForScore(score) {
  // Lower bounds, highest first; the first tier the score reaches wins.
  const tiers = [
    [90, '★★★★★'],
    [70, '★★★★☆'],
    [50, '★★★☆☆'],
    [25, '★★☆☆☆'],
  ];
  for (const [minScore, stars] of tiers) {
    if (score >= minScore) return stars;
  }
  return score > 0 ? '★☆☆☆☆' : '☆☆☆☆☆';
}
|
||
|
||
// === Pipeline: yhdelle mallille ja skenaariolle ===
|
||
/**
 * Run the whole benchmark pipeline for one model and one scenario.
 *
 * Stages:
 *  1. requirements text from the scenario prompt,
 *  2. JSON spec extracted from the requirements,
 *  3. code generation (file-by-file, entity-by-entity, or single shot),
 *  4. pre-test validation with fix rounds (cargo check for Rust, static
 *     checks for Python),
 *  5. tests in Docker with a self-repair loop that keeps the best version.
 *
 * Intermediate artefacts (`_requirements.txt`, `_spec.json`, `_code_raw.txt`,
 * `_testout_<n>.txt`) and the generated project are written to a per-run
 * directory under OUTPUT_DIR. Errors are recorded in `result.error` rather
 * than thrown. The summary fields (durations, tokens, score, stars, profile,
 * promptName) are filled in on every exit path, including early failures.
 *
 * @param {string} model - Model name.
 * @param {{id: string, prompt: string}} scenario - Scenario to generate.
 * @param {number} [round=1] - Repetition number; used in the directory name when ROUNDS > 1.
 * @returns {Promise<Object>} The result record for this run.
 */
async function runPipeline(model, scenario, round = 1) {
  const result = {
    model, scenario: scenario.id,
    reqOk: false, specOk: false, specEntities: 0,
    validationIssues: 0, fixRounds: 0,
    testsTotal: 0, testsPassed: 0, testsFailed: 0,
    totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
    promptChars: 0, promptTokensEst: 0,
    score: 0, stars: '',
    error: null,
  };
  const timings = [];
  const { system: CODE_SYSTEM, promptName, profile } = getCodePromptForModel(model);
  const roundSuffix = ROUNDS > 1 ? `__r${round}` : '';
  const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}${roundSuffix}`;
  mkdirSync(dir, { recursive: true });

  // Write every file of a project map under the run directory.
  const writeProjectFiles = (fileMap) => {
    for (const [fn, content] of Object.entries(fileMap)) {
      const filePath = join(dir, fn);
      mkdirSync(dirname(filePath), { recursive: true });
      writeFileSync(filePath, content);
    }
  };
  // Render a project map in the `=== name ===` format used in prompts.
  const renderFiles = (fileMap) => Object.entries(fileMap).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
  // Take over fixed files from a model response, ignoring anything that is not a required file.
  const mergeRequiredFiles = (target, generated) => {
    for (const [fn, content] of Object.entries(generated)) {
      if (LCONF.required.includes(fn)) target[fn] = content;
    }
  };

  try {
    // 1. Requirements
    const specModel = SPEC_MODEL || model;
    console.log(` [1/5] Vaatimukset${SPEC_MODEL ? ` (${SPEC_MODEL})` : ''}...`);
    const specUrl = SPEC_OLLAMA || null;
    const req = await ollamaChat(specModel, scenario.prompt, CLIENT_SYSTEM, 2048, specUrl);
    timings.push(req);
    if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
    result.reqOk = true;
    writeFileSync(`${dir}/_requirements.txt`, req.text);

    // 2. JSON spec (simplified schema for models with the "small" profile)
    const specProfile = PROFILES.models[specModel]?.profile || PROFILES.default_profile;
    const specPrompt = specProfile === 'small' ? SPEC_SIMPLE_SYSTEM : SPEC_SYSTEM;
    console.log(` [2/5] JSON-speksi${specProfile === 'small' ? ' (simple)' : ''}...`);
    const specResp = await ollamaChat(specModel, `${req.text}\n\nOutput a JSON spec for this project.`, specPrompt, 4096, specUrl);
    timings.push(specResp);
    const spec = extractJson(specResp.text);
    if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'JSON-speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; }
    result.specOk = true;
    result.specEntities = spec.entities.length;
    writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));

    // 3. LLM code generation
    const fileCount = LCONF.required.length;
    const goldenExample = loadGoldenExample(model);
    const codeTokens = LANG === 'rust' ? 12288 : LANG === 'go' ? 10240 : 8192;
    let files;

    if (FILE_BY_FILE && LCONF.fileByFile) {
      // File-by-file: generate one file at a time (for small models); earlier
      // files are passed along as context for the next one.
      const fbf = LCONF.fileByFile;
      console.log(` [3/5] Koodigenerointi (file-by-file, ${fbf.length} tiedostoa)...`);
      files = {};
      let context = '';

      for (const fileDef of fbf) {
        const contextBlock = context ? `\nEXISTING CODE:\n${context}\n` : '';
        const filePrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n${contextBlock}\nWrite ONLY the file "${fileDef.name}": ${fileDef.desc}\nOutput raw code, no markdown fences, no explanations. Start with "package main".`;

        console.log(` [3/5] → ${fileDef.name}...`);
        const fileResp = await ollamaChat(model, filePrompt, CODE_SYSTEM, 2048);
        timings.push(fileResp);

        // Clean up: remove markdown fences and any explanatory lead-in before "package"
        const code = fileResp.text
          .replace(/^```(?:go|golang)?\s*\n?/m, '').replace(/\n?```\s*$/m, '')
          .replace(/^(?:Here|Sure|Below|This|The|I )[\s\S]*?(?=package\s)/m, '')
          .trim();
        if (code) {
          files[fileDef.name] = code + '\n';
          context += `=== ${fileDef.name} ===\n${code}\n\n`;
        }
      }
      writeFileSync(`${dir}/_code_raw.txt`, context);
      result.promptChars = CODE_SYSTEM.length + (context.length || 0);
      result.promptTokensEst = Math.round(result.promptChars / 4);
    } else if (spec.entities.length > 1 && !NO_ORCHESTRATE) {
      // Orchestration: add one entity at a time
      console.log(` [3/5] Koodigenerointi (orkestroitu, ${spec.entities.length} entiteettiä)...`);
      files = {};
      let cumulativeCode = '';

      for (let ei = 0; ei < spec.entities.length; ei++) {
        const entity = spec.entities[ei];
        const isFirst = ei === 0;
        // Spec restricted to the entities handled so far and the relationships originating from them
        const entitySpec = {
          ...spec,
          entities: spec.entities.slice(0, ei + 1),
          relationships: (spec.relationships || []).filter(r =>
            spec.entities.slice(0, ei + 1).some(e => e.name === r.from)
          ),
        };

        let entityPrompt;
        if (isFirst) {
          entityPrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(entitySpec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files for the entity "${entity.name}". Follow the reference implementation patterns exactly.`;
        } else {
          entityPrompt = `${goldenExample}\n---\n\nEXISTING CODE (do not regenerate, only add to it):\n${cumulativeCode}\n\n---\n\nJSON SPECIFICATION (add entity "${entity.name}"):\n${JSON.stringify(entitySpec, null, 2)}\n\nAdd the entity "${entity.name}" to the existing code. Return ALL ${fileCount} files with === markers, including the existing entities. Follow the same patterns.`;
        }

        console.log(` [3/5] → ${entity.name}${isFirst ? '' : ' (+ ' + spec.entities.slice(0, ei).map(e => e.name).join(', ') + ')'}...`);
        const entityResp = await ollamaChat(model, entityPrompt, CODE_SYSTEM, codeTokens);
        timings.push(entityResp);

        // Merge — a newer version of a file replaces the previous one
        const entityFiles = parseGeneratedFiles(entityResp.text);
        for (const [fn, content] of Object.entries(entityFiles)) {
          files[fn] = content;
        }
        cumulativeCode = renderFiles(files);
      }
      writeFileSync(`${dir}/_code_raw.txt`, cumulativeCode);
      result.promptChars = CODE_SYSTEM.length + cumulativeCode.length;
      result.promptTokensEst = Math.round(result.promptChars / 4);
    } else {
      // Normal: everything in one request
      console.log(` [3/5] Koodigenerointi (LLM)...`);
      const codePrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`;
      result.promptChars = CODE_SYSTEM.length + codePrompt.length;
      result.promptTokensEst = Math.round(result.promptChars / 4);
      const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, codeTokens);
      timings.push(codeResp);
      writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
      files = parseGeneratedFiles(codeResp.text);
    }

    // Go: replace/generate go.mod from the golden example. This must happen
    // BEFORE the missing-file check: file-by-file mode never asks the model
    // for go.mod, so checking first would always fail with "Puuttuvat: go.mod".
    if (LANG === 'go') {
      const goldenMod = readFileSync(join(GOLDEN_DIR, 'todo-go', 'go.mod'), 'utf-8');
      const modName = (files['go.mod']?.match(/^module\s+(\S+)/m)?.[1]) || spec.project_name?.replace(/[^a-z0-9-]/gi, '-') || 'generated-api';
      // Function replacer: a model-supplied module name must not be interpreted as a `$…` replacement pattern.
      files['go.mod'] = goldenMod.replace(/^module\s+\S+/m, () => `module ${modName}`);
    }

    const missing = LCONF.required.filter(f => !files[f]);
    if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; }

    // 4. Validation + fix loop
    let fixRound = 0;
    if (LANG === 'rust') {
      // Rust: cargo check in a Docker container before the tests
      for (let checkRound = 0; checkRound < MAX_FIX_ROUNDS; checkRound++) {
        writeProjectFiles(files);
        console.log(` [4/5] Cargo check${checkRound > 0 ? ` (korjaus ${checkRound})` : ''}...`);
        let checkOut = '';
        try {
          checkOut = execSync(
            `docker run --rm --entrypoint sh -v "${dir}:/src:ro" -v kipina-cargo-registry:/usr/local/cargo/registry -v kipina-cargo-target:/work/target ${LCONF.dockerImage} -c "cp -r /src/* . && cargo check 2>&1"`,
            { timeout: 300000, encoding: 'utf-8' }
          );
        } catch (e) {
          // A non-zero exit still carries the compiler output we want to inspect
          checkOut = e.stdout || e.stderr || e.message || '';
        }
        const compileErrors = checkOut.split('\n').filter(l => /^error/.test(l));
        if (compileErrors.length === 0) break; // Compiles — continue to the tests

        console.log(` [4/5] ${compileErrors.length} käännösvirhettä — korjataan...`);
        fixRound++;
        const errorLines = checkOut.split('\n').filter(l => /^error|^\s+-->/.test(l)).slice(0, 30).join('\n');
        const allCode = renderFiles(files);
        const fixPrompt = `Fix the following Rust compilation errors. Return ALL files with === markers.\n\nERRORS:\n${errorLines}\n\nCURRENT CODE:\n${allCode}`;
        const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, 12288);
        timings.push(fixResp);
        mergeRequiredFiles(files, parseGeneratedFiles(fixResp.text));
      }
    }
    if (LANG === 'python') {
      let issues = validateProjectCode(files);
      while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
        fixRound++;
        console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
        // Group issues by the file name embedded in the issue text
        const issuesByFile = {};
        for (const issue of issues) {
          const m = issue.match(/^ISSUE:\s*(\S+?):/);
          const fname = m ? m[1] : 'unknown';
          if (!issuesByFile[fname]) issuesByFile[fname] = [];
          issuesByFile[fname].push(issue);
        }
        for (const [fname, fIssues] of Object.entries(issuesByFile)) {
          if (!files[fname]) continue;
          const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
          const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
          timings.push(fixResp);
          if (fixResp.text) {
            files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
          }
        }
        issues = validateProjectCode(files);
      }
      result.validationIssues = issues.length;
    }
    result.fixRounds = fixRound;

    // 5. Tests in a Docker container + self-repair loop (level 4)
    const testLabel = { rust: 'Cargo test', go: 'Go test', python: 'Pytest' }[LANG] || 'Test';
    const dockerTimeout = LANG === 'rust' ? 300000 : 120000;
    const MAX_TEST_FIX = 3;
    let bestFiles = { ...files };  // Best version of the files so far
    let bestPassed = -1;           // Best passed-test count so far
    let bestCounts = null;         // Full test counts belonging to bestFiles
    let testFixRounds = 0;         // Separate counter for test-driven fixes

    for (let testRound = 0; testRound <= MAX_TEST_FIX; testRound++) {
      writeProjectFiles(files);

      // Quick static analysis before the Docker run
      const pyFiles = Object.keys(files).filter(f => f.endsWith('.py'));
      if (LANG === 'python' && pyFiles.length > 0) {
        let syntaxErrors = '';
        for (const f of pyFiles) {
          try {
            execSync(`python3 -c "import py_compile; py_compile.compile('${join(dir, f)}', doraise=True)"`, { timeout: 5000, encoding: 'utf-8', stdio: 'pipe' });
          } catch (e) {
            syntaxErrors += `${f}: ${(e.stderr || e.message || '').split('\n').filter(l => l.includes('Error')).join('; ')}\n`;
          }
        }
        if (syntaxErrors) {
          console.log(` [5/5] ⚠ Syntaksivirhe — ohitetaan Docker`);
          writeFileSync(`${dir}/_testout_${testRound}.txt`, `SYNTAX ERRORS:\n${syntaxErrors}`);
          Object.assign(result, { testsPassed: 0, testsFailed: 1, testsTotal: 1 });

          if (testRound >= MAX_TEST_FIX) { result.error = 'Syntaksivirhe'; break; }

          console.log(` [5/5] Itsekorjaus: syntaksi...`);
          const allCode = renderFiles(files);
          const fixPrompt = `Fix the following syntax errors. Return ALL files with === markers.\n\nERRORS:\n${syntaxErrors}\n\nCURRENT CODE:\n${allCode}`;
          const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 8192);
          timings.push(fixResp);
          mergeRequiredFiles(files, parseGeneratedFiles(fixResp.text));
          testFixRounds++;
          continue;
        }
      }

      const roundLabel = testRound > 0 ? ` (korjaus ${testRound}/${MAX_TEST_FIX})` : '';
      console.log(` [5/5] ${testLabel}${roundLabel}...`);

      let testOut = '';
      try {
        testOut = execSync(
          `docker run --rm -v "${dir}:/src:ro" -v kipina-cargo-registry:/usr/local/cargo/registry -v kipina-cargo-target:/work/target ${LCONF.dockerImage} 2>&1`,
          { timeout: dockerTimeout, encoding: 'utf-8' }
        );
      } catch (e) {
        // Failing tests exit non-zero; the captured output is still what we parse
        testOut = e.stdout || e.stderr || e.message || '';
      }
      writeFileSync(`${dir}/_testout_${testRound}.txt`, testOut);
      const testResult = parseTestOutput(testOut);
      Object.assign(result, testResult);

      // Track the best result — revert if a fix made things worse
      if (result.testsPassed > bestPassed) {
        bestPassed = result.testsPassed;
        bestFiles = { ...files };
        bestCounts = { ...testResult };
      } else if (testRound > 0 && result.testsPassed < bestPassed) {
        console.log(` [5/5] ⚠ Korjaus huononsi (${result.testsPassed}/${result.testsTotal} < ${bestPassed}) — palautetaan paras versio`);
        files = { ...bestFiles };
        // Restore all counts of the best run, not only testsPassed, so that
        // passed/failed/total stay consistent with the files that are kept.
        if (bestCounts) Object.assign(result, bestCounts);
        break;
      }

      // All tests pass → done
      if (result.testsTotal > 0 && result.testsPassed === result.testsTotal) break;

      // Last round, no further fix attempt possible
      if (testRound >= MAX_TEST_FIX) {
        if (result.testsTotal === 0) result.error = 'Testit kaatuivat';
        break;
      }

      // Self-repair: feed the errors and the code back to the model
      const errorLines = testOut.split('\n').filter(l => /^E |FAILED|ERROR|error\[E|--- FAIL|panic:|\.go:\d+/.test(l)).slice(0, 20).join('\n');
      if (!errorLines) break; // No parsable errors

      console.log(` [5/5] Itsekorjaus: ${result.testsFailed || 'virhe'}...`);
      const allCode = renderFiles(files);
      const fixPrompt = `The following test errors occurred. Fix the code so ALL tests pass. Return ALL files with === markers.\n\nERRORS:\n${errorLines}\n\nCURRENT CODE:\n${allCode}`;
      const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, codeTokens);
      timings.push(fixResp);

      mergeRequiredFiles(files, parseGeneratedFiles(fixResp.text));
      testFixRounds++;
    }

    // Write the best version to disk
    writeProjectFiles(bestPassed >= 0 ? bestFiles : files);
    // fixRounds = test-driven fixes only (stage-4 fixes are a separate phase)
    result.fixRounds = testFixRounds;
  } catch (e) {
    result.error = e.message;
  } finally {
    // Summary — in `finally` so the early `return result` paths above also
    // get their timings, score, stars and profile filled in.
    result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
    result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
    result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
    result.score = scoreResult(result);
    result.stars = starsForScore(result.score);
    result.profile = profile;
    result.promptName = promptName;
  }

  return result;
}
|
||
|
||
// === GPU-muistin tyhjennys ===
|
||
/**
 * Ask Ollama to unload every currently loaded model (best effort).
 *
 * Lists the loaded models via `/api/ps` and sends each a generate request
 * with `keep_alive: 0`. Failures are reported as a warning and never thrown,
 * because the benchmark can proceed without this step.
 *
 * @returns {Promise<void>}
 */
async function clearVram() {
  try {
    const psResp = await fetch(`${OLLAMA_URL}/api/ps`);
    if (!psResp.ok) throw new Error(`HTTP ${psResp.status}`);
    const psData = await psResp.json();
    for (const m of (psData.models || [])) {
      await fetch(`${OLLAMA_URL}/api/generate`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ model: m.name, keep_alive: 0 }),
      });
      console.log(` ♻ Vapautettu: ${m.name}`);
    }
  } catch (e) {
    // Not critical — report it instead of hiding it, then carry on.
    console.warn(` ⚠ VRAM-tyhjennys ohitettu: ${e.message}`);
  }
}
|
||
|
||
// === Main ===
|
||
/**
 * Median of a list of numbers; for an even count the mean of the two middle
 * values is rounded to the nearest integer with `Math.round`.
 *
 * @param {number[]} values - Non-empty list of numbers (not mutated).
 * @returns {number} The (rounded) median.
 */
function roundedMedian(values) {
  const sorted = [...values].sort((a, b) => a - b);
  const mid = Math.floor(sorted.length / 2);
  return sorted.length % 2 ? sorted[mid] : Math.round((sorted[mid - 1] + sorted[mid]) / 2);
}

/**
 * Embed the results into the HTML report template by replacing the first
 * occurrence of the `/*DATA_PLACEHOLDER*\/[]` marker with the results as JSON.
 *
 * @param {string} template - Contents of the report template.
 * @param {unknown} results - JSON-serializable benchmark results.
 * @returns {string} The template with the data embedded (unchanged when the
 *   marker is absent).
 */
function renderReportHtml(template, results) {
  // Escape "<" so that text such as "</script>" inside the data cannot end the
  // surrounding script element; "\u003c" decodes back to "<" in JS and JSON.
  const payload = JSON.stringify(results).replace(/</g, '\\u003c');
  // A replacer function is used because a replacement *string* would have
  // "$&", "$'", "$`" and "$$" in the data interpreted as special patterns.
  return template.replace('/*DATA_PLACEHOLDER*/[]', () => payload);
}

/**
 * Benchmark entry point.
 *
 * Lists the Ollama models (optionally narrowed by FILTER_MODELS), selects the
 * scenarios, wipes and recreates OUTPUT_DIR, then runs `runPipeline` for every
 * round x model x scenario combination. `results.json` is rewritten after each
 * run. Afterwards it prints the result tables, writes `report.html` when the
 * template file exists, and copies the JSON/HTML into RESULTS_DIR under the
 * run's timestamp.
 *
 * Exits the process with status 1 when the model list cannot be fetched.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('╔══════════════════════════════════════════════╗');
  console.log('║ Kipinä CodeBench ║');
  console.log('╚══════════════════════════════════════════════╝');
  console.log(`Ollama: ${OLLAMA_URL} 📝 ${LANG}${COMPACT_MODE ? ' (compact)' : ''}${THINK_MODE ? ' 🧠 thinking ON' : ''}`);

  // Fetch the available models.
  let models;
  try {
    models = await ollamaListModels();
  } catch (e) {
    console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`);
    process.exit(1);
  }

  if (FILTER_MODELS) {
    // Drop empty tokens (e.g. from a trailing comma): an empty string is a
    // substring of every model name and would silently disable the filter.
    const filter = FILTER_MODELS.split(',').map(s => s.trim()).filter(Boolean);
    models = models.filter(m => filter.some(f => m.includes(f)));
  }

  console.log(`Mallit (${models.length}): ${models.join(', ')}`);

  // 'all' selects every scenario; otherwise match by id and fall back to the
  // first scenario when nothing matches.
  const matchingScenarios = SCENARIOS.filter(s => s.id === SCENARIO_FILTER);
  const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS :
    matchingScenarios.length > 0 ? matchingScenarios :
    [SCENARIOS[0]];
  const scenarioIds = scenarios.map(s => s.id);
  console.log(`Skenaariot (${scenarios.length}): ${scenarioIds.join(', ')}`);
  if (ROUNDS > 1) console.log(`Toistoja: ${ROUNDS}`);
  console.log(`Tulokset: ${OUTPUT_DIR}/`);
  console.log('');

  // Start from a clean output directory.
  rmSync(OUTPUT_DIR, { recursive: true, force: true });
  mkdirSync(OUTPUT_DIR, { recursive: true });

  const results = [];

  for (let round = 1; round <= ROUNDS; round++) {
    await clearVram();
    if (ROUNDS > 1) console.log(`\n╔═══ Kierros ${round}/${ROUNDS} ═══╗`);
    for (const model of models) {
      for (const scenario of scenarios) {
        const roundLabel = ROUNDS > 1 ? ` [${round}/${ROUNDS}]` : '';
        console.log(`\n━━━ ${model} × ${scenario.id}${roundLabel} ━━━`);
        const r = await runPipeline(model, scenario, round);
        if (ROUNDS > 1) r.round = round;
        results.push(r);

        // Persist after every run so partial results survive an aborted benchmark.
        writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));

        const status = r.error ? `✗ ${r.error}` :
          r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` :
          `◐ ${r.testsPassed}/${r.testsTotal}`;
        const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
        console.log(` → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
      }
    }

    // Interim report after each round.
    if (ROUNDS > 1) {
      console.log(`\n┌─── Tilanne kierroksen ${round}/${ROUNDS} jälkeen ───┐`);
      for (const model of [...new Set(results.map(r => r.model))]) {
        const mrs = results.filter(r => r.model === model);
        for (const sid of scenarioIds) {
          const runs = mrs.filter(r => r.scenario === sid);
          if (runs.length === 0) continue;
          const scores = runs.map(r => r.score);
          const med = roundedMedian(scores);
          const last = scores[scores.length - 1];
          // Trend compares the latest score with the one from the previous run.
          const trend = scores.length > 1 ? (last > scores[scores.length - 2] ? '▲' : last < scores[scores.length - 2] ? '▼' : '─') : '';
          console.log(`│ ${model.padEnd(28)} ${sid.padEnd(7)} ${starsForScore(med)} med:${String(med).padStart(3)}p [${scores.join(',')}] ${trend}`);
        }
      }
      console.log(`└${'─'.repeat(45)}┘`);
    }
  } // rounds

  // === Results table ===
  console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
  console.log('║ TULOKSET ║');
  console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣');

  const header = [
    'Malli'.padEnd(40),
    'Skenaario'.padEnd(10),
    'Speksi'.padEnd(8),
    'Testit'.padEnd(10),
    'Korjaus'.padEnd(8),
    'Ctx'.padEnd(7),
    'Aika'.padEnd(8),
    'tok/s'.padEnd(8),
    'Pisteet',
  ].join(' │ ');
  console.log(`║ ${header} ║`);
  console.log('╠' + '═'.repeat(header.length + 2) + '╣');

  for (const r of results) {
    const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗';
    const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
    const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
    const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
    const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
    const speed = `${r.avgTokPerSec.toFixed(0)}`;
    const row = [
      r.model.padEnd(40),
      r.scenario.padEnd(10),
      specStatus.padEnd(8),
      testStatus.padEnd(10),
      fixStatus.padEnd(8),
      ctx.padEnd(7),
      time.padEnd(8),
      speed.padEnd(8),
      `${r.stars} ${r.score}`,
    ].join(' │ ');
    console.log(`║ ${row} ║`);
  }
  console.log('╚' + '═'.repeat(header.length + 2) + '╝');

  // === Per-model summary ===
  const modelNames = [...new Set(results.map(r => r.model))];

  console.log('\n');
  const mHeader = [
    'Malli'.padEnd(35),
    ...scenarioIds.map(s => s.padEnd(22)),
    'Yht.'.padEnd(8),
    'Out'.padEnd(7),
    'Aika'.padEnd(8),
    'tok/s'.padEnd(7),
    'Pisteet',
  ].join(' │ ');
  console.log(mHeader);
  console.log('─'.repeat(mHeader.length));

  for (const model of modelNames) {
    const mrs = results.filter(r => r.model === model);
    // Each scenario column shows the first recorded run for that scenario.
    const cols = scenarioIds.map(sid => {
      const r = mrs.find(r => r.scenario === sid);
      if (!r) return '-'.padEnd(22);
      const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
      const s = `${(r.totalDurationMs/1000).toFixed(0)}s`;
      const tok = r.totalTokens > 1000 ? `${(r.totalTokens/1000).toFixed(1)}K` : `${r.totalTokens}`;
      return `${t} ${s} ${tok}`.padEnd(22);
    });
    const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0);
    const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0);
    const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0);
    const modelTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0);
    const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0;
    const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0;
    const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0;
    const tokStr = totalTokens > 1000 ? `${(totalTokens/1000).toFixed(1)}K` : `${totalTokens}`;
    const row = [
      model.padEnd(35),
      ...cols,
      `${totalPassed}/${totalTests}`.padEnd(8),
      tokStr.padEnd(7),
      `${(modelTime/1000).toFixed(0)}s`.padEnd(8),
      `${avgSpeed}`.padEnd(7),
      `${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`,
    ].join(' │ ');
    console.log(row);
  }

  // Save the JSON and, when the template is present, the HTML report.
  const jsonData = JSON.stringify(results, null, 2);
  writeFileSync(`${OUTPUT_DIR}/results.json`, jsonData);
  const templatePath = join(__dirname, 'report-template.html');
  let htmlData = '';
  if (existsSync(templatePath)) {
    htmlData = renderReportHtml(readFileSync(templatePath, 'utf-8'), results);
    writeFileSync(`${OUTPUT_DIR}/report.html`, htmlData);
    console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`);
  }
  console.log(`JSON: ${OUTPUT_DIR}/results.json`);

  // Copy into the results/ folder under the run's timestamp.
  mkdirSync(RESULTS_DIR, { recursive: true });
  writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.json`), jsonData);
  if (htmlData) writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.html`), htmlData);
  console.log(`Arkistoitu: results/${TIMESTAMP}.json`);

  // Overall summary.
  const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
  const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
  const failed = results.filter(r => r.error || r.testsTotal === 0);
  const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
  const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0);
  console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`);

  // === Round summary (only when ROUNDS > 1) ===
  if (ROUNDS > 1) {
    console.log('\n\n╔══════════════════════════════════════════════╗');
    console.log('║ KIERROSYHTEENVETO (mediaani) ║');
    console.log('╚══════════════════════════════════════════════╝\n');

    for (const model of modelNames) {
      const mrs = results.filter(r => r.model === model);
      for (const sid of scenarioIds) {
        const runs = mrs.filter(r => r.scenario === sid);
        if (runs.length === 0) continue;
        const scores = runs.map(r => r.score);
        const med = roundedMedian(scores);
        const min = Math.min(...scores);
        const max = Math.max(...scores);
        const passRates = runs.map(r => r.testsTotal > 0 ? Math.round(r.testsPassed/r.testsTotal*100) : 0);
        console.log(`${model.padEnd(30)} ${sid.padEnd(8)} ${starsForScore(med)} med:${med}p min:${min} max:${max} pass:[${passRates.join(',')}]%`);
      }
    }
  }
}
|
||
|
||
// Script entry point: run the benchmark; on an unhandled rejection print the error and exit with status 1.
main().catch(e => { console.error(e); process.exit(1); });
|