Files
agentic-studio/kipina-codebench/benchmark.mjs
jaakko bb8be3ffb4 CodeBench: revert-if-worse + erillinen testFixRounds-laskuri
- Seurataan parasta testitulosta (bestPassed/bestFiles)
- Jos korjaus huonontaa: palautetaan paras versio ja lopetetaan
- fixRounds laskee vain testikorjaukset, ei cargo check -kierroksia
- Estää 4/7 → 0/1 regressiot korjaussilmukassa
2026-04-14 18:24:46 +03:00

813 lines
40 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
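/**
 * Kipinä CodeBench — LLM code-generation benchmark
 *
 * Generates FastAPI (Python) or Rust projects with Ollama models and runs the
 * generated tests (pytest / cargo test) inside a Docker container.
 *
 * Usage:
 *   node benchmark.mjs                               # all models, default scenario
 *   node benchmark.mjs --models qwen3-coder:30b      # a single model (comma-separated list allowed)
 *   node benchmark.mjs --ollama http://host:11434    # different Ollama endpoint
 *   node benchmark.mjs --hub http://host:port        # route requests through a hub instead of Ollama
 *   node benchmark.mjs --scenarios all               # all scenarios
 *   node benchmark.mjs --output ./results/run-001    # custom output directory
 *   node benchmark.mjs --lang rust                   # target language: python (default) | rust
 *   node benchmark.mjs --rounds 3                    # repeat the whole run N times
 *   node benchmark.mjs --think                       # enable thinking mode
 *   node benchmark.mjs --compact                     # use the compact golden template
 */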
import { execSync } from 'child_process';
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
const __dirname = dirname(fileURLToPath(import.meta.url));
// === CLI arguments ===
const args = process.argv.slice(2);

/**
 * Look up the value that follows `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} fallback - Returned when the option is absent or has no value.
 * @returns {string} The option value, or `fallback`.
 */
function arg(name, fallback) {
  const i = args.indexOf(`--${name}`);
  if (i < 0) return fallback;
  const value = args[i + 1];
  // A following token that is itself an option (e.g. `--output --think`) is
  // not a value; without this check it would be swallowed as one.
  if (!value || value.startsWith('--')) return fallback;
  return value;
}
// Endpoint of the Ollama server; CLI flag wins over the environment variable.
const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://127.0.0.1:11434');
// Optional hub endpoint; when set, chat and model listing go through the hub.
const HUB_URL = arg('hub', '');
// Comma-separated substrings used to filter the model list (empty = all models).
const FILTER_MODELS = arg('models', '');
const SCENARIO_FILTER = arg('scenarios', 'default');
// Minute-resolution timestamp that is safe to use in file names.
const TIMESTAMP = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 16);
const OUTPUT_DIR = arg('output', `/tmp/kipina-benchmark/${TIMESTAMP}`);
const RESULTS_DIR = join(__dirname, 'results');
const THINK_MODE = args.includes('--think');
const COMPACT_MODE = args.includes('--compact');
const LANG = arg('lang', 'python'); // python | rust
// Number of repetitions (intended range 1-10). An unparsable, zero or negative
// value falls back to a single round instead of silently running nothing.
const ROUNDS = Math.max(1, Number.parseInt(arg('rounds', '1'), 10) || 1);
const MAX_FIX_ROUNDS = 2;
// === Loading prompts from files ===
/**
 * Read `prompts/<name>.md` (relative to this script) and return it trimmed.
 *
 * @param {string} name - Prompt file name without the `.md` extension.
 * @returns {string} Trimmed prompt text.
 * @throws {Error} When the prompt file does not exist.
 */
function loadPrompt(name) {
  const promptFile = join(__dirname, 'prompts', `${name}.md`);
  if (existsSync(promptFile)) {
    return readFileSync(promptFile, 'utf-8').trim();
  }
  throw new Error(`Prompti puuttuu: ${promptFile}`);
}
// System prompts for the requirements, spec and fix stages (loaded in that order).
const [CLIENT_SYSTEM, SPEC_SYSTEM, FIX_SYSTEM] = ['client', 'spec', 'fix'].map((name) => loadPrompt(name));
// === Per-model profiles ===
const PROFILES = JSON.parse(
  readFileSync(join(__dirname, 'profiles.json'), { encoding: 'utf-8' }),
);
/**
 * Pick the golden-example markdown file for a model.
 *
 * Uses the model's `golden` entry from the profiles, defaulting to `todo.md`.
 * For Rust the `.md` suffix is swapped for `-rs.md` (todo.md → todo-rs.md),
 * but only when that file exists in the golden-examples directory.
 *
 * @param {string} model - Model name used as key into `PROFILES.models`.
 * @returns {string} File name relative to the golden-examples directory.
 */
function getGoldenForModel(model) {
  const baseFile = PROFILES.models[model]?.golden || 'todo.md';
  if (LANG !== 'rust') return baseFile;

  const rustFile = baseFile.replace(/\.md$/, '-rs.md');
  return existsSync(join(GOLDEN_DIR, rustFile)) ? rustFile : baseFile;
}
/**
 * Resolve the code-generation system prompt for a model.
 *
 * The prompt name comes from the model entry, then from the model's profile,
 * then defaults to `code`. Language-specific variants are tried first
 * (e.g. `code-small-rs` before `code-small`), followed by the generic `code`
 * prompts.
 *
 * @param {string} model - Model name used as key into `PROFILES.models`.
 * @returns {{system: string, promptName: string, profile: string}} Prompt
 *   text, the name of the prompt file that was used, and the profile name.
 */
function getCodePromptForModel(model) {
  const conf = PROFILES.models[model];
  const profile = conf?.profile || PROFILES.default_profile;
  const basePrompt = conf?.prompt || PROFILES.profiles[profile]?.prompt || 'code';
  const langSuffix = LANG === 'rust' ? '-rs' : '';
  const promptPath = (name) => join(__dirname, 'prompts', `${name}.md`);

  const hit = [`${basePrompt}${langSuffix}`, basePrompt, `code${langSuffix}`, 'code']
    .find((name) => existsSync(promptPath(name)));
  if (hit !== undefined) {
    return { system: readFileSync(promptPath(hit), 'utf-8').trim(), promptName: hit, profile };
  }
  // No candidate file found: loadPrompt throws if `code.md` is missing too.
  return { system: loadPrompt('code'), promptName: 'code', profile: 'large' };
}
// === Golden examples (per language) ===
const GOLDEN_DIR = join(__dirname, 'golden-examples');
// Per-language settings: golden-example directory, the project files (used
// both as the golden file list and as the set of required generated files),
// and the Docker image that runs the tests.
const LANG_CONFIG = Object.fromEntries(
  [
    ['python', 'todo', 'kipina-pytest',
      ['models.py', 'schemas.py', 'main.py', 'test_main.py']],
    ['rust', 'todo-rs', 'kipina-cargo-test',
      ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs']],
  ].map(([lang, goldenDir, dockerImage, sources]) => [
    lang,
    { goldenDir, files: [...sources], required: [...sources], dockerImage },
  ]),
);
// Unknown languages fall back to the Python configuration.
const LCONF = LANG_CONFIG[LANG] || LANG_CONFIG.python;
/**
 * Build the reference-implementation text that is prepended to code prompts.
 *
 * Lookup order:
 *   1. with `--compact`: the compact template in `prompts/`, if present;
 *   2. the golden markdown file (model-specific when `model` is given);
 *   3. the individual files of the language's golden directory, concatenated
 *      with `=== name ===` markers.
 *
 * @param {string} [model] - Model name; selects a model-specific golden file.
 * @returns {string} Example text, or '' when no golden material exists.
 */
function loadGoldenExample(model) {
  const isRust = LANG === 'rust';
  const readWrapped = (file) => '\n' + readFileSync(file, 'utf-8').trim() + '\n';

  if (COMPACT_MODE) {
    const compactPath = join(__dirname, 'prompts', isRust ? 'golden-compact-rs.md' : 'golden-compact-py.md');
    if (existsSync(compactPath)) return readWrapped(compactPath);
  }

  let goldenFile;
  if (model) {
    goldenFile = getGoldenForModel(model);
  } else {
    goldenFile = isRust ? 'todo-rs.md' : 'todo.md';
  }
  const mdPath = join(GOLDEN_DIR, goldenFile);
  if (existsSync(mdPath)) return readWrapped(mdPath);

  // Fallback: stitch the example together from separate source files.
  const todoDir = join(GOLDEN_DIR, LCONF.goldenDir);
  if (!existsSync(todoDir)) return '';
  const pieces = [`\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n`];
  for (const f of LCONF.files) {
    const filePath = join(todoDir, f);
    if (!existsSync(filePath)) continue;
    pieces.push(`=== ${f} ===\n${readFileSync(filePath, 'utf-8').trim()}\n\n`);
  }
  return pieces.join('');
}
// === Stripping reasoning tags (gemma4, qwen3/3.5 etc.) ===
/**
 * Remove complete reasoning blocks from model output and trim the result.
 *
 * @param {string} text - Raw model output.
 * @returns {string} Output without `<|channel>thought…<channel|>` and
 *   `<think>…</think>` blocks.
 */
function stripThinking(text) {
  const reasoningBlocks = [
    /<\|channel>thought[\s\S]*?<channel\|>/g, // gemma4
    /<think>[\s\S]*?<\/think>/g, // qwen3, qwen3.5
  ];
  let cleaned = text;
  for (const pattern of reasoningBlocks) {
    cleaned = cleaned.replace(pattern, '');
  }
  return cleaned.trim();
}
// === Ollama / Hub client ===
/**
 * Send one chat request and return the cleaned answer with timing stats.
 *
 * When a hub URL is configured the request goes to the hub's completions
 * endpoint; otherwise it goes straight to Ollama's `/api/chat`.
 *
 * @param {string} model - Model name.
 * @param {string} prompt - User prompt.
 * @param {string} systemPrompt - System prompt (skipped on the direct route when empty).
 * @param {number} [maxTokens=2048] - Generation limit; tripled on the direct
 *   route when thinking mode is on.
 * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
 * @throws {Error} On a non-2xx HTTP response.
 */
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
  const start = Date.now();
  if (HUB_URL) {
    const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
    const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
    });
    if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
    const data = await resp.json();
    const elapsed = Date.now() - start;
    const tokens = data.tokens_generated || 0;
    return {
      text: stripThinking((data.response || '').trim()),
      tokens,
      durationMs: elapsed,
      // Guard against a zero elapsed time, which would yield Infinity/NaN.
      tokPerSec: data.tokens_per_sec || (elapsed > 0 ? tokens / (elapsed / 1000) : 0),
    };
  }
  // Direct Ollama route
  const messages = [];
  if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
  messages.push({ role: 'user', content: prompt });
  const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      messages,
      stream: false,
      think: THINK_MODE,
      options: { num_predict: THINK_MODE ? maxTokens * 3 : maxTokens, num_ctx: 16384, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
    }),
  });
  if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
  const data = await resp.json();
  const elapsed = Date.now() - start;
  const rawContent = (data.message?.content || '').trim();
  const thinking = (data.message?.thinking || '').trim();
  // When the model put everything into the thinking channel, use that text.
  const text = stripThinking(rawContent || thinking);
  const evalCount = data.eval_count || 0;
  if (!rawContent && thinking) console.log(` ⚠ thinking-malli: ${thinking.length} merkkiä ajattelua, content tyhjä`);
  // `eval_duration` is in nanoseconds. If the server omits it, fall back to
  // wall-clock time; the old 1 ns fallback produced absurd tok/s values that
  // distorted every average built on top of this number.
  const evalSeconds = data.eval_duration > 0 ? data.eval_duration / 1e9 : elapsed / 1000;
  const tokPerSec = evalSeconds > 0 ? evalCount / evalSeconds : 0;
  return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
}
/**
 * List the model names available on the hub (when configured) or on Ollama.
 *
 * @returns {Promise<string[]>} Model names; empty when the response has no `models`.
 * @throws {Error} On a non-2xx HTTP response.
 */
async function ollamaListModels() {
  const tagsUrl = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
  const response = await fetch(tagsUrl);
  if (!response.ok) {
    throw new Error(`Tags: HTTP ${response.status}`);
  }
  const payload = await response.json();
  const listed = payload.models || [];
  return listed.map((entry) => entry.name);
}
// === Parsing test results (pytest + cargo test) ===
/**
 * Extract pass/fail counts from pytest or cargo test output.
 *
 * @param {string} output - Combined stdout/stderr of the test run.
 * @returns {{testsPassed: number, testsFailed: number, testsTotal: number}}
 *   All zeros when nothing recognisable is found.
 */
function parseTestOutput(output) {
  const counts = (passed, failed) => ({ testsPassed: passed, testsFailed: failed, testsTotal: passed + failed });

  // Cargo test: "test result: ok. 10 passed; 0 failed;"
  // Checked first because the pytest patterns below also match these lines,
  // and summed over ALL summaries: cargo prints one per test binary, so the
  // first one is often an empty "0 passed; 0 failed" from the library crate.
  const cargoSummaries = [...output.matchAll(/test result: \w+\.\s*(\d+) passed;\s*(\d+) failed/g)];
  if (cargoSummaries.length > 0) {
    let passed = 0;
    let failed = 0;
    for (const [, p, f] of cargoSummaries) {
      passed += Number.parseInt(p, 10);
      failed += Number.parseInt(f, 10);
    }
    return counts(passed, failed);
  }

  // Pytest: "6 passed", "2 failed", "1 error" (errors count as failures)
  const pyPassed = output.match(/(\d+) passed/);
  const pyFailed = output.match(/(\d+) failed/);
  const pyError = output.match(/(\d+) error/);
  if (pyPassed || pyFailed) {
    const passed = pyPassed ? Number.parseInt(pyPassed[1], 10) : 0;
    const failed = (pyFailed ? Number.parseInt(pyFailed[1], 10) : 0) + (pyError ? Number.parseInt(pyError[1], 10) : 0);
    return counts(passed, failed);
  }

  // Cargo compilation error: each "error[E…]" occurrence counts as one failure
  const compileErrors = (output.match(/error\[E\d+\]/g) || []).length;
  if (compileErrors > 0) {
    return counts(0, compileErrors);
  }
  return counts(0, 0);
}
// === File parser for LLM responses ===
/**
 * Split an LLM response into files using `=== path ===` markers.
 *
 * Only `.py`, `.toml` and `.rs` markers are recognised. A surrounding
 * markdown code fence is stripped from each file body.
 *
 * The file names come from untrusted model output and are later used to write
 * files and to build shell commands, so anything that is not a plain relative
 * path (absolute paths, `..` segments, characters outside `[A-Za-z0-9_./-]`)
 * is dropped.
 *
 * @param {string} text - Raw model response.
 * @returns {Object<string, string>} Map of relative path → file content
 *   (each ending with a newline). Empty files are omitted.
 */
function parseGeneratedFiles(text) {
  const isSafeRelativePath = (name) =>
    /^[\w.\/-]+$/.test(name) && !name.startsWith('/') && !name.split('/').includes('..');

  const files = {};
  const sections = text.split(/===\s*(\S+\.(?:py|toml|rs))\s*===/);
  // split() with one capture group yields [preamble, name1, body1, name2, body2, …]
  for (let i = 1; i < sections.length - 1; i += 2) {
    const name = sections[i];
    if (!isSafeRelativePath(name)) continue;
    const content = sections[i + 1]
      .trim()
      // Accept any fence language tag (```rs, ```py, …), not just three names.
      .replace(/^```[\w+-]*\s*\n?/m, '')
      .replace(/\n?```\s*$/m, '')
      .trim();
    if (content) files[name] = content + '\n';
  }
  return files;
}
// === Validator ===
/**
 * Run cheap static checks on the generated Python files.
 *
 * Checks per `.py` file: relative imports, names imported from
 * models/schemas/main that do not occur in the source file, missing datetime
 * import in schemas.py, and JavaScript-style `true`/`false` literals.
 * These are text heuristics, not a real parser.
 *
 * @param {Object<string, string>} files - Map of file name → content.
 * @returns {string[]} Issue lines of the form `ISSUE: <file>[:<line>]: <message>`.
 */
function validateProjectCode(files) {
  const issues = [];
  for (const [fname, code] of Object.entries(files)) {
    if (!fname.endsWith('.py')) continue;
    const lines = code.split('\n');
    for (const line of lines) {
      if (/^from\s+\.(\w*)\s+import/.test(line)) issues.push(`ISSUE: ${fname}: relatiivinen import`);
    }
    for (const line of lines) {
      const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
      if (!m) continue;
      const srcCode = files[m[1] + '.py'];
      if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
      // Drop trailing comments, parentheses and line continuations so that
      // `import (A, B)  # note` yields the names A and B instead of "(A" and
      // "B)  # note", which were reported as missing.
      const importList = m[2].replace(/#.*$/, '').replace(/[()\\]/g, ' ');
      const names = importList.split(',').map(n => n.trim().split(/\s+as\s+/)[0].trim());
      for (const name of names) {
        // `*` is a wildcard, not a name to look up.
        if (name && name !== '*' && !srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
      }
    }
    if (fname === 'schemas.py') {
      if (/:\s*date\b/.test(code) && !/from datetime import/.test(code))
        issues.push('ISSUE: schemas.py: date-import puuttuu');
      if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code))
        issues.push('ISSUE: schemas.py: datetime-import puuttuu');
    }
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      // Skip comment-only and blank lines.
      if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue;
      if (/(?<!["\w])false(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "false" → "False"`);
      if (/(?<!["\w])true(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "true" → "True"`);
    }
  }
  return issues;
}
/**
 * Extract the first parsable top-level JSON object from model output.
 *
 * If the text contains a fenced code block, only the first fence's content is
 * scanned. The scan matches braces by depth (it does not understand braces
 * inside string literals) and tries `JSON.parse` on each balanced candidate.
 *
 * @param {string} text - Raw model response.
 * @returns {object|null} Parsed object, or null when none parses.
 */
function extractJson(text) {
  const fenced = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
  const body = fenced ? fenced[1].trim() : text;
  let depth = 0;
  let start = null;
  for (let i = 0; i < body.length; i++) {
    const ch = body[i];
    if (ch === '{') {
      if (depth === 0) start = i;
      depth++;
    } else if (ch === '}') {
      // A stray closing brace must not push depth below zero; otherwise no
      // later '{' is ever seen at depth 0 and the real object is skipped.
      if (depth === 0) continue;
      depth--;
      if (depth === 0 && start !== null) {
        try {
          return JSON.parse(body.slice(start, i + 1));
        } catch {
          // Not valid JSON — keep scanning for the next candidate.
        }
      }
    }
  }
  return null;
}
// === Test scenarios ===
// Each scenario is an id plus the one-line project brief sent to the model.
const SCENARIOS = [
  ['todo', 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status'],
  ['users', 'REST API käyttäjähallinnalle SQLite-tietokannalla'],
  ['blog', 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status'],
].map(([id, prompt]) => ({ id, prompt }));
// === Scoring (0-100) and star rating ===
/**
 * Score one pipeline result.
 *
 * Breakdown: 10 points for a valid spec, 10 for getting as far as runnable
 * tests, up to 60 for the share of passing tests, and 20 minus 10 per fix
 * round (never below 0). A run that errored without any test results scores 0.
 *
 * @param {object} r - Result record (`error`, `specOk`, `testsTotal`,
 *   `testsPassed`, `fixRounds`).
 * @returns {number} Score capped at 100.
 */
function scoreResult(r) {
  const hasTests = r.testsTotal > 0;
  if (r.error && r.testsTotal === 0) return 0;

  const specPoints = r.specOk ? 10 : 0;
  const ranPoints = (!r.error || hasTests) ? 10 : 0;
  const testPoints = hasTests ? Math.round((r.testsPassed / r.testsTotal) * 60) : 0;
  const fixPoints = Math.max(0, 20 - r.fixRounds * 10);
  return Math.min(100, specPoints + ranPoints + testPoints + fixPoints);
}
/**
 * Map a numeric score to a five-star rating string.
 *
 * @param {number} score - Score, normally 0-100.
 * @returns {string} Five characters of filled/empty stars.
 */
function starsForScore(score) {
  const tiers = [
    [90, '★★★★★'],
    [70, '★★★★☆'],
    [50, '★★★☆☆'],
    [25, '★★☆☆☆'],
  ];
  for (const [threshold, stars] of tiers) {
    if (score >= threshold) return stars;
  }
  return score > 0 ? '★☆☆☆☆' : '☆☆☆☆☆';
}
// === Pipeline: one model × one scenario ===
/**
 * Run the full benchmark pipeline for a single model and scenario.
 *
 * Stages: (1) requirements text, (2) JSON spec, (3) code generation (one
 * entity at a time for "small"-profile models on Python), (4) static
 * validation / cargo check with LLM fix rounds, (5) tests in Docker with a
 * self-repair loop that keeps the best-scoring version of the files.
 *
 * All artefacts are written under `<OUTPUT_DIR>/<model>__<scenario>`.
 * Errors inside the pipeline are caught and stored in `result.error`.
 *
 * @param {string} model - Model name.
 * @param {{id: string, prompt: string}} scenario - Scenario to run.
 * @returns {Promise<object>} Result record with test counts, timings, score and stars.
 */
async function runPipeline(model, scenario) {
  const result = {
    model, scenario: scenario.id,
    reqOk: false, specOk: false, specEntities: 0,
    validationIssues: 0, fixRounds: 0,
    testsTotal: 0, testsPassed: 0, testsFailed: 0,
    totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
    promptChars: 0, promptTokensEst: 0,
    score: 0, stars: '',
    error: null,
  };
  const timings = [];
  const { system: CODE_SYSTEM, promptName, profile } = getCodePromptForModel(model);
  const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
  mkdirSync(dir, { recursive: true });
  try {
    // 1. Requirements
    console.log(` [1/5] Vaatimukset...`);
    const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 2048);
    timings.push(req);
    if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
    result.reqOk = true;
    writeFileSync(`${dir}/_requirements.txt`, req.text);

    // 2. JSON spec
    console.log(` [2/5] JSON-speksi...`);
    const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 4096);
    timings.push(specResp);
    const spec = extractJson(specResp.text);
    if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'JSON-speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; }
    result.specOk = true;
    result.specEntities = spec.entities.length;
    writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));

    // 3. LLM code generation
    const fileCount = LCONF.required.length;
    const goldenExample = loadGoldenExample(model);
    const codeTokens = LANG === 'rust' ? 12288 : 8192;
    let files;
    // Orchestration: feed small models one entity at a time
    if (profile === 'small' && spec.entities.length > 1 && LANG === 'python') {
      console.log(` [3/5] Koodigenerointi (orkestroitu, ${spec.entities.length} entiteettiä)...`);
      files = {};
      let cumulativeCode = '';
      for (let ei = 0; ei < spec.entities.length; ei++) {
        const entity = spec.entities[ei];
        const isFirst = ei === 0;
        // Spec restricted to the entities handled so far, with only the
        // relationships that start from one of them.
        const entitySpec = {
          ...spec,
          entities: spec.entities.slice(0, ei + 1),
          relationships: (spec.relationships || []).filter(r =>
            spec.entities.slice(0, ei + 1).some(e => e.name === r.from)
          ),
        };
        let entityPrompt;
        if (isFirst) {
          entityPrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(entitySpec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files for the entity "${entity.name}". Follow the reference implementation patterns exactly.`;
        } else {
          entityPrompt = `${goldenExample}\n---\n\nEXISTING CODE (do not regenerate, only add to it):\n${cumulativeCode}\n\n---\n\nJSON SPECIFICATION (add entity "${entity.name}"):\n${JSON.stringify(entitySpec, null, 2)}\n\nAdd the entity "${entity.name}" to the existing code. Return ALL ${fileCount} files with === markers, including the existing entities. Follow the same patterns.`;
        }
        console.log(` [3/5] → ${entity.name}${isFirst ? '' : ' (+ ' + spec.entities.slice(0, ei).map(e => e.name).join(', ') + ')'}...`);
        const entityResp = await ollamaChat(model, entityPrompt, CODE_SYSTEM, codeTokens);
        timings.push(entityResp);
        const entityFiles = parseGeneratedFiles(entityResp.text);
        // Merge — the newer version of a file replaces the previous one
        for (const [fn, content] of Object.entries(entityFiles)) {
          files[fn] = content;
        }
        cumulativeCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
      }
      writeFileSync(`${dir}/_code_raw.txt`, cumulativeCode);
      result.promptChars = CODE_SYSTEM.length + cumulativeCode.length;
      result.promptTokensEst = Math.round(result.promptChars / 4);
    } else {
      // Normal: everything in one request
      console.log(` [3/5] Koodigenerointi (LLM)...`);
      const codePrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`;
      result.promptChars = CODE_SYSTEM.length + codePrompt.length;
      result.promptTokensEst = Math.round(result.promptChars / 4);
      const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, codeTokens);
      timings.push(codeResp);
      writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
      files = parseGeneratedFiles(codeResp.text);
    }
    const missing = LCONF.required.filter(f => !files[f]);
    if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; }

    // 4. Validation + fix loop
    let fixRound = 0;
    if (LANG === 'rust') {
      // Rust: cargo check in a Docker container before running the tests
      for (let checkRound = 0; checkRound < MAX_FIX_ROUNDS; checkRound++) {
        // Write the files to disk
        for (const [fn, content] of Object.entries(files)) {
          const filePath = join(dir, fn);
          mkdirSync(dirname(filePath), { recursive: true });
          writeFileSync(filePath, content);
        }
        console.log(` [4/5] Cargo check${checkRound > 0 ? ` (korjaus ${checkRound})` : ''}...`);
        let checkOut = '';
        try {
          checkOut = execSync(
            `docker run --rm --entrypoint sh -v "${dir}:/src:ro" ${LCONF.dockerImage} -c "cp -r /src/* . && cargo check 2>&1"`,
            { timeout: 300000, encoding: 'utf-8' }
          );
        } catch (e) {
          // A non-zero exit still carries the compiler output we need.
          checkOut = e.stdout || e.stderr || e.message || '';
        }
        const compileErrors = checkOut.split('\n').filter(l => /^error/.test(l));
        if (compileErrors.length === 0) break; // Compiles — continue to the tests
        console.log(` [4/5] ${compileErrors.length} käännösvirhettä — korjataan...`);
        fixRound++;
        const errorLines = checkOut.split('\n').filter(l => /^error|^\s+-->/.test(l)).slice(0, 30).join('\n');
        const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
        const fixPrompt = `Fix the following Rust compilation errors. Return ALL files with === markers.\n\nERRORS:\n${errorLines}\n\nCURRENT CODE:\n${allCode}`;
        const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, 12288);
        timings.push(fixResp);
        const fixedFiles = parseGeneratedFiles(fixResp.text);
        for (const [fn, content] of Object.entries(fixedFiles)) {
          if (LCONF.required.includes(fn)) files[fn] = content;
        }
      }
    }
    if (LANG === 'python') {
      let issues = validateProjectCode(files);
      while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
        fixRound++;
        console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
        // Group issues by file so each file is fixed in a single request.
        const issuesByFile = {};
        for (const issue of issues) {
          const m = issue.match(/^ISSUE:\s*(\S+?):/);
          const fname = m ? m[1] : 'unknown';
          if (!issuesByFile[fname]) issuesByFile[fname] = [];
          issuesByFile[fname].push(issue);
        }
        for (const [fname, fIssues] of Object.entries(issuesByFile)) {
          if (!files[fname]) continue;
          const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
          const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
          timings.push(fixResp);
          if (fixResp.text) {
            files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
          }
        }
        issues = validateProjectCode(files);
      }
      result.validationIssues = issues.length;
    }
    result.fixRounds = fixRound;

    // 5. Tests in a Docker container + self-repair loop
    const testLabel = LANG === 'rust' ? 'Cargo test' : 'Pytest';
    const dockerTimeout = LANG === 'rust' ? 300000 : 120000;
    const MAX_TEST_FIX = 3;
    let bestFiles = { ...files }; // Best version of the files so far
    let bestPassed = -1; // Best passed-test count so far
    let bestResult = null; // Full test counts belonging to bestFiles
    let testFixRounds = 0; // Separate counter for test-driven fixes
    for (let testRound = 0; testRound <= MAX_TEST_FIX; testRound++) {
      // Write the files to disk
      for (const [fn, content] of Object.entries(files)) {
        const filePath = join(dir, fn);
        mkdirSync(dirname(filePath), { recursive: true });
        writeFileSync(filePath, content);
      }
      // Quick static check before the (slow) Docker run
      const pyFiles = Object.keys(files).filter(f => f.endsWith('.py'));
      if (LANG === 'python' && pyFiles.length > 0) {
        let syntaxErrors = '';
        for (const f of pyFiles) {
          try {
            execSync(`python3 -c "import py_compile; py_compile.compile('${join(dir, f)}', doraise=True)"`, { timeout: 5000, encoding: 'utf-8', stdio: 'pipe' });
          } catch (e) {
            syntaxErrors += `${f}: ${(e.stderr || e.message || '').split('\n').filter(l => l.includes('Error')).join('; ')}\n`;
          }
        }
        if (syntaxErrors) {
          console.log(` [5/5] ⚠ Syntaksivirhe — ohitetaan Docker`);
          writeFileSync(`${dir}/_testout_${testRound}.txt`, `SYNTAX ERRORS:\n${syntaxErrors}`);
          Object.assign(result, { testsPassed: 0, testsFailed: 1, testsTotal: 1 });
          if (testRound >= MAX_TEST_FIX) { result.error = 'Syntaksivirhe'; break; }
          console.log(` [5/5] Itsekorjaus: syntaksi...`);
          const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
          const fixPrompt = `Fix the following syntax errors. Return ALL files with === markers.\n\nERRORS:\n${syntaxErrors}\n\nCURRENT CODE:\n${allCode}`;
          const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 8192);
          timings.push(fixResp);
          const fixedFiles = parseGeneratedFiles(fixResp.text);
          for (const [fn, content] of Object.entries(fixedFiles)) {
            if (LCONF.required.includes(fn)) files[fn] = content;
          }
          testFixRounds++;
          continue;
        }
      }
      const roundLabel = testRound > 0 ? ` (korjaus ${testRound}/${MAX_TEST_FIX})` : '';
      console.log(` [5/5] ${testLabel}${roundLabel}...`);
      let testOut = '';
      try {
        testOut = execSync(
          `docker run --rm -v "${dir}:/src:ro" ${LCONF.dockerImage} 2>&1`,
          { timeout: dockerTimeout, encoding: 'utf-8' }
        );
      } catch (e) {
        // Failing tests exit non-zero; the output is still what we parse.
        testOut = e.stdout || e.stderr || e.message || '';
      }
      writeFileSync(`${dir}/_testout_${testRound}.txt`, testOut);
      const testResult = parseTestOutput(testOut);
      Object.assign(result, testResult);

      // Track the best result — revert if a fix made things worse.
      // "Better" means more passing tests, or the same number of passes with
      // fewer failures (so a fully green run always replaces an older best).
      const improved = result.testsPassed > bestPassed ||
        (bestResult !== null && result.testsPassed === bestPassed && result.testsFailed < bestResult.testsFailed);
      if (improved) {
        bestPassed = result.testsPassed;
        bestResult = { ...testResult };
        bestFiles = { ...files };
      } else if (testRound > 0 && result.testsPassed < bestPassed) {
        console.log(` [5/5] ⚠ Korjaus huononsi (${result.testsPassed}/${result.testsTotal} < ${bestPassed}) — palautetaan paras versio`);
        files = { ...bestFiles };
        // Restore ALL counters of the best run. Restoring only testsPassed
        // left testsTotal/testsFailed from the worse run (e.g. 4/7 → "4/1").
        if (bestResult) Object.assign(result, bestResult);
        break;
      }
      // All tests pass → done
      if (result.testsTotal > 0 && result.testsPassed === result.testsTotal) break;
      // Last round, no more fix attempts left
      if (testRound >= MAX_TEST_FIX) {
        if (result.testsTotal === 0) result.error = 'Testit kaatuivat';
        break;
      }
      // Self-repair: feed the errors plus the code back to the model
      const errorLines = testOut.split('\n').filter(l => /^E |FAILED|ERROR|error\[E/.test(l)).slice(0, 20).join('\n');
      if (!errorLines) break; // No parsable errors
      console.log(` [5/5] Itsekorjaus: ${result.testsFailed || 'virhe'}...`);
      const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n');
      const fixPrompt = `The following test errors occurred. Fix the code so ALL tests pass. Return ALL files with === markers.\n\nERRORS:\n${errorLines}\n\nCURRENT CODE:\n${allCode}`;
      const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, LANG === 'rust' ? 12288 : 8192);
      timings.push(fixResp);
      const fixedFiles = parseGeneratedFiles(fixResp.text);
      for (const [fn, content] of Object.entries(fixedFiles)) {
        if (LCONF.required.includes(fn)) files[fn] = content;
      }
      testFixRounds++;
    }
    // Write the best version to disk
    for (const [fn, content] of Object.entries(bestPassed >= 0 ? bestFiles : files)) {
      const filePath = join(dir, fn);
      mkdirSync(dirname(filePath), { recursive: true });
      writeFileSync(filePath, content);
    }
    // fixRounds = test-driven fixes only (stage 4 fixes are counted separately)
    result.fixRounds = testFixRounds;
  } catch (e) {
    result.error = e.message;
  } finally {
    // Summary — in `finally` so that runs which bail out early still report
    // the time and tokens they consumed, plus a score, stars and profile.
    result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
    result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
    result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
    result.score = scoreResult(result);
    result.stars = starsForScore(result.score);
    result.profile = profile;
    result.promptName = promptName;
  }
  return result;
}
// === Freeing GPU memory ===
/**
 * Ask Ollama to unload every currently loaded model.
 *
 * Lists the running models via `/api/ps` and sends each one a generate
 * request with `keep_alive: 0`. Best effort: any failure is ignored.
 *
 * @returns {Promise<void>}
 */
async function clearVram() {
  try {
    const running = await (await fetch(`${OLLAMA_URL}/api/ps`)).json();
    const loaded = running.models || [];
    for (const entry of loaded) {
      const unloadRequest = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ model: entry.name, keep_alive: 0 }),
      };
      await fetch(`${OLLAMA_URL}/api/generate`, unloadRequest);
      console.log(` ♻ Vapautettu: ${entry.name}`);
    }
  } catch (e) { /* not critical */ }
}
// === Main ===
/**
 * Entry point: resolve models and scenarios, run every model × scenario
 * combination for the configured number of rounds, print the result tables
 * and write the JSON / HTML reports.
 *
 * Note: the output directory is deleted and recreated before the run.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Median of a numeric array (the two middle values are averaged and rounded).
  const median = arr => { const s = [...arr].sort((a,b) => a-b); const m = Math.floor(s.length/2); return s.length % 2 ? s[m] : Math.round((s[m-1]+s[m])/2); };

  console.log('╔══════════════════════════════════════════════╗');
  console.log('║ Kipinä CodeBench ║');
  console.log('╚══════════════════════════════════════════════╝');
  console.log(`Ollama: ${OLLAMA_URL} 📝 ${LANG}${COMPACT_MODE ? ' (compact)' : ''}${THINK_MODE ? ' 🧠 thinking ON' : ''}`);
  // Fetch the model list
  let models;
  try {
    models = await ollamaListModels();
  } catch (e) {
    console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`);
    process.exit(1);
  }
  if (FILTER_MODELS) {
    const filter = FILTER_MODELS.split(',').map(s => s.trim());
    models = models.filter(m => filter.some(f => m.includes(f)));
  }
  console.log(`Mallit (${models.length}): ${models.join(', ')}`);
  // 'all' → every scenario; a known id → that scenario; anything else → the first one.
  const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS :
    SCENARIOS.filter(s => s.id === SCENARIO_FILTER).length > 0 ? SCENARIOS.filter(s => s.id === SCENARIO_FILTER) :
    [SCENARIOS[0]];
  console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
  if (ROUNDS > 1) console.log(`Toistoja: ${ROUNDS}`);
  console.log(`Tulokset: ${OUTPUT_DIR}/`);
  console.log('');
  // Clean the output directory
  rmSync(OUTPUT_DIR, { recursive: true, force: true });
  mkdirSync(OUTPUT_DIR, { recursive: true });
  const results = [];
  for (let round = 1; round <= ROUNDS; round++) {
    await clearVram();
    if (ROUNDS > 1) console.log(`\n╔═══ Kierros ${round}/${ROUNDS} ═══╗`);
    for (const model of models) {
      for (const scenario of scenarios) {
        const roundLabel = ROUNDS > 1 ? ` [${round}/${ROUNDS}]` : '';
        console.log(`\n━━━ ${model} × ${scenario.id}${roundLabel} ━━━`);
        const r = await runPipeline(model, scenario);
        if (ROUNDS > 1) r.round = round;
        results.push(r);
        // Persist the results after every run so a crash loses nothing
        writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
        const status = r.error ? `${r.error}` : `${r.testsPassed}/${r.testsTotal}`;
        const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
        console.log(`${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
      }
    }
    // Interim report after each round
    if (ROUNDS > 1) {
      console.log(`\n┌─── Tilanne kierroksen ${round}/${ROUNDS} jälkeen ───┐`);
      for (const model of [...new Set(results.map(r => r.model))]) {
        const mrs = results.filter(r => r.model === model);
        for (const sid of scenarios.map(s => s.id)) {
          const runs = mrs.filter(r => r.scenario === sid);
          if (runs.length === 0) continue;
          const scores = runs.map(r => r.score);
          const med = median(scores);
          const last = scores[scores.length - 1];
          // Trend compares the latest score with the one before it.
          const trend = scores.length > 1 ? (last > scores[scores.length - 2] ? '▲' : last < scores[scores.length - 2] ? '▼' : '─') : '';
          console.log(`${model.padEnd(28)} ${sid.padEnd(7)} ${starsForScore(med)} med:${String(med).padStart(3)}p [${scores.join(',')}] ${trend}`);
        }
      }
      console.log(`${'─'.repeat(45)}`);
    }
  } // rounds

  // === Results table ===
  console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
  console.log('║ TULOKSET ║');
  console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣');
  const header = [
    'Malli'.padEnd(40),
    'Skenaario'.padEnd(10),
    'Speksi'.padEnd(8),
    'Testit'.padEnd(10),
    'Korjaus'.padEnd(8),
    'Ctx'.padEnd(7),
    'Aika'.padEnd(8),
    'tok/s'.padEnd(8),
    'Pisteet',
  ].join(' │ ');
  console.log(`${header}`);
  console.log('╠' + '═'.repeat(header.length + 2) + '╣');
  for (const r of results) {
    const specStatus = r.specOk ? `${r.specEntities}e` : '✗';
    const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
    const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
    const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
    const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
    const speed = `${r.avgTokPerSec.toFixed(0)}`;
    const row = [
      r.model.padEnd(40),
      r.scenario.padEnd(10),
      specStatus.padEnd(8),
      testStatus.padEnd(10),
      fixStatus.padEnd(8),
      ctx.padEnd(7),
      time.padEnd(8),
      speed.padEnd(8),
      `${r.stars} ${r.score}`,
    ].join(' │ ');
    console.log(`${row}`);
  }
  console.log('╚' + '═'.repeat(header.length + 2) + '╝');

  // === Per-model summary ===
  const modelNames = [...new Set(results.map(r => r.model))];
  const scenarioIds = scenarios.map(s => s.id);
  console.log('\n');
  const mHeader = [
    'Malli'.padEnd(35),
    ...scenarioIds.map(s => s.padEnd(22)),
    'Yht.'.padEnd(8),
    'Out'.padEnd(7),
    'Aika'.padEnd(8),
    'tok/s'.padEnd(7),
    'Pisteet',
  ].join(' │ ');
  console.log(mHeader);
  console.log('─'.repeat(mHeader.length));
  for (const model of modelNames) {
    const mrs = results.filter(r => r.model === model);
    const cols = scenarioIds.map(sid => {
      const r = mrs.find(r => r.scenario === sid);
      if (!r) return '-'.padEnd(22);
      const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
      const s = `${(r.totalDurationMs/1000).toFixed(0)}s`;
      const tok = r.totalTokens > 1000 ? `${(r.totalTokens/1000).toFixed(1)}K` : `${r.totalTokens}`;
      return `${t} ${s} ${tok}`.padEnd(22);
    });
    const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0);
    const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0);
    const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0);
    const totalTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0);
    const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0;
    const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0;
    const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0;
    const tokStr = totalTokens > 1000 ? `${(totalTokens/1000).toFixed(1)}K` : `${totalTokens}`;
    const row = [
      model.padEnd(35),
      ...cols,
      `${totalPassed}/${totalTests}`.padEnd(8),
      tokStr.padEnd(7),
      `${(totalTime/1000).toFixed(0)}s`.padEnd(8),
      `${avgSpeed}`.padEnd(7),
      `${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`,
    ].join(' │ ');
    console.log(row);
  }

  // Save the JSON + HTML report
  const jsonData = JSON.stringify(results, null, 2);
  writeFileSync(`${OUTPUT_DIR}/results.json`, jsonData);
  const templatePath = join(__dirname, 'report-template.html');
  let htmlData = '';
  if (existsSync(templatePath)) {
    // Use a replacer function: with a plain string, sequences such as `$&`,
    // `$1` or `$$` inside the JSON (e.g. in captured error messages) would be
    // interpreted as replacement patterns and corrupt the embedded data.
    htmlData = readFileSync(templatePath, 'utf-8').replace('/*DATA_PLACEHOLDER*/[]', () => JSON.stringify(results));
    writeFileSync(`${OUTPUT_DIR}/report.html`, htmlData);
    console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`);
  }
  console.log(`JSON: ${OUTPUT_DIR}/results.json`);
  // Copy into the results/ folder under the run timestamp
  mkdirSync(RESULTS_DIR, { recursive: true });
  writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.json`), jsonData);
  if (htmlData) writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.html`), htmlData);
  console.log(`Arkistoitu: results/${TIMESTAMP}.json`);

  // Overall summary
  const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
  const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
  const failed = results.filter(r => r.error || r.testsTotal === 0);
  const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
  const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0);
  console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`);

  // === Round summary (when rounds > 1) ===
  if (ROUNDS > 1) {
    console.log('\n\n╔══════════════════════════════════════════════╗');
    console.log('║ KIERROSYHTEENVETO (mediaani) ║');
    console.log('╚══════════════════════════════════════════════╝\n');
    for (const model of modelNames) {
      const mrs = results.filter(r => r.model === model);
      for (const sid of scenarioIds) {
        const runs = mrs.filter(r => r.scenario === sid);
        if (runs.length === 0) continue;
        const scores = runs.map(r => r.score);
        const med = median(scores);
        const min = Math.min(...scores);
        const max = Math.max(...scores);
        const passRates = runs.map(r => r.testsTotal > 0 ? Math.round(r.testsPassed/r.testsTotal*100) : 0);
        console.log(`${model.padEnd(30)} ${sid.padEnd(8)} ${starsForScore(med)} med:${med}p min:${min} max:${max} pass:[${passRates.join(',')}]%`);
      }
    }
  }
}
main().catch(e => { console.error(e); process.exit(1); });