Siirrä kipina-codebench projektin päätasolle

This commit is contained in:
2026-04-14 09:44:14 +03:00
parent b93ae2fd1b
commit 7b27800390
24 changed files with 0 additions and 0 deletions

View File

@@ -0,0 +1,490 @@
#!/usr/bin/env node
/**
* Kipinä CodeBench — LLM-koodingenerointibenchmark
*
* Generoi FastAPI-projekteja Ollama-malleilla ja testaa pytest:llä Docker-kontissa.
*
* Käyttö:
* node benchmark.mjs # kaikki mallit, oletusskenaario
* node benchmark.mjs --models qwen3-coder:30b # yksi malli
* node benchmark.mjs --ollama http://host:11434 # eri Ollama
* node benchmark.mjs --scenarios all # kaikki skenaariot
* node benchmark.mjs --output ./results/run-001 # custom output-hakemisto
*/
import { execSync } from 'child_process';
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync, readdirSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
const __dirname = dirname(fileURLToPath(import.meta.url));
// === CLI-argumentit ===
const args = process.argv.slice(2);
function arg(name, fallback) {
const i = args.indexOf(`--${name}`);
return i >= 0 && args[i + 1] ? args[i + 1] : fallback;
}
const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434');
const HUB_URL = arg('hub', '');
const FILTER_MODELS = arg('models', '');
const SCENARIO_FILTER = arg('scenarios', 'default');
const OUTPUT_DIR = arg('output', '/tmp/kipina-benchmark');
const MAX_FIX_ROUNDS = 2;
// === Promptien lataus tiedostoista ===
function loadPrompt(name) {
const path = join(__dirname, 'prompts', `${name}.md`);
if (!existsSync(path)) throw new Error(`Prompti puuttuu: ${path}`);
return readFileSync(path, 'utf-8').trim();
}
const CLIENT_SYSTEM = loadPrompt('client');
const SPEC_SYSTEM = loadPrompt('spec');
const CODE_SYSTEM = loadPrompt('code');
const FIX_SYSTEM = loadPrompt('fix');
// === Kultaisten esimerkkien lataus ===
const GOLDEN_DIR = join(__dirname, 'golden-examples');
const GOLDEN_PY_FILES = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];
function loadGoldenExample() {
const todoDir = join(GOLDEN_DIR, 'todo');
if (!existsSync(todoDir)) return '';
let example = '\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n';
for (const f of GOLDEN_PY_FILES) {
const path = join(todoDir, f);
if (existsSync(path)) example += `=== ${f} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`;
}
return example;
}
const GOLDEN_EXAMPLE = loadGoldenExample();
// === Ajattelutagien siivous (gemma4, qwen3/3.5 ym.) ===
function stripThinking(text) {
return text
.replace(/<\|channel>thought[\s\S]*?<channel\|>/g, '') // gemma4
.replace(/<think>[\s\S]*?<\/think>/g, '') // qwen3, qwen3.5
.trim();
}
// === Ollama / Hub -client ===
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
const start = Date.now();
if (HUB_URL) {
const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
});
if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
const data = await resp.json();
const elapsed = Date.now() - start;
return {
text: stripThinking((data.response || '').trim()),
tokens: data.tokens_generated || 0,
durationMs: elapsed,
tokPerSec: data.tokens_per_sec || (data.tokens_generated || 0) / (elapsed / 1000),
};
}
// Suora Ollama-reitti
const messages = [];
if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
messages.push({ role: 'user', content: prompt });
const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model,
messages,
stream: false,
think: false,
options: { num_predict: maxTokens, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
}),
});
if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
const data = await resp.json();
const elapsed = Date.now() - start;
const rawContent = (data.message?.content || '').trim();
const thinking = (data.message?.thinking || '').trim();
const text = stripThinking(rawContent || thinking);
const evalCount = data.eval_count || 0;
if (!rawContent && thinking) console.log(` ⚠ thinking-malli: ${thinking.length} merkkiä ajattelua, content tyhjä`);
const evalDurationNs = data.eval_duration || 1;
const tokPerSec = evalCount / (evalDurationNs / 1e9);
return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
}
async function ollamaListModels() {
const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
const resp = await fetch(url);
if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
const data = await resp.json();
return (data.models || []).map(m => m.name);
}
// === Tiedostoparseri LLM-vastauksesta ===
function parseGeneratedFiles(text) {
const files = {};
const sections = text.split(/===\s*(\S+\.(?:py|toml))\s*===/);
for (let i = 1; i < sections.length - 1; i += 2) {
const name = sections[i];
let content = sections[i + 1].trim();
content = content.replace(/^```(?:python|toml)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim();
if (content) files[name] = content + '\n';
}
return files;
}
// === Validaattori ===
function validateProjectCode(files) {
const issues = [];
for (const [fname, code] of Object.entries(files)) {
if (!fname.endsWith('.py')) continue;
const lines = code.split('\n');
for (const line of lines) {
if (/^from\s+\.(\w*)\s+import/.test(line)) issues.push(`ISSUE: ${fname}: relatiivinen import`);
}
for (const line of lines) {
const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
if (!m) continue;
const srcCode = files[m[1] + '.py'];
if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
const names = m[2].split(',').map(n => n.trim().split(/\s+as\s+/)[0].trim());
for (const name of names) {
if (name && !srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
}
}
if (fname === 'schemas.py') {
if (/:\s*date\b/.test(code) && !/from datetime import/.test(code))
issues.push('ISSUE: schemas.py: date-import puuttuu');
if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code))
issues.push('ISSUE: schemas.py: datetime-import puuttuu');
}
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue;
if (/(?<!["\w])false(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "false" → "False"`);
if (/(?<!["\w])true(?![\w"])/.test(line)) issues.push(`ISSUE: ${fname}:${i+1}: "true" → "True"`);
}
}
return issues;
}
function extractJson(text) {
const m = text.match(/```(?:json)?\s*\n([\s\S]*?)```/);
if (m) text = m[1].trim();
let depth = 0, start = null;
for (let i = 0; i < text.length; i++) {
if (text[i] === '{') { if (depth === 0) start = i; depth++; }
else if (text[i] === '}') { depth--; if (depth === 0 && start !== null) { try { return JSON.parse(text.slice(start, i+1)); } catch(e) { continue; } } }
}
return null;
}
// === Testiskenaariot ===
const SCENARIOS = [
{ id: 'todo', prompt: 'Todo-sovellus: tehtävien hallinta, deadline, prioriteetti ja status' },
{ id: 'users', prompt: 'REST API käyttäjähallinnalle SQLite-tietokannalla' },
{ id: 'blog', prompt: 'Blogi-API: kirjoittajat ja artikkelit, julkaisupäivämäärä ja status' },
];
// === Pisteytys (0100) ja tähtiluokitus ===
function scoreResult(r) {
if (r.error && r.testsTotal === 0) return 0;
let score = 0;
if (r.specOk) score += 10;
if (!r.error || r.testsTotal > 0) score += 10;
if (r.testsTotal > 0) score += Math.round((r.testsPassed / r.testsTotal) * 60);
score += Math.max(0, 20 - r.fixRounds * 10);
return Math.min(100, score);
}
function starsForScore(score) {
if (score >= 90) return '★★★★★';
if (score >= 70) return '★★★★☆';
if (score >= 50) return '★★★☆☆';
if (score >= 25) return '★★☆☆☆';
if (score > 0) return '★☆☆☆☆';
return '☆☆☆☆☆';
}
// === Pipeline: yhdelle mallille ja skenaariolle ===
async function runPipeline(model, scenario) {
const result = {
model, scenario: scenario.id,
reqOk: false, specOk: false, specEntities: 0,
validationIssues: 0, fixRounds: 0,
testsTotal: 0, testsPassed: 0, testsFailed: 0,
totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
promptChars: 0, promptTokensEst: 0,
score: 0, stars: '',
error: null,
};
const timings = [];
const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
mkdirSync(dir, { recursive: true });
try {
// 1. Vaatimukset
console.log(` [1/5] Vaatimukset...`);
const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 2048);
timings.push(req);
if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; }
result.reqOk = true;
writeFileSync(`${dir}/_requirements.txt`, req.text);
// 2. JSON-speksi
console.log(` [2/5] JSON-speksi...`);
const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 4096);
timings.push(specResp);
const spec = extractJson(specResp.text);
if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'JSON-speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; }
result.specOk = true;
result.specEntities = spec.entities.length;
writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2));
// 3. LLM-koodigenerointi
console.log(` [3/5] Koodigenerointi (LLM)...`);
const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all 4 files. Follow the reference implementation patterns exactly.`;
result.promptChars = CODE_SYSTEM.length + codePrompt.length;
result.promptTokensEst = Math.round(result.promptChars / 4);
const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, 8192);
timings.push(codeResp);
writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
const files = parseGeneratedFiles(codeResp.text);
const required = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];
const missing = required.filter(f => !files[f]);
if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; }
// 4. Validointi + korjaussilmukka
let issues = validateProjectCode(files);
let fixRound = 0;
while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
fixRound++;
console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
const issuesByFile = {};
for (const issue of issues) {
const m = issue.match(/^ISSUE:\s*(\S+?):/);
const fname = m ? m[1] : 'unknown';
if (!issuesByFile[fname]) issuesByFile[fname] = [];
issuesByFile[fname].push(issue);
}
for (const [fname, fIssues] of Object.entries(issuesByFile)) {
if (!files[fname]) continue;
const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
timings.push(fixResp);
if (fixResp.text) {
files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
}
}
issues = validateProjectCode(files);
}
result.validationIssues = issues.length;
result.fixRounds = fixRound;
// Kirjoita LLM:n generoimat Python-tiedostot
for (const [fn, content] of Object.entries(files)) {
if (fn.endsWith('.py')) writeFileSync(`${dir}/${fn}`, content);
}
// 5. Pytest Docker-kontissa (kipina-pytest image)
console.log(` [5/5] Pytest (Docker)...`);
try {
const pytestOut = execSync(
`docker run --rm -v "${dir}:/src:ro" kipina-pytest 2>&1`,
{ timeout: 120000, encoding: 'utf-8' }
);
writeFileSync(`${dir}/_pytest.txt`, pytestOut);
const passedMatch = pytestOut.match(/(\d+) passed/);
const failedMatch = pytestOut.match(/(\d+) failed/);
result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0;
result.testsFailed = failedMatch ? parseInt(failedMatch[1]) : 0;
result.testsTotal = result.testsPassed + result.testsFailed;
} catch (e) {
const output = e.stdout || e.stderr || e.message || '';
writeFileSync(`${dir}/_pytest.txt`, output);
const passedMatch = output.match(/(\d+) passed/);
const failedMatch = output.match(/(\d+) failed/);
const errorMatch = output.match(/(\d+) error/);
result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0;
result.testsFailed = (failedMatch ? parseInt(failedMatch[1]) : 0) + (errorMatch ? parseInt(errorMatch[1]) : 0);
result.testsTotal = result.testsPassed + result.testsFailed;
if (result.testsTotal === 0) result.error = 'Pytest kaatui';
}
} catch (e) {
result.error = e.message;
}
// Yhteenveto
result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
result.score = scoreResult(result);
result.stars = starsForScore(result.score);
return result;
}
// === Main ===
async function main() {
console.log('╔══════════════════════════════════════════════╗');
console.log('║ Kipinä CodeBench ║');
console.log('╚══════════════════════════════════════════════╝');
console.log(`Ollama: ${OLLAMA_URL}`);
// Haetaan mallit
let models;
try {
models = await ollamaListModels();
} catch (e) {
console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`);
process.exit(1);
}
if (FILTER_MODELS) {
const filter = FILTER_MODELS.split(',').map(s => s.trim());
models = models.filter(m => filter.some(f => m.includes(f)));
}
console.log(`Mallit (${models.length}): ${models.join(', ')}`);
const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS : [SCENARIOS[0]];
console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
console.log(`Tulokset: ${OUTPUT_DIR}/`);
console.log('');
// Puhdista output
rmSync(OUTPUT_DIR, { recursive: true, force: true });
mkdirSync(OUTPUT_DIR, { recursive: true });
const results = [];
for (const model of models) {
for (const scenario of scenarios) {
console.log(`\n━━━ ${model} × ${scenario.id} ━━━`);
const r = await runPipeline(model, scenario);
results.push(r);
const status = r.error ? `${r.error}` :
r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` :
`${r.testsPassed}/${r.testsTotal}`;
const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
console.log(`${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
}
}
// === Tulostaulu ===
console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
console.log('║ TULOKSET ║');
console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣');
const header = [
'Malli'.padEnd(40),
'Skenaario'.padEnd(10),
'Speksi'.padEnd(8),
'Testit'.padEnd(10),
'Korjaus'.padEnd(8),
'Ctx'.padEnd(7),
'Aika'.padEnd(8),
'tok/s'.padEnd(8),
'Pisteet',
].join(' │ ');
console.log(`${header}`);
console.log('╠' + '═'.repeat(header.length + 2) + '╣');
for (const r of results) {
const specStatus = r.specOk ? `${r.specEntities}e` : '✗';
const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-';
const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
const speed = `${r.avgTokPerSec.toFixed(0)}`;
const row = [
r.model.padEnd(40),
r.scenario.padEnd(10),
specStatus.padEnd(8),
testStatus.padEnd(10),
fixStatus.padEnd(8),
ctx.padEnd(7),
time.padEnd(8),
speed.padEnd(8),
`${r.stars} ${r.score}`,
].join(' │ ');
console.log(`${row}`);
}
console.log('╚' + '═'.repeat(header.length + 2) + '╝');
// === Mallikohtainen yhteenveto ===
const modelNames = [...new Set(results.map(r => r.model))];
const scenarioIds = scenarios.map(s => s.id);
console.log('\n');
const mHeader = [
'Malli'.padEnd(35),
...scenarioIds.map(s => s.padEnd(22)),
'Yht.'.padEnd(8),
'Out'.padEnd(7),
'Aika'.padEnd(8),
'tok/s'.padEnd(7),
'Pisteet',
].join(' │ ');
console.log(mHeader);
console.log('─'.repeat(mHeader.length));
for (const model of modelNames) {
const mrs = results.filter(r => r.model === model);
const cols = scenarioIds.map(sid => {
const r = mrs.find(r => r.scenario === sid);
if (!r) return '-'.padEnd(22);
const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-';
const s = `${(r.totalDurationMs/1000).toFixed(0)}s`;
const tok = r.totalTokens > 1000 ? `${(r.totalTokens/1000).toFixed(1)}K` : `${r.totalTokens}`;
return `${t} ${s} ${tok}`.padEnd(22);
});
const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0);
const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0);
const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0);
const totalTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0);
const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0;
const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0;
const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0;
const tokStr = totalTokens > 1000 ? `${(totalTokens/1000).toFixed(1)}K` : `${totalTokens}`;
const row = [
model.padEnd(35),
...cols,
`${totalPassed}/${totalTests}`.padEnd(8),
tokStr.padEnd(7),
`${(totalTime/1000).toFixed(0)}s`.padEnd(8),
`${avgSpeed}`.padEnd(7),
`${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`,
].join(' │ ');
console.log(row);
}
// Tallenna JSON + HTML-raportti
writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
const templatePath = join(__dirname, 'report-template.html');
if (existsSync(templatePath)) {
const html = readFileSync(templatePath, 'utf-8').replace(
'/*DATA_PLACEHOLDER*/[]',
JSON.stringify(results)
);
writeFileSync(`${OUTPUT_DIR}/report.html`, html);
console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`);
}
console.log(`JSON: ${OUTPUT_DIR}/results.json`);
// Yhteenveto
const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
const failed = results.filter(r => r.error || r.testsTotal === 0);
const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0);
console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`);
}
main().catch(e => { console.error(e); process.exit(1); });