#!/usr/bin/env node
/**
 * Kipinä CodeBench — LLM code-generation benchmark
 *
 * Generates FastAPI projects with Ollama models and tests them with pytest
 * inside a Docker container.
 *
 * Usage:
 *   node benchmark.mjs                               # all models, default scenario
 *   node benchmark.mjs --models qwen3-coder:30b      # a single model
 *   node benchmark.mjs --ollama http://host:11434    # a different Ollama instance
 *   node benchmark.mjs --scenarios all               # all scenarios
 *   node benchmark.mjs --output ./results/run-001    # custom output directory
 */
import { execSync } from 'child_process';
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';

const __dirname = dirname(fileURLToPath(import.meta.url));

// === CLI arguments ===
const args = process.argv.slice(2);

/**
 * Returns the value that follows `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} fallback - Value used when the option is absent or has no value.
 * @returns {string} The option value, or `fallback`.
 */
function arg(name, fallback) {
  const i = args.indexOf(`--${name}`);
  if (i < 0) return fallback;
  const value = args[i + 1];
  // A following token that is itself a flag means the option was given without
  // a value (e.g. `--models --think`); do not swallow the next flag.
  return value && !value.startsWith('--') ? value : fallback;
}

const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434');
const HUB_URL = arg('hub', '');
const FILTER_MODELS = arg('models', '');
const SCENARIO_FILTER = arg('scenarios', 'default');
// ISO timestamp cut to minute precision, with ':' and '.' made file-name safe.
const TIMESTAMP = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 16);
const OUTPUT_DIR = arg('output', `/tmp/kipina-benchmark/${TIMESTAMP}`);
const RESULTS_DIR = join(__dirname, 'results');
const THINK_MODE = args.includes('--think');
const COMPACT_MODE = args.includes('--compact');

// python | rust. An unknown value used to select the python configuration below
// while LANG itself kept the unknown value, so code that tests
// `LANG === 'python'` treated the run as non-python. Normalise it once here.
const SUPPORTED_LANGS = ['python', 'rust'];
const REQUESTED_LANG = arg('lang', 'python');
const LANG = SUPPORTED_LANGS.includes(REQUESTED_LANG) ? REQUESTED_LANG : 'python';
if (LANG !== REQUESTED_LANG) {
  console.warn(`Tuntematon kieli "${REQUESTED_LANG}" — käytetään oletusta: python`);
}
const MAX_FIX_ROUNDS = 2;

// === Prompt loading from files ===

/**
 * Reads `prompts/<name>.md` next to this script.
 *
 * @param {string} name - Prompt file name without extension.
 * @returns {string} Trimmed prompt text.
 * @throws {Error} When the prompt file does not exist.
 */
function loadPrompt(name) {
  const path = join(__dirname, 'prompts', `${name}.md`);
  if (!existsSync(path)) throw new Error(`Prompti puuttuu: ${path}`);
  return readFileSync(path, 'utf-8').trim();
}

const CLIENT_SYSTEM = loadPrompt('client');
const SPEC_SYSTEM = loadPrompt('spec');
const CODE_SYSTEM = loadPrompt(LANG === 'rust' ? 'code-rs' : 'code');
const FIX_SYSTEM = loadPrompt('fix');

// === Golden example loading (per language) ===
const GOLDEN_DIR = join(__dirname, 'golden-examples');

const LANG_CONFIG = {
  python: {
    goldenDir: 'todo',
    files: ['models.py', 'schemas.py', 'main.py', 'test_main.py'],
    required: ['models.py', 'schemas.py', 'main.py', 'test_main.py'],
    dockerImage: 'kipina-pytest',
  },
  rust: {
    goldenDir: 'todo-rs',
    files: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'],
    required: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'],
    dockerImage: 'kipina-cargo-test',
  },
};
const LCONF = LANG_CONFIG[LANG] || LANG_CONFIG.python;

/**
 * Builds the reference-implementation text that is prepended to the code prompt.
 *
 * With `--compact`, returns the condensed template from `prompts/` when that
 * file exists; otherwise concatenates the golden example files that exist for
 * the selected language.
 *
 * @returns {string} Example text, or '' when the golden example directory is missing.
 */
function loadGoldenExample() {
  // --compact: use the condensed template instead of the full code
  if (COMPACT_MODE) {
    const compactFile = LANG === 'rust' ? 'golden-compact-rs.md' : 'golden-compact-py.md';
    const compactPath = join(__dirname, 'prompts', compactFile);
    if (existsSync(compactPath)) return '\n' + readFileSync(compactPath, 'utf-8').trim() + '\n';
  }

  // Full golden example
  const todoDir = join(GOLDEN_DIR, LCONF.goldenDir);
  if (!existsSync(todoDir)) return '';
  let example = `\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n`;
  for (const f of LCONF.files) {
    const path = join(todoDir, f);
    if (existsSync(path)) example += `=== ${f} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`;
  }
  return example;
}

const GOLDEN_EXAMPLE = loadGoldenExample();
// === Thinking-tag cleanup (gemma4, qwen3/3.5, etc.) ===

/**
 * Removes model "thinking" sections from a completion.
 *
 * @param {string} text - Raw model output.
 * @returns {string} Output without thinking sections, trimmed.
 */
function stripThinking(text) {
  return text
    // gemma4: drop a whole thought block. The previous pattern ended in a lazy
    // quantifier, which matches the empty string, so only the opening marker
    // was removed and the thoughts themselves stayed in the text.
    // NOTE(review): the closing marker `<channel|>` is assumed — confirm
    // against the model's chat template.
    .replace(/<\|channel>thought[\s\S]*?<channel\|>/g, '')
    // An opening marker with no terminator: remove just the marker, as before.
    .replace(/<\|channel>thought/g, '')
    // qwen3, qwen3.5: removes everything up to and including each `</think>`,
    // which also covers output that starts inside a think block.
    .replace(/[\s\S]*?<\/think>/g, '')
    .trim();
}

// === Ollama / Hub client ===

/** Tokens per second, or 0 when the duration is not a positive number. */
function tokensPerSecond(tokens, seconds) {
  return seconds > 0 ? tokens / seconds : 0;
}

/**
 * Sends one chat request, through the hub when HUB_URL is set and directly to
 * Ollama otherwise.
 *
 * @param {string} model - Model name.
 * @param {string} prompt - User prompt.
 * @param {string} systemPrompt - System prompt; omitted from the direct request when empty.
 * @param {number} [maxTokens=2048] - Generation limit (tripled on the direct route in think mode).
 * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
 * @throws {Error} When the HTTP response is not ok.
 */
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
  const start = Date.now();

  if (HUB_URL) {
    const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`;
    const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }),
    });
    if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
    const data = await resp.json();
    const elapsed = Date.now() - start;
    const tokens = data.tokens_generated || 0;
    return {
      text: stripThinking((data.response || '').trim()),
      tokens,
      durationMs: elapsed,
      // Guarded so a 0 ms wall clock cannot produce NaN/Infinity.
      tokPerSec: data.tokens_per_sec || tokensPerSecond(tokens, elapsed / 1000),
    };
  }

  // Direct Ollama route
  const messages = [];
  if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
  messages.push({ role: 'user', content: prompt });

  const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      messages,
      stream: false,
      think: THINK_MODE,
      options: {
        num_predict: THINK_MODE ? maxTokens * 3 : maxTokens,
        num_ctx: 16384,
        temperature: 0.7,
        top_k: 40,
        repeat_penalty: 1.15,
      },
    }),
  });
  if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
  const data = await resp.json();
  const elapsed = Date.now() - start;

  const rawContent = (data.message?.content || '').trim();
  const thinking = (data.message?.thinking || '').trim();
  // When content is empty the thinking text is used as the answer.
  const text = stripThinking(rawContent || thinking);
  const evalCount = data.eval_count || 0;
  if (!rawContent && thinking) console.log(` ⚠ thinking-malli: ${thinking.length} merkkiä ajattelua, content tyhjä`);

  // eval_duration is in nanoseconds. When it is missing, fall back to wall-clock
  // time; the old `|| 1` fallback divided by one nanosecond and reported
  // speeds around a billion times too high.
  const evalSeconds = data.eval_duration > 0 ? data.eval_duration / 1e9 : elapsed / 1000;
  const tokPerSec = tokensPerSecond(evalCount, evalSeconds);
  return { text, tokens: evalCount, durationMs: elapsed, tokPerSec };
}

/**
 * Lists the model names reported by the hub (when HUB_URL is set) or by Ollama.
 *
 * @returns {Promise<string[]>} Model names; empty when the response has no `models`.
 * @throws {Error} When the HTTP response is not ok.
 */
async function ollamaListModels() {
  const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
  const data = await resp.json();
  return (data.models || []).map(m => m.name);
}

// === Test result parsing (pytest + cargo test) ===

/**
 * Extracts pass/fail counts from pytest or cargo test output.
 *
 * @param {string} output - Combined stdout/stderr of the test run.
 * @returns {{testsPassed: number, testsFailed: number, testsTotal: number}}
 *   All zeros when no recognisable summary is found.
 */
function parseTestOutput(output) {
  const summarize = (passed, failed) => ({ testsPassed: passed, testsFailed: failed, testsTotal: passed + failed });

  // Cargo test: "test result: ok. 10 passed; 0 failed;". Cargo prints one such
  // line per test binary, so they are summed. This is checked first because the
  // pytest patterns below also match these lines and would only ever see the
  // first one (often "0 passed; 0 failed").
  const cargoSummaries = [...output.matchAll(/test result: \w+\.\s*(\d+) passed;\s*(\d+) failed/g)];
  if (cargoSummaries.length > 0) {
    let passed = 0;
    let failed = 0;
    for (const m of cargoSummaries) {
      passed += Number.parseInt(m[1], 10);
      failed += Number.parseInt(m[2], 10);
    }
    return summarize(passed, failed);
  }

  // Pytest: "6 passed", "2 failed", "1 error". Counts are read from the last
  // line that carries any of them, so log lines printed during the run cannot
  // be mistaken for the summary. A run with only errors (e.g. a collection
  // error) is now reported as failed instead of "no tests".
  const summaryLine = output
    .split('\n')
    .findLast((line) => /\b\d+ (?:passed|failed|errors?)\b/.test(line));
  if (summaryLine !== undefined) {
    const count = (re) => {
      const m = summaryLine.match(re);
      return m ? Number.parseInt(m[1], 10) : 0;
    };
    const passed = count(/(\d+) passed/);
    const failed = count(/(\d+) failed/) + count(/(\d+) errors?\b/);
    return summarize(passed, failed);
  }

  // Cargo compilation error: count "error[E" occurrences
  const compileErrors = (output.match(/error\[E\d+\]/g) || []).length;
  if (compileErrors > 0) return summarize(0, compileErrors);

  return summarize(0, 0);
}

// === File parser for the LLM response ===

/** Returns the code inside the first complete ``` fence, or the text with a lone opening fence removed. */
function unwrapCodeFence(raw) {
  const complete = raw.match(/^```[^\n`]*\n([\s\S]*?)\n?^```[ \t]*$/m);
  if (complete) return complete[1];
  // No closing fence (e.g. truncated generation): drop only the opening line.
  return raw.replace(/^```[^\n`]*\n?/m, '');
}

/** True when `name` is a relative path that cannot climb out of its base directory. */
function isSafeRelativePath(name) {
  if (name.startsWith('/') || name.startsWith('\\') || /^[A-Za-z]:/.test(name)) return false;
  return !name.split(/[\\/]/).includes('..');
}

/**
 * Splits an LLM response into files using `=== <name> ===` headers
 * (names ending in .py, .toml or .rs).
 *
 * Code fences are unwrapped whatever their language tag, text after the
 * closing fence is discarded, and file names that are absolute or contain
 * `..` are skipped because the names come from model output.
 *
 * @param {string} text - Model response.
 * @returns {Object<string, string>} Map of file name to content ending in a newline.
 */
function parseGeneratedFiles(text) {
  const files = {};
  // split() with one capture group yields [preamble, name1, body1, name2, body2, ...].
  const sections = text.split(/===\s*(\S+\.(?:py|toml|rs))\s*===/);
  for (let i = 1; i < sections.length - 1; i += 2) {
    const name = sections[i];
    if (!isSafeRelativePath(name)) continue;
    const content = unwrapCodeFence(sections[i + 1].trim()).trim();
    if (content) files[name] = content + '\n';
  }
  return files;
}

// NOTE(review): validateProjectCode starts on the line below and is kept exactly as found.
// Its body is truncated in this view and continues on the following source line.
// === Validaattori === function validateProjectCode(files) { const issues = []; for (const [fname, code] of Object.entries(files)) { if (!fname.endsWith('.py')) continue; const lines = code.split('\n'); for (const line of lines) { if (/^from\s+\.(\w*)\s+import/.test(line)) issues.push(`ISSUE: ${fname}: relatiivinen import`); } for (const line of lines) { const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/); if (!m) continue; const srcCode = files[m[1] + '.py']; if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; } const names = m[2].split(',').map(n => n.trim().split(/\s+as\s+/)[0].trim()); for (const name of names) { if (name && !srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`); } } if (fname === 'schemas.py') { if (/:\s*date\b/.test(code) && !/from datetime import/.test(code)) issues.push('ISSUE: schemas.py: date-import puuttuu'); if (/:\s*datetime\b/.test(code) && !/from 
datetime import/.test(code)) issues.push('ISSUE: schemas.py: datetime-import puuttuu'); } for (let i = 0; i < lines.length; i++) { const line = lines[i]; if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue; if (/(? 0) score += 10; if (r.testsTotal > 0) score += Math.round((r.testsPassed / r.testsTotal) * 60); score += Math.max(0, 20 - r.fixRounds * 10); return Math.min(100, score); } function starsForScore(score) { if (score >= 90) return '★★★★★'; if (score >= 70) return '★★★★☆'; if (score >= 50) return '★★★☆☆'; if (score >= 25) return '★★☆☆☆'; if (score > 0) return '★☆☆☆☆'; return '☆☆☆☆☆'; } // === Pipeline: yhdelle mallille ja skenaariolle === async function runPipeline(model, scenario) { const result = { model, scenario: scenario.id, reqOk: false, specOk: false, specEntities: 0, validationIssues: 0, fixRounds: 0, testsTotal: 0, testsPassed: 0, testsFailed: 0, totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0, promptChars: 0, promptTokensEst: 0, score: 0, stars: '', error: null, }; const timings = []; const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`; mkdirSync(dir, { recursive: true }); try { // 1. Vaatimukset console.log(` [1/5] Vaatimukset...`); const req = await ollamaChat(model, scenario.prompt, CLIENT_SYSTEM, 2048); timings.push(req); if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; } result.reqOk = true; writeFileSync(`${dir}/_requirements.txt`, req.text); // 2. 
JSON-speksi console.log(` [2/5] JSON-speksi...`); const specResp = await ollamaChat(model, `${req.text}\n\nOutput a JSON spec for this project.`, SPEC_SYSTEM, 4096); timings.push(specResp); const spec = extractJson(specResp.text); if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'JSON-speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; } result.specOk = true; result.specEntities = spec.entities.length; writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2)); // 3. LLM-koodigenerointi console.log(` [3/5] Koodigenerointi (LLM)...`); const fileCount = LCONF.required.length; const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`; result.promptChars = CODE_SYSTEM.length + codePrompt.length; result.promptTokensEst = Math.round(result.promptChars / 4); const codeTokens = LANG === 'rust' ? 12288 : 8192; const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, codeTokens); timings.push(codeResp); writeFileSync(`${dir}/_code_raw.txt`, codeResp.text); const files = parseGeneratedFiles(codeResp.text); const missing = LCONF.required.filter(f => !files[f]); if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; } // 4. Validointi + korjaussilmukka (Python-spesifi) let fixRound = 0; if (LANG === 'python') { let issues = validateProjectCode(files); while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) { fixRound++; console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`); const issuesByFile = {}; for (const issue of issues) { const m = issue.match(/^ISSUE:\s*(\S+?):/); const fname = m ? 
m[1] : 'unknown'; if (!issuesByFile[fname]) issuesByFile[fname] = []; issuesByFile[fname].push(issue); } for (const [fname, fIssues] of Object.entries(issuesByFile)) { if (!files[fname]) continue; const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``; const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048); timings.push(fixResp); if (fixResp.text) { files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n'; } } issues = validateProjectCode(files); } result.validationIssues = issues.length; } result.fixRounds = fixRound; // Kirjoita LLM:n generoimat tiedostot (luo src/ ja tests/ alihakemistot tarvittaessa) for (const [fn, content] of Object.entries(files)) { const filePath = join(dir, fn); mkdirSync(dirname(filePath), { recursive: true }); writeFileSync(filePath, content); } // 5. Testit Docker-kontissa const testLabel = LANG === 'rust' ? 'Cargo test (Docker)' : 'Pytest (Docker)'; console.log(` [5/5] ${testLabel}...`); const dockerTimeout = LANG === 'rust' ? 300000 : 120000; try { const testOut = execSync( `docker run --rm -v "${dir}:/src:ro" ${LCONF.dockerImage} 2>&1`, { timeout: dockerTimeout, encoding: 'utf-8' } ); writeFileSync(`${dir}/_testout.txt`, testOut); Object.assign(result, parseTestOutput(testOut)); } catch (e) { const output = e.stdout || e.stderr || e.message || ''; writeFileSync(`${dir}/_testout.txt`, output); Object.assign(result, parseTestOutput(output)); if (result.testsTotal === 0) result.error = 'Testit kaatuivat'; } } catch (e) { result.error = e.message; } // Yhteenveto result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0); result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0); result.avgTokPerSec = timings.length > 0 ? 
timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0; result.score = scoreResult(result); result.stars = starsForScore(result.score); return result; } // === Main === async function main() { console.log('╔══════════════════════════════════════════════╗'); console.log('║ Kipinä CodeBench ║'); console.log('╚══════════════════════════════════════════════╝'); console.log(`Ollama: ${OLLAMA_URL} 📝 ${LANG}${COMPACT_MODE ? ' (compact)' : ''}${THINK_MODE ? ' 🧠 thinking ON' : ''}`); // Haetaan mallit let models; try { models = await ollamaListModels(); } catch (e) { console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`); process.exit(1); } if (FILTER_MODELS) { const filter = FILTER_MODELS.split(',').map(s => s.trim()); models = models.filter(m => filter.some(f => m.includes(f))); } console.log(`Mallit (${models.length}): ${models.join(', ')}`); const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS : [SCENARIOS[0]]; console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`); console.log(`Tulokset: ${OUTPUT_DIR}/`); console.log(''); // Puhdista output rmSync(OUTPUT_DIR, { recursive: true, force: true }); mkdirSync(OUTPUT_DIR, { recursive: true }); const results = []; for (const model of models) { for (const scenario of scenarios) { console.log(`\n━━━ ${model} × ${scenario.id} ━━━`); const r = await runPipeline(model, scenario); results.push(r); const status = r.error ? `✗ ${r.error}` : r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` : `◐ ${r.testsPassed}/${r.testsTotal}`; const ctxInfo = r.promptTokensEst > 0 ? 
` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : ''; console.log(` → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`); } } // === Tulostaulu === console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗'); console.log('║ TULOKSET ║'); console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣'); const header = [ 'Malli'.padEnd(40), 'Skenaario'.padEnd(10), 'Speksi'.padEnd(8), 'Testit'.padEnd(10), 'Korjaus'.padEnd(8), 'Ctx'.padEnd(7), 'Aika'.padEnd(8), 'tok/s'.padEnd(8), 'Pisteet', ].join(' │ '); console.log(`║ ${header} ║`); console.log('╠' + '═'.repeat(header.length + 2) + '╣'); for (const r of results) { const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗'; const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-'; const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-'; const ctx = r.promptTokensEst > 0 ? 
`~${(r.promptTokensEst/1000).toFixed(1)}K` : '-'; const time = `${(r.totalDurationMs/1000).toFixed(0)}s`; const speed = `${r.avgTokPerSec.toFixed(0)}`; const row = [ r.model.padEnd(40), r.scenario.padEnd(10), specStatus.padEnd(8), testStatus.padEnd(10), fixStatus.padEnd(8), ctx.padEnd(7), time.padEnd(8), speed.padEnd(8), `${r.stars} ${r.score}`, ].join(' │ '); console.log(`║ ${row} ║`); } console.log('╚' + '═'.repeat(header.length + 2) + '╝'); // === Mallikohtainen yhteenveto === const modelNames = [...new Set(results.map(r => r.model))]; const scenarioIds = scenarios.map(s => s.id); console.log('\n'); const mHeader = [ 'Malli'.padEnd(35), ...scenarioIds.map(s => s.padEnd(22)), 'Yht.'.padEnd(8), 'Out'.padEnd(7), 'Aika'.padEnd(8), 'tok/s'.padEnd(7), 'Pisteet', ].join(' │ '); console.log(mHeader); console.log('─'.repeat(mHeader.length)); for (const model of modelNames) { const mrs = results.filter(r => r.model === model); const cols = scenarioIds.map(sid => { const r = mrs.find(r => r.scenario === sid); if (!r) return '-'.padEnd(22); const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-'; const s = `${(r.totalDurationMs/1000).toFixed(0)}s`; const tok = r.totalTokens > 1000 ? `${(r.totalTokens/1000).toFixed(1)}K` : `${r.totalTokens}`; return `${t} ${s} ${tok}`.padEnd(22); }); const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0); const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0); const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0); const totalTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0); const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0; const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0; const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0; const tokStr = totalTokens > 1000 ? 
`${(totalTokens/1000).toFixed(1)}K` : `${totalTokens}`; const row = [ model.padEnd(35), ...cols, `${totalPassed}/${totalTests}`.padEnd(8), tokStr.padEnd(7), `${(totalTime/1000).toFixed(0)}s`.padEnd(8), `${avgSpeed}`.padEnd(7), `${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`, ].join(' │ '); console.log(row); } // Tallenna JSON + HTML-raportti const jsonData = JSON.stringify(results, null, 2); writeFileSync(`${OUTPUT_DIR}/results.json`, jsonData); const templatePath = join(__dirname, 'report-template.html'); let htmlData = ''; if (existsSync(templatePath)) { htmlData = readFileSync(templatePath, 'utf-8').replace('/*DATA_PLACEHOLDER*/[]', JSON.stringify(results)); writeFileSync(`${OUTPUT_DIR}/report.html`, htmlData); console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`); } console.log(`JSON: ${OUTPUT_DIR}/results.json`); // Kopioi results/-kansioon aikaleimalla mkdirSync(RESULTS_DIR, { recursive: true }); writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.json`), jsonData); if (htmlData) writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.html`), htmlData); console.log(`Arkistoitu: results/${TIMESTAMP}.json`); // Yhteenveto const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0); const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0); const failed = results.filter(r => r.error || r.testsTotal === 0); const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0; const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0); console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`); } main().catch(e => { console.error(e); process.exit(1); });