#!/usr/bin/env node /** * Kipinä CodeBench — LLM-koodingenerointibenchmark * * Generoi FastAPI-projekteja Ollama-malleilla ja testaa pytest:llä Docker-kontissa. * * Käyttö: * node benchmark.mjs # kaikki mallit, oletusskenaario * node benchmark.mjs --models qwen3-coder:30b # yksi malli * node benchmark.mjs --ollama http://host:11434 # eri Ollama * node benchmark.mjs --scenarios all # kaikki skenaariot * node benchmark.mjs --output ./results/run-001 # custom output-hakemisto */ import { execSync } from 'child_process'; import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs'; import { dirname, join } from 'path'; import { fileURLToPath } from 'url'; const __dirname = dirname(fileURLToPath(import.meta.url)); // === CLI-argumentit === const args = process.argv.slice(2); function arg(name, fallback) { const i = args.indexOf(`--${name}`); return i >= 0 && args[i + 1] ? args[i + 1] : fallback; } const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://127.0.0.1:11434'); const HUB_URL = arg('hub', ''); const FILTER_MODELS = arg('models', ''); const SCENARIO_FILTER = arg('scenarios', 'default'); const TIMESTAMP = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); const OUTPUT_DIR = arg('output', `/tmp/kipina-benchmark/${TIMESTAMP}`); const RESULTS_DIR = join(__dirname, 'results'); const THINK_MODE = args.includes('--think'); const COMPACT_MODE = args.includes('--compact'); const NO_ORCHESTRATE = args.includes('--no-orchestrate'); const SPEC_MODEL = arg('spec-model', ''); // Eri malli spec-vaiheille (1-2) const SPEC_OLLAMA = arg('spec-ollama', ''); // Eri Ollama spec-mallille const LANG = arg('lang', 'python'); // python | rust | go const ROUNDS = parseInt(arg('rounds', '1')); // 1-10 toistoa const MAX_FIX_ROUNDS = 2; // === Promptien lataus tiedostoista === function loadPrompt(name) { const path = join(__dirname, 'prompts', `${name}.md`); if (!existsSync(path)) throw new Error(`Prompti puuttuu: ${path}`); return readFileSync(path, 'utf-8').trim(); } const CLIENT_SYSTEM = loadPrompt('client'); const SPEC_SYSTEM = loadPrompt('spec'); const SPEC_SIMPLE_SYSTEM = existsSync(join(__dirname, 'prompts', 'spec-simple.md')) ? loadPrompt('spec-simple') : SPEC_SYSTEM; const FIX_SYSTEM = loadPrompt('fix'); // === Mallikohtaiset profiilit === const PROFILES = JSON.parse(readFileSync(join(__dirname, 'profiles.json'), 'utf-8')); function getGoldenForModel(model) { const modelConf = PROFILES.models[model]; const goldenFile = modelConf?.golden || 'todo.md'; // Kielispesifi golden: todo.md → todo-rs.md / todo-go.md const langSuffix = { rust: '-rs', go: '-go' }[LANG]; if (langSuffix) { const langFile = goldenFile.replace(/\.md$/, `${langSuffix}.md`); const langPath = join(GOLDEN_DIR, langFile); if (existsSync(langPath)) return langFile; } return goldenFile; } function getCodePromptForModel(model) { const modelConf = PROFILES.models[model]; const profile = modelConf?.profile || PROFILES.default_profile; const promptName = modelConf?.prompt || PROFILES.profiles[profile]?.prompt || 'code'; const suffix = { rust: '-rs', go: '-go' }[LANG] || ''; // Kielisuffiksi priorisoituu: code-small-go > code-go > code-small > code const candidates = [`${promptName}${suffix}`, `code${suffix}`, promptName, 'code'].filter(Boolean); for (const name of candidates) { const path = join(__dirname, 'prompts', `${name}.md`); if (existsSync(path)) return { system: readFileSync(path, 'utf-8').trim(), promptName: name, profile }; } return { system: loadPrompt('code'), promptName: 'code', profile: 'large' }; } // === Kultaisten esimerkkien lataus (kielen mukaan) === const GOLDEN_DIR = join(__dirname, 'golden-examples'); const LANG_CONFIG = { python: { goldenDir: 'todo', files: ['models.py', 'schemas.py', 'main.py', 'test_main.py'], required: ['models.py', 'schemas.py', 'main.py', 'test_main.py'], dockerImage: 'kipina-pytest', }, rust: { goldenDir: 'todo-rs', files: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'], required: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'], dockerImage: 'kipina-cargo-test', }, go: { goldenDir: 'todo-go', files: ['go.mod', 'models.go', 'handlers.go', 'main.go', 'handlers_test.go'], required: ['go.mod', 'models.go', 'handlers.go', 'main.go', 'handlers_test.go'], dockerImage: 'kipina-go-test', }, }; const LCONF = LANG_CONFIG[LANG] || LANG_CONFIG.python; function loadGoldenExample(model) { // --compact: käytä tiivistettyä templaattia if (COMPACT_MODE) { const compactFile = { rust: 'golden-compact-rs.md', go: 'golden-compact-go.md' }[LANG] || 'golden-compact-py.md'; const compactPath = join(__dirname, 'prompts', compactFile); if (existsSync(compactPath)) return '\n' + readFileSync(compactPath, 'utf-8').trim() + '\n'; } // Mallikohtainen golden example profiilista const goldenFile = model ? getGoldenForModel(model) : ({ rust: 'todo-rs.md', go: 'todo-go.md' }[LANG] || 'todo.md'); const mdPath = join(GOLDEN_DIR, goldenFile); if (existsSync(mdPath)) return '\n' + readFileSync(mdPath, 'utf-8').trim() + '\n'; // Fallback: erilliset tiedostot const todoDir = join(GOLDEN_DIR, LCONF.goldenDir); if (!existsSync(todoDir)) return ''; let example = `\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n`; for (const f of LCONF.files) { const path = join(todoDir, f); if (existsSync(path)) example += `=== ${f} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`; } return example; } // === Ajattelutagien siivous (gemma4, qwen3/3.5 ym.) === function stripThinking(text) { return text .replace(/<\|channel>thought[\s\S]*?/g, '') // gemma4 .replace(/[\s\S]*?<\/think>/g, '') // qwen3, qwen3.5 .trim(); } // === Ollama / Hub -client === async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048, ollamaUrl = null) { const start = Date.now(); if (HUB_URL) { const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`; const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }), }); if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`); const data = await resp.json(); const elapsed = Date.now() - start; return { text: stripThinking((data.response || '').trim()), tokens: data.tokens_generated || 0, durationMs: elapsed, tokPerSec: data.tokens_per_sec || (data.tokens_generated || 0) / (elapsed / 1000), }; } // Suora Ollama-reitti const messages = []; if (systemPrompt) messages.push({ role: 'system', content: systemPrompt }); messages.push({ role: 'user', content: prompt }); const resp = await fetch(`${ollamaUrl || OLLAMA_URL}/api/chat`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ model, messages, stream: false, think: THINK_MODE, options: { num_predict: THINK_MODE ? maxTokens * 3 : maxTokens, num_ctx: 16384, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 }, }), }); if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`); const data = await resp.json(); const elapsed = Date.now() - start; const rawContent = (data.message?.content || '').trim(); const thinking = (data.message?.thinking || '').trim(); const text = stripThinking(rawContent || thinking); const evalCount = data.eval_count || 0; if (!rawContent && thinking) console.log(` ⚠ thinking-malli: ${thinking.length} merkkiä ajattelua, content tyhjä`); const evalDurationNs = data.eval_duration || 1; const tokPerSec = evalCount / (evalDurationNs / 1e9); return { text, tokens: evalCount, durationMs: elapsed, tokPerSec }; } async function ollamaListModels() { const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`; const resp = await fetch(url); if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`); const data = await resp.json(); return (data.models || []).map(m => m.name); } // === Testitulosten parsinta (pytest + cargo test) === function parseTestOutput(output) { // Pytest: "6 passed", "2 failed", "1 error" const pyPassed = output.match(/(\d+) passed/); const pyFailed = output.match(/(\d+) failed/); const pyError = output.match(/(\d+) error/); if (pyPassed || pyFailed) { const passed = pyPassed ? parseInt(pyPassed[1]) : 0; const failed = (pyFailed ? parseInt(pyFailed[1]) : 0) + (pyError ? parseInt(pyError[1]) : 0); return { testsPassed: passed, testsFailed: failed, testsTotal: passed + failed }; } // Cargo test: "test result: ok. 10 passed; 0 failed;" const cargoMatch = output.match(/test result: \w+\.\s*(\d+) passed;\s*(\d+) failed/); if (cargoMatch) { const passed = parseInt(cargoMatch[1]); const failed = parseInt(cargoMatch[2]); return { testsPassed: passed, testsFailed: failed, testsTotal: passed + failed }; } // Go test: "--- PASS:" / "--- FAIL:" const goPassed = (output.match(/--- PASS:/g) || []).length; const goFailed = (output.match(/--- FAIL:/g) || []).length; if (goPassed + goFailed > 0) { return { testsPassed: goPassed, testsFailed: goFailed, testsTotal: goPassed + goFailed }; } // Cargo/Go compilation error: count "error[E" or Go compile errors const compileErrors = (output.match(/error\[E\d+\]/g) || []).length; if (compileErrors > 0) { return { testsPassed: 0, testsFailed: compileErrors, testsTotal: compileErrors }; } return { testsPassed: 0, testsFailed: 0, testsTotal: 0 }; } // === Tiedostoparseri LLM-vastauksesta === function parseGeneratedFiles(text) { const files = {}; const sections = text.split(/===\s*(\S+\.(?:py|toml|rs|go|mod))\s*===/); for (let i = 1; i < sections.length - 1; i += 2) { const name = sections[i]; let content = sections[i + 1].trim(); content = content.replace(/^```(?:python|toml|rust|go|gomod)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim(); if (content) files[name] = content + '\n'; } return files; } // === Validaattori === function validateProjectCode(files) { const issues = []; for (const [fname, code] of Object.entries(files)) { if (!fname.endsWith('.py')) continue; const lines = code.split('\n'); for (const line of lines) { if (/^from\s+\.(\w*)\s+import/.test(line)) issues.push(`ISSUE: ${fname}: relatiivinen import`); } for (const line of lines) { const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/); if (!m) continue; const srcCode = files[m[1] + '.py']; if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; } const names = m[2].split(',').map(n => n.trim().split(/\s+as\s+/)[0].trim()); for (const name of names) { if (name && !srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`); } } if (fname === 'schemas.py') { if (/:\s*date\b/.test(code) && !/from datetime import/.test(code)) issues.push('ISSUE: schemas.py: date-import puuttuu'); if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code)) issues.push('ISSUE: schemas.py: datetime-import puuttuu'); } for (let i = 0; i < lines.length; i++) { const line = lines[i]; if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue; if (/(? 0) score += 10; if (r.testsTotal > 0) score += Math.round((r.testsPassed / r.testsTotal) * 60); score += Math.max(0, 20 - r.fixRounds * 10); return Math.min(100, score); } function starsForScore(score) { if (score >= 90) return '★★★★★'; if (score >= 70) return '★★★★☆'; if (score >= 50) return '★★★☆☆'; if (score >= 25) return '★★☆☆☆'; if (score > 0) return '★☆☆☆☆'; return '☆☆☆☆☆'; } // === Pipeline: yhdelle mallille ja skenaariolle === async function runPipeline(model, scenario, round = 1) { const result = { model, scenario: scenario.id, reqOk: false, specOk: false, specEntities: 0, validationIssues: 0, fixRounds: 0, testsTotal: 0, testsPassed: 0, testsFailed: 0, totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0, promptChars: 0, promptTokensEst: 0, score: 0, stars: '', error: null, }; const timings = []; const { system: CODE_SYSTEM, promptName, profile } = getCodePromptForModel(model); const roundSuffix = ROUNDS > 1 ? `__r${round}` : ''; const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}${roundSuffix}`; mkdirSync(dir, { recursive: true }); try { // 1. Vaatimukset const specModel = SPEC_MODEL || model; console.log(` [1/5] Vaatimukset${SPEC_MODEL ? ` (${SPEC_MODEL})` : ''}...`); const specUrl = SPEC_OLLAMA || null; const req = await ollamaChat(specModel, scenario.prompt, CLIENT_SYSTEM, 2048, specUrl); timings.push(req); if (!req.text || req.text.length < 50) { result.error = 'Vaatimukset liian lyhyet'; return result; } result.reqOk = true; writeFileSync(`${dir}/_requirements.txt`, req.text); // 2. JSON-speksi (small-malleille yksinkertaistettu skeema) const specProfile = PROFILES.models[specModel]?.profile || PROFILES.default_profile; const specPrompt = specProfile === 'small' ? SPEC_SIMPLE_SYSTEM : SPEC_SYSTEM; console.log(` [2/5] JSON-speksi${specProfile === 'small' ? ' (simple)' : ''}...`); const specResp = await ollamaChat(specModel, `${req.text}\n\nOutput a JSON spec for this project.`, specPrompt, 4096, specUrl); timings.push(specResp); const spec = extractJson(specResp.text); if (!spec || !spec.entities || spec.entities.length === 0) { result.error = 'JSON-speksi epäonnistui'; writeFileSync(`${dir}/_spec_raw.txt`, specResp.text); return result; } result.specOk = true; result.specEntities = spec.entities.length; writeFileSync(`${dir}/_spec.json`, JSON.stringify(spec, null, 2)); // 3. LLM-koodigenerointi const fileCount = LCONF.required.length; const goldenExample = loadGoldenExample(model); const codeTokens = LANG === 'rust' ? 12288 : LANG === 'go' ? 10240 : 8192; let files; // Orkestrointi: pilko entiteetti kerrallaan pienille malleille if (spec.entities.length > 1 && !NO_ORCHESTRATE) { console.log(` [3/5] Koodigenerointi (orkestroitu, ${spec.entities.length} entiteettiä)...`); files = {}; let cumulativeCode = ''; for (let ei = 0; ei < spec.entities.length; ei++) { const entity = spec.entities[ei]; const isFirst = ei === 0; const entitySpec = { ...spec, entities: spec.entities.slice(0, ei + 1), relationships: (spec.relationships || []).filter(r => spec.entities.slice(0, ei + 1).some(e => e.name === r.from) ), }; let entityPrompt; if (isFirst) { entityPrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(entitySpec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files for the entity "${entity.name}". Follow the reference implementation patterns exactly.`; } else { entityPrompt = `${goldenExample}\n---\n\nEXISTING CODE (do not regenerate, only add to it):\n${cumulativeCode}\n\n---\n\nJSON SPECIFICATION (add entity "${entity.name}"):\n${JSON.stringify(entitySpec, null, 2)}\n\nAdd the entity "${entity.name}" to the existing code. Return ALL ${fileCount} files with === markers, including the existing entities. Follow the same patterns.`; } console.log(` [3/5] → ${entity.name}${isFirst ? '' : ' (+ ' + spec.entities.slice(0, ei).map(e => e.name).join(', ') + ')'}...`); const entityResp = await ollamaChat(model, entityPrompt, CODE_SYSTEM, codeTokens); timings.push(entityResp); const entityFiles = parseGeneratedFiles(entityResp.text); // Yhdistä — uudempi korvaa edellisen for (const [fn, content] of Object.entries(entityFiles)) { files[fn] = content; } cumulativeCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n'); } writeFileSync(`${dir}/_code_raw.txt`, cumulativeCode); result.promptChars = CODE_SYSTEM.length + cumulativeCode.length; result.promptTokensEst = Math.round(result.promptChars / 4); } else { // Normaali: kaikki kerralla console.log(` [3/5] Koodigenerointi (LLM)...`); const codePrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`; result.promptChars = CODE_SYSTEM.length + codePrompt.length; result.promptTokensEst = Math.round(result.promptChars / 4); const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, codeTokens); timings.push(codeResp); writeFileSync(`${dir}/_code_raw.txt`, codeResp.text); files = parseGeneratedFiles(codeResp.text); } const missing = LCONF.required.filter(f => !files[f]); if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; } // Go: korvaa go.mod aina golden examplen versiolla (pienet mallit eivät tuota luotettavaa go.modia) if (LANG === 'go' && files['go.mod']) { const goldenMod = readFileSync(join(GOLDEN_DIR, 'todo-go', 'go.mod'), 'utf-8'); const modName = files['go.mod'].match(/^module\s+(\S+)/m)?.[1] || 'generated-api'; files['go.mod'] = goldenMod.replace(/^module\s+\S+/m, `module ${modName}`); } // 4. Validointi + korjaussilmukka let fixRound = 0; if (LANG === 'rust') { // Rust: cargo check Docker-kontissa ennen testejä for (let checkRound = 0; checkRound < MAX_FIX_ROUNDS; checkRound++) { // Kirjoita tiedostot levylle for (const [fn, content] of Object.entries(files)) { const filePath = join(dir, fn); mkdirSync(dirname(filePath), { recursive: true }); writeFileSync(filePath, content); } console.log(` [4/5] Cargo check${checkRound > 0 ? ` (korjaus ${checkRound})` : ''}...`); let checkOut = ''; try { checkOut = execSync( `docker run --rm --entrypoint sh -v "${dir}:/src:ro" -v kipina-cargo-registry:/usr/local/cargo/registry -v kipina-cargo-target:/work/target ${LCONF.dockerImage} -c "cp -r /src/* . && cargo check 2>&1"`, { timeout: 300000, encoding: 'utf-8' } ); } catch (e) { checkOut = e.stdout || e.stderr || e.message || ''; } const compileErrors = checkOut.split('\n').filter(l => /^error/.test(l)); if (compileErrors.length === 0) break; // Kääntyy — jatka testeihin console.log(` [4/5] ${compileErrors.length} käännösvirhettä — korjataan...`); fixRound++; const errorLines = checkOut.split('\n').filter(l => /^error|^\s+-->/.test(l)).slice(0, 30).join('\n'); const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n'); const fixPrompt = `Fix the following Rust compilation errors. Return ALL files with === markers.\n\nERRORS:\n${errorLines}\n\nCURRENT CODE:\n${allCode}`; const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, 12288); timings.push(fixResp); const fixedFiles = parseGeneratedFiles(fixResp.text); for (const [fn, content] of Object.entries(fixedFiles)) { if (LCONF.required.includes(fn)) files[fn] = content; } } } if (LANG === 'python') { let issues = validateProjectCode(files); while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) { fixRound++; console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`); const issuesByFile = {}; for (const issue of issues) { const m = issue.match(/^ISSUE:\s*(\S+?):/); const fname = m ? m[1] : 'unknown'; if (!issuesByFile[fname]) issuesByFile[fname] = []; issuesByFile[fname].push(issue); } for (const [fname, fIssues] of Object.entries(issuesByFile)) { if (!files[fname]) continue; const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``; const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048); timings.push(fixResp); if (fixResp.text) { files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n'; } } issues = validateProjectCode(files); } result.validationIssues = issues.length; } result.fixRounds = fixRound; // 5. Testit Docker-kontissa + itsekorjaava looppi (Taso 4) const testLabel = { rust: 'Cargo test', go: 'Go test', python: 'Pytest' }[LANG] || 'Test'; const dockerTimeout = LANG === 'rust' ? 300000 : 120000; const MAX_TEST_FIX = 3; let bestFiles = { ...files }; // Paras versio tiedostoista let bestPassed = -1; // Paras testitulos let testFixRounds = 0; // Erillinen laskuri testikorjauksille for (let testRound = 0; testRound <= MAX_TEST_FIX; testRound++) { // Kirjoita tiedostot levylle for (const [fn, content] of Object.entries(files)) { const filePath = join(dir, fn); mkdirSync(dirname(filePath), { recursive: true }); writeFileSync(filePath, content); } // Nopea staattinen analyysi ennen Docker-ajoa const pyFiles = Object.keys(files).filter(f => f.endsWith('.py')); if (LANG === 'python' && pyFiles.length > 0) { let syntaxErrors = ''; for (const f of pyFiles) { try { execSync(`python3 -c "import py_compile; py_compile.compile('${join(dir, f)}', doraise=True)"`, { timeout: 5000, encoding: 'utf-8', stdio: 'pipe' }); } catch (e) { syntaxErrors += `${f}: ${(e.stderr || e.message || '').split('\n').filter(l => l.includes('Error')).join('; ')}\n`; } } if (syntaxErrors) { console.log(` [5/5] ⚠ Syntaksivirhe — ohitetaan Docker`); writeFileSync(`${dir}/_testout_${testRound}.txt`, `SYNTAX ERRORS:\n${syntaxErrors}`); Object.assign(result, { testsPassed: 0, testsFailed: 1, testsTotal: 1 }); if (testRound >= MAX_TEST_FIX) { result.error = 'Syntaksivirhe'; break; } console.log(` [5/5] Itsekorjaus: syntaksi...`); const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n'); const fixPrompt = `Fix the following syntax errors. Return ALL files with === markers.\n\nERRORS:\n${syntaxErrors}\n\nCURRENT CODE:\n${allCode}`; const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 8192); timings.push(fixResp); const fixedFiles = parseGeneratedFiles(fixResp.text); for (const [fn, content] of Object.entries(fixedFiles)) { if (LCONF.required.includes(fn)) files[fn] = content; } testFixRounds++; continue; } } const roundLabel = testRound > 0 ? ` (korjaus ${testRound}/${MAX_TEST_FIX})` : ''; console.log(` [5/5] ${testLabel}${roundLabel}...`); let testOut = ''; try { testOut = execSync( `docker run --rm -v "${dir}:/src:ro" -v kipina-cargo-registry:/usr/local/cargo/registry -v kipina-cargo-target:/work/target ${LCONF.dockerImage} 2>&1`, { timeout: dockerTimeout, encoding: 'utf-8' } ); } catch (e) { testOut = e.stdout || e.stderr || e.message || ''; } writeFileSync(`${dir}/_testout_${testRound}.txt`, testOut); const testResult = parseTestOutput(testOut); Object.assign(result, testResult); // Seuraa parasta tulosta — revert jos korjaus huononsi if (result.testsPassed > bestPassed) { bestPassed = result.testsPassed; bestFiles = { ...files }; } else if (testRound > 0 && result.testsPassed < bestPassed) { console.log(` [5/5] ⚠ Korjaus huononsi (${result.testsPassed}/${result.testsTotal} < ${bestPassed}) — palautetaan paras versio`); files = { ...bestFiles }; Object.assign(result, { testsPassed: bestPassed }); break; } // Kaikki testit läpi → valmis if (result.testsTotal > 0 && result.testsPassed === result.testsTotal) break; // Viimeinen kierros tai ei enää korjausmahdollisuutta if (testRound >= MAX_TEST_FIX) { if (result.testsTotal === 0) result.error = 'Testit kaatuivat'; break; } // Itsekorjaus: syötä virhe + koodi mallille const errorLines = testOut.split('\n').filter(l => /^E |FAILED|ERROR|error\[E|--- FAIL|panic:|\.go:\d+/.test(l)).slice(0, 20).join('\n'); if (!errorLines) break; // Ei parsittavia virheitä console.log(` [5/5] Itsekorjaus: ${result.testsFailed || 'virhe'}...`); const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n'); const fixPrompt = `The following test errors occurred. Fix the code so ALL tests pass. Return ALL files with === markers.\n\nERRORS:\n${errorLines}\n\nCURRENT CODE:\n${allCode}`; const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, LANG === 'rust' ? 12288 : LANG === 'go' ? 10240 : 8192); timings.push(fixResp); const fixedFiles = parseGeneratedFiles(fixResp.text); for (const [fn, content] of Object.entries(fixedFiles)) { if (LCONF.required.includes(fn)) files[fn] = content; } testFixRounds++; } // Kirjoita paras versio levylle for (const [fn, content] of Object.entries(bestPassed >= 0 ? bestFiles : files)) { const filePath = join(dir, fn); mkdirSync(dirname(filePath), { recursive: true }); writeFileSync(filePath, content); } // fixRounds = vain testikorjaukset (cargo check -korjaukset erilliset vaihe 4:ssä) result.fixRounds = testFixRounds; } catch (e) { result.error = e.message; } // Yhteenveto result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0); result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0); result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0; result.score = scoreResult(result); result.stars = starsForScore(result.score); result.profile = profile; result.promptName = promptName; return result; } // === GPU-muistin tyhjennys === async function clearVram() { try { const psResp = await fetch(`${OLLAMA_URL}/api/ps`); const psData = await psResp.json(); for (const m of (psData.models || [])) { await fetch(`${OLLAMA_URL}/api/generate`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ model: m.name, keep_alive: 0 }), }); console.log(` ♻ Vapautettu: ${m.name}`); } } catch (e) { /* ei kriittinen */ } } // === Main === async function main() { console.log('╔══════════════════════════════════════════════╗'); console.log('║ Kipinä CodeBench ║'); console.log('╚══════════════════════════════════════════════╝'); console.log(`Ollama: ${OLLAMA_URL} 📝 ${LANG}${COMPACT_MODE ? ' (compact)' : ''}${THINK_MODE ? ' 🧠 thinking ON' : ''}`); // Haetaan mallit let models; try { models = await ollamaListModels(); } catch (e) { console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`); process.exit(1); } if (FILTER_MODELS) { const filter = FILTER_MODELS.split(',').map(s => s.trim()); models = models.filter(m => filter.some(f => m.includes(f))); } console.log(`Mallit (${models.length}): ${models.join(', ')}`); const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS : SCENARIOS.filter(s => s.id === SCENARIO_FILTER).length > 0 ? SCENARIOS.filter(s => s.id === SCENARIO_FILTER) : [SCENARIOS[0]]; console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`); if (ROUNDS > 1) console.log(`Toistoja: ${ROUNDS}`); console.log(`Tulokset: ${OUTPUT_DIR}/`); console.log(''); // Puhdista output rmSync(OUTPUT_DIR, { recursive: true, force: true }); mkdirSync(OUTPUT_DIR, { recursive: true }); const results = []; for (let round = 1; round <= ROUNDS; round++) { await clearVram(); if (ROUNDS > 1) console.log(`\n╔═══ Kierros ${round}/${ROUNDS} ═══╗`); for (const model of models) { for (const scenario of scenarios) { const roundLabel = ROUNDS > 1 ? ` [${round}/${ROUNDS}]` : ''; console.log(`\n━━━ ${model} × ${scenario.id}${roundLabel} ━━━`); const r = await runPipeline(model, scenario, round); if (ROUNDS > 1) r.round = round; results.push(r); // Tallenna tulokset jokaisen kierroksen jälkeen writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2)); const status = r.error ? `✗ ${r.error}` : r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` : `◐ ${r.testsPassed}/${r.testsTotal}`; const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : ''; console.log(` → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`); } } // Väliraportti kierroksen jälkeen if (ROUNDS > 1) { const median = arr => { const s = [...arr].sort((a,b) => a-b); const m = Math.floor(s.length/2); return s.length % 2 ? s[m] : Math.round((s[m-1]+s[m])/2); }; console.log(`\n┌─── Tilanne kierroksen ${round}/${ROUNDS} jälkeen ───┐`); for (const model of [...new Set(results.map(r => r.model))]) { const mrs = results.filter(r => r.model === model); for (const sid of scenarios.map(s => s.id)) { const runs = mrs.filter(r => r.scenario === sid); if (runs.length === 0) continue; const scores = runs.map(r => r.score); const med = median(scores); const last = scores[scores.length - 1]; const trend = scores.length > 1 ? (last > scores[scores.length - 2] ? '▲' : last < scores[scores.length - 2] ? '▼' : '─') : ''; console.log(`│ ${model.padEnd(28)} ${sid.padEnd(7)} ${starsForScore(med)} med:${String(med).padStart(3)}p [${scores.join(',')}] ${trend}`); } } console.log(`└${'─'.repeat(45)}┘`); } } // rounds // === Tulostaulu === console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗'); console.log('║ TULOKSET ║'); console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣'); const header = [ 'Malli'.padEnd(40), 'Skenaario'.padEnd(10), 'Speksi'.padEnd(8), 'Testit'.padEnd(10), 'Korjaus'.padEnd(8), 'Ctx'.padEnd(7), 'Aika'.padEnd(8), 'tok/s'.padEnd(8), 'Pisteet', ].join(' │ '); console.log(`║ ${header} ║`); console.log('╠' + '═'.repeat(header.length + 2) + '╣'); for (const r of results) { const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗'; const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-'; const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-'; const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-'; const time = `${(r.totalDurationMs/1000).toFixed(0)}s`; const speed = `${r.avgTokPerSec.toFixed(0)}`; const row = [ r.model.padEnd(40), r.scenario.padEnd(10), specStatus.padEnd(8), testStatus.padEnd(10), fixStatus.padEnd(8), ctx.padEnd(7), time.padEnd(8), speed.padEnd(8), `${r.stars} ${r.score}`, ].join(' │ '); console.log(`║ ${row} ║`); } console.log('╚' + '═'.repeat(header.length + 2) + '╝'); // === Mallikohtainen yhteenveto === const modelNames = [...new Set(results.map(r => r.model))]; const scenarioIds = scenarios.map(s => s.id); console.log('\n'); const mHeader = [ 'Malli'.padEnd(35), ...scenarioIds.map(s => s.padEnd(22)), 'Yht.'.padEnd(8), 'Out'.padEnd(7), 'Aika'.padEnd(8), 'tok/s'.padEnd(7), 'Pisteet', ].join(' │ '); console.log(mHeader); console.log('─'.repeat(mHeader.length)); for (const model of modelNames) { const mrs = results.filter(r => r.model === model); const cols = scenarioIds.map(sid => { const r = mrs.find(r => r.scenario === sid); if (!r) return '-'.padEnd(22); const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-'; const s = `${(r.totalDurationMs/1000).toFixed(0)}s`; const tok = r.totalTokens > 1000 ? `${(r.totalTokens/1000).toFixed(1)}K` : `${r.totalTokens}`; return `${t} ${s} ${tok}`.padEnd(22); }); const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0); const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0); const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0); const totalTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0); const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0; const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0; const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0; const tokStr = totalTokens > 1000 ? `${(totalTokens/1000).toFixed(1)}K` : `${totalTokens}`; const row = [ model.padEnd(35), ...cols, `${totalPassed}/${totalTests}`.padEnd(8), tokStr.padEnd(7), `${(totalTime/1000).toFixed(0)}s`.padEnd(8), `${avgSpeed}`.padEnd(7), `${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`, ].join(' │ '); console.log(row); } // Tallenna JSON + HTML-raportti const jsonData = JSON.stringify(results, null, 2); writeFileSync(`${OUTPUT_DIR}/results.json`, jsonData); const templatePath = join(__dirname, 'report-template.html'); let htmlData = ''; if (existsSync(templatePath)) { htmlData = readFileSync(templatePath, 'utf-8').replace('/*DATA_PLACEHOLDER*/[]', JSON.stringify(results)); writeFileSync(`${OUTPUT_DIR}/report.html`, htmlData); console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`); } console.log(`JSON: ${OUTPUT_DIR}/results.json`); // Kopioi results/-kansioon aikaleimalla mkdirSync(RESULTS_DIR, { recursive: true }); writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.json`), jsonData); if (htmlData) writeFileSync(join(RESULTS_DIR, `${TIMESTAMP}.html`), htmlData); console.log(`Arkistoitu: results/${TIMESTAMP}.json`); // Yhteenveto const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0); const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0); const failed = results.filter(r => r.error || r.testsTotal === 0); const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0; const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0); console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`); // === Kierrosyhteenveto (kun rounds > 1) === if (ROUNDS > 1) { console.log('\n\n╔══════════════════════════════════════════════╗'); console.log('║ KIERROSYHTEENVETO (mediaani) ║'); console.log('╚══════════════════════════════════════════════╝\n'); const median = arr => { const s = [...arr].sort((a,b) => a-b); const m = Math.floor(s.length/2); return s.length % 2 ? s[m] : Math.round((s[m-1]+s[m])/2); }; for (const model of modelNames) { const mrs = results.filter(r => r.model === model); for (const sid of scenarioIds) { const runs = mrs.filter(r => r.scenario === sid); if (runs.length === 0) continue; const scores = runs.map(r => r.score); const med = median(scores); const min = Math.min(...scores); const max = Math.max(...scores); const passRates = runs.map(r => r.testsTotal > 0 ? Math.round(r.testsPassed/r.testsTotal*100) : 0); console.log(`${model.padEnd(30)} ${sid.padEnd(8)} ${starsForScore(med)} med:${med}p min:${min} max:${max} pass:[${passRates.join(',')}]%`); } } } } main().catch(e => { console.error(e); process.exit(1); });