#!/usr/bin/env node /** * Kipinä Model Benchmark * * Generoi projekteja eri Ollama-malleilla ja testaa niiden toimivuus. * Käyttö: * node model-benchmark.mjs # kaikki mallit, oletusskenaario * node model-benchmark.mjs --models qwen3:8b,qwen3:30b * node model-benchmark.mjs --ollama http://host:11434 * node model-benchmark.mjs --scenarios all # kaikki skenaariot */ import { execSync } from 'child_process'; import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs'; import { dirname, join } from 'path'; import { fileURLToPath } from 'url'; const __dirname = dirname(fileURLToPath(import.meta.url)); // === CLI-argumentit === const args = process.argv.slice(2); function arg(name, fallback) { const i = args.indexOf(`--${name}`); return i >= 0 && args[i + 1] ? args[i + 1] : fallback; } const OLLAMA_URL = arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434'); const HUB_URL = arg('hub', ''); // Vaihtoehto: --hub https://kipina.studio const FILTER_MODELS = arg('models', ''); const SCENARIO_FILTER = arg('scenarios', 'default'); const OUTPUT_DIR = arg('output', '/tmp/kipina-benchmark'); const MAX_FIX_ROUNDS = 2; // === Ajattelutagien siivous (gemma4, qwen3/3.5 ym.) === function stripThinking(text) { return text .replace(/<\|channel>thought[\s\S]*?/g, '') // gemma4 .replace(/[\s\S]*?<\/think>/g, '') // qwen3, qwen3.5 .trim(); } // === Ollama / Hub -client === async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) { const start = Date.now(); if (HUB_URL) { // Hub-reitti: /api/v1/chat/completions const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2,8)}`; const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ model, prompt, task_id: taskId, system_prompt: systemPrompt, max_tokens: maxTokens }), }); if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`); const data = await resp.json(); const elapsed = Date.now() - start; return { text: stripThinking((data.response || '').trim()), tokens: data.tokens_generated || 0, durationMs: elapsed, tokPerSec: data.tokens_per_sec || (data.tokens_generated || 0) / (elapsed / 1000), }; } // Suora Ollama-reitti: /api/chat const messages = []; if (systemPrompt) messages.push({ role: 'system', content: systemPrompt }); messages.push({ role: 'user', content: prompt }); const resp = await fetch(`${OLLAMA_URL}/api/chat`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ model, messages, stream: false, think: false, options: { num_predict: maxTokens, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 }, }), }); if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`); const data = await resp.json(); const elapsed = Date.now() - start; // Ollama: jotkin mallit (qwen3.5) palauttavat ajattelun erillisessä thinking-kentässä const rawContent = (data.message?.content || '').trim(); const thinking = (data.message?.thinking || '').trim(); const text = stripThinking(rawContent || thinking); const evalCount = data.eval_count || 0; if (!rawContent && thinking) console.log(` ⚠ thinking-malli: ${thinking.length} merkkiä ajattelua, content tyhjä`); const evalDurationNs = data.eval_duration || 1; const tokPerSec = evalCount / (evalDurationNs / 1e9); return { text, tokens: evalCount, durationMs: elapsed, tokPerSec }; } async function ollamaListModels() { const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`; const resp = await fetch(url); if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`); const data = await resp.json(); return (data.models || []).map(m => m.name); } // === Promptit (kopioitu index.astrosta) === const CLIENT_SYSTEM = `You are a product owner who turns vague ideas into clear, actionable software requirements. GIVEN a short project description from the user, produce a structured brief: 1. PROJECT NAME: a short, descriptive name 2. GOAL: one sentence explaining what the software does and who it's for 3. CORE FEATURES: numbered list of 3-8 concrete features (not vague wishes) 4. DATA MODEL: list the main entities and their key fields (include field types) 5. API ENDPOINTS: list the REST endpoints (method + path + purpose) 6. CONSTRAINTS: any technical constraints (e.g. "must use SQLite", "no auth needed") RULES: - Be specific: "User can filter todos by status" not "todo management" - Use plain English, no code - Maximum 400 words total`; const SPEC_SYSTEM = `You are a software architect who designs database schemas for Python web applications. THINK STEP BY STEP before outputting JSON: 1. What are the main ENTITIES (nouns) in this project? 2. What FIELDS does each entity need? (name, type, required?) 3. Which entities REFERENCE each other? (e.g. "a Book belongs to an Author" → Book has author_id) 4. Are there Date/DateTime fields? → add extra_imports Then output ONLY valid JSON (no explanations before or after). SCHEMA: {"project_name":"short-name","description":"One sentence","entities":[{"name":"EntityName","table_name":"entity_names","fields":[{"name":"field_name","sa_type":"String(255)","py_type":"str","nullable":false,"default":null}]}],"relationships":[{"from":"ChildEntity","field":"parent_id","to":"ParentEntity","type":"many-to-one"}],"extra_imports":[]} FIELD RULES: - sa_type: String(N), Text, Integer, Date, DateTime, Boolean, Float - py_type: str, int, float, bool, date, datetime — append " | None" if nullable - Status fields: use String(20) with default value, NEVER Enum - Every entity gets "id" automatically — do NOT add id or redundant ID fields - Use snake_case for field names RELATIONSHIP RULES: - If entity A "belongs to" entity B → A has b_id field (Integer, nullable=false) + relationship entry - EVERY _id field MUST have a matching relationship entry - Parent entities must appear BEFORE children in the entities array - If no relationships, set "relationships": [] AVOID: redundant ID fields, generic names, more than 7 fields or 3 entities, non-English entity/field names (ALWAYS English even if description is Finnish) EXAMPLES (adapt, don't copy): Todo app → Todo: title(str), description(Text|None), due_date(Date|None), status(String20="pending") Blog → Author: name,email,bio(Text|None) / Post: title, content(Text), author_id→Author, published_at(DateTime|None), status(String20="draft")`; const FIX_SYSTEM = 'You are a Python code fixer. Return ONLY the corrected Python file. No markdown fences, no explanations — just valid Python code.'; // === Kultainen esimerkki === const GOLDEN_DIR = join(__dirname, 'golden-examples', 'todo'); const GOLDEN_FILES = ['models.py', 'schemas.py', 'main.py', 'test_main.py']; function loadGoldenExample() { if (!existsSync(GOLDEN_DIR)) return ''; let example = '\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n'; for (const f of GOLDEN_FILES) { const path = join(GOLDEN_DIR, f); if (existsSync(path)) example += `=== ${f} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`; } return example; } const GOLDEN_EXAMPLE = loadGoldenExample(); const CODE_SYSTEM = `You are a Python backend developer. Generate a FastAPI project with SQLAlchemy and SQLite. Given the project requirements, JSON specification, and a REFERENCE IMPLEMENTATION, generate these 4 files: 1. models.py — SQLAlchemy 2.0: DeclarativeBase, Mapped, mapped_column (NOT legacy declarative_base) 2. schemas.py — Pydantic v2: ConfigDict(from_attributes=True) (NOT class Config) 3. main.py — FastAPI CRUD endpoints for each entity 4. test_main.py — Pytest with TestClient, separate test.db, unique test data per test Do NOT generate pyproject.toml — it is created separately with uv. OUTPUT FORMAT — use these exact markers to separate files: === models.py === === schemas.py === === main.py === === test_main.py === DOCUMENTATION — every file must have a one-line module docstring. Classes get a one-line docstring. Keep it zensical: say what it IS, not what it does. No filler. RULES: - Follow the REFERENCE IMPLEMENTATION patterns exactly - SQLAlchemy 2.0: DeclarativeBase + Mapped + mapped_column (not Column()) - Python type unions: str | None (not Optional[str]) - Tests: unique descriptive data per test, NOT generic "test_title" strings - Tests: PUT/update test data MUST include ALL required (non-nullable) fields, not just the field being updated - Absolute imports only (from models import ..., from schemas import ...) - NO markdown fences inside file content — just raw code - Only test endpoints that exist in main.py — no extra tests`; // === Tiedostoparseri LLM-vastauksesta === function parseGeneratedFiles(text) { const files = {}; const sections = text.split(/===\s*(\S+\.(?:py|toml))\s*===/); // sections: [preamble, filename1, content1, filename2, content2, ...] for (let i = 1; i < sections.length - 1; i += 2) { const name = sections[i]; let content = sections[i + 1].trim(); // Poista mahdolliset markdown-aidat content = content.replace(/^```(?:python|toml)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim(); if (content) files[name] = content + '\n'; } return files; } // === Validaattori === function validateProjectCode(files) { const issues = []; for (const [fname, code] of Object.entries(files)) { if (!fname.endsWith('.py')) continue; const lines = code.split('\n'); for (const line of lines) { const m = line.match(/^from\s+\.(\w*)\s+import/); if (m) issues.push(`ISSUE: ${fname}: relatiivinen import`); } for (const line of lines) { const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/); if (!m) continue; const srcCode = files[m[1] + '.py']; if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; } const names = m[2].split(',').map(n => n.trim().split(/\s+as\s+/)[0].trim()); for (const name of names) { if (name && !srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`); } } if (fname === 'schemas.py') { if (/:\s*date\b/.test(code) && !/from datetime import/.test(code)) issues.push('ISSUE: schemas.py: date-import puuttuu'); if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code)) issues.push('ISSUE: schemas.py: datetime-import puuttuu'); } for (let i = 0; i < lines.length; i++) { const line = lines[i]; if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue; if (/(? !files[f]); if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; } // 4. Validointi + korjaussilmukka let issues = validateProjectCode(files); let fixRound = 0; while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) { fixRound++; console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`); const issuesByFile = {}; for (const issue of issues) { const m = issue.match(/^ISSUE:\s*(\S+?):/); const fname = m ? m[1] : 'unknown'; if (!issuesByFile[fname]) issuesByFile[fname] = []; issuesByFile[fname].push(issue); } for (const [fname, fIssues] of Object.entries(issuesByFile)) { if (!files[fname]) continue; const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``; const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048); timings.push(fixResp); if (fixResp.text) { files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n'; } } issues = validateProjectCode(files); } result.validationIssues = issues.length; result.fixRounds = fixRound; // Kirjoita LLM:n generoimat Python-tiedostot for (const [fn, content] of Object.entries(files)) { if (fn.endsWith('.py')) writeFileSync(`${dir}/${fn}`, content); } // 5. Pytest Docker-kontissa (kipina-pytest image) console.log(` [5/5] Pytest (Docker)...`); try { const pytestOut = execSync( `docker run --rm -v "${dir}:/src:ro" kipina-pytest 2>&1`, { timeout: 120000, encoding: 'utf-8' } ); writeFileSync(`${dir}/_pytest.txt`, pytestOut); const passedMatch = pytestOut.match(/(\d+) passed/); const failedMatch = pytestOut.match(/(\d+) failed/); result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0; result.testsFailed = failedMatch ? parseInt(failedMatch[1]) : 0; result.testsTotal = result.testsPassed + result.testsFailed; } catch (e) { const output = e.stdout || e.stderr || e.message || ''; writeFileSync(`${dir}/_pytest.txt`, output); const passedMatch = output.match(/(\d+) passed/); const failedMatch = output.match(/(\d+) failed/); const errorMatch = output.match(/(\d+) error/); result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0; result.testsFailed = (failedMatch ? parseInt(failedMatch[1]) : 0) + (errorMatch ? parseInt(errorMatch[1]) : 0); result.testsTotal = result.testsPassed + result.testsFailed; if (result.testsTotal === 0) result.error = 'Pytest kaatui'; } } catch (e) { result.error = e.message; } // Yhteenveto result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0); result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0); result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0; result.score = scoreResult(result); result.stars = starsForScore(result.score); return result; } // === Pisteytys (0–100) ja tähtiluokitus === function scoreResult(r) { if (r.error && r.testsTotal === 0) return 0; let score = 0; // Speksi onnistui (10p) if (r.specOk) score += 10; // Koodi generoitu (10p) if (!r.error || r.testsTotal > 0) score += 10; // Testien läpäisy (60p) if (r.testsTotal > 0) score += Math.round((r.testsPassed / r.testsTotal) * 60); // Korjauskierrokset (20p: 0×=20, 1×=10, 2×=0) score += Math.max(0, 20 - r.fixRounds * 10); return Math.min(100, score); } function starsForScore(score) { if (score >= 90) return '★★★★★'; if (score >= 70) return '★★★★☆'; if (score >= 50) return '★★★☆☆'; if (score >= 25) return '★★☆☆☆'; if (score > 0) return '★☆☆☆☆'; return '☆☆☆☆☆'; } // === Main === async function main() { console.log('╔══════════════════════════════════════════════╗'); console.log('║ Kipinä Model Benchmark ║'); console.log('╚══════════════════════════════════════════════╝'); console.log(`Ollama: ${OLLAMA_URL}`); // Haetaan mallit let models; try { models = await ollamaListModels(); } catch (e) { console.error(`Ei yhteyttä Ollamaan (${OLLAMA_URL}): ${e.message}`); process.exit(1); } if (FILTER_MODELS) { const filter = FILTER_MODELS.split(',').map(s => s.trim()); models = models.filter(m => filter.some(f => m.includes(f))); } console.log(`Mallit (${models.length}): ${models.join(', ')}`); const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS : [SCENARIOS[0]]; console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`); console.log(`Tulokset: ${OUTPUT_DIR}/`); console.log(''); // Puhdista output rmSync(OUTPUT_DIR, { recursive: true, force: true }); mkdirSync(OUTPUT_DIR, { recursive: true }); const results = []; for (const model of models) { for (const scenario of scenarios) { console.log(`\n━━━ ${model} × ${scenario.id} ━━━`); const r = await runPipeline(model, scenario); results.push(r); const status = r.error ? `✗ ${r.error}` : r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` : `◐ ${r.testsPassed}/${r.testsTotal}`; const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : ''; console.log(` → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`); } } // === Tulostaulu === console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗'); console.log('║ TULOKSET ║'); console.log('╠══════════════════════════════════════════════════════════════════════════════════════════════════╣'); const header = [ 'Malli'.padEnd(40), 'Skenaario'.padEnd(10), 'Speksi'.padEnd(8), 'Testit'.padEnd(10), 'Korjaus'.padEnd(8), 'Ctx'.padEnd(7), 'Aika'.padEnd(8), 'tok/s'.padEnd(8), 'Pisteet', ].join(' │ '); console.log(`║ ${header} ║`); console.log('╠' + '═'.repeat(header.length + 2) + '╣'); for (const r of results) { const specStatus = r.specOk ? `✓ ${r.specEntities}e` : '✗'; const testStatus = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-'; const fixStatus = r.fixRounds > 0 ? `${r.fixRounds}×` : '-'; const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-'; const time = `${(r.totalDurationMs/1000).toFixed(0)}s`; const speed = `${r.avgTokPerSec.toFixed(0)}`; const row = [ r.model.padEnd(40), r.scenario.padEnd(10), specStatus.padEnd(8), testStatus.padEnd(10), fixStatus.padEnd(8), ctx.padEnd(7), time.padEnd(8), speed.padEnd(8), `${r.stars} ${r.score}`, ].join(' │ '); console.log(`║ ${row} ║`); } console.log('╚' + '═'.repeat(header.length + 2) + '╝'); // === Mallikohtainen yhteenveto === const modelNames = [...new Set(results.map(r => r.model))]; const scenarioIds = scenarios.map(s => s.id); console.log('\n'); const mHeader = [ 'Malli'.padEnd(35), ...scenarioIds.map(s => s.padEnd(22)), 'Yht.'.padEnd(8), 'Out'.padEnd(7), 'Aika'.padEnd(8), 'tok/s'.padEnd(7), 'Pisteet', ].join(' │ '); console.log(mHeader); console.log('─'.repeat(mHeader.length)); for (const model of modelNames) { const mrs = results.filter(r => r.model === model); const cols = scenarioIds.map(sid => { const r = mrs.find(r => r.scenario === sid); if (!r) return '-'.padEnd(22); const t = r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-'; const s = `${(r.totalDurationMs/1000).toFixed(0)}s`; const tok = r.totalTokens > 1000 ? `${(r.totalTokens/1000).toFixed(1)}K` : `${r.totalTokens}`; return `${t} ${s} ${tok}`.padEnd(22); }); const totalPassed = mrs.reduce((s, r) => s + r.testsPassed, 0); const totalTests = mrs.reduce((s, r) => s + r.testsTotal, 0); const totalTokens = mrs.reduce((s, r) => s + r.totalTokens, 0); const totalTime = mrs.reduce((s, r) => s + r.totalDurationMs, 0); const avgSpeed = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.avgTokPerSec, 0) / mrs.length) : 0; const avgScoreModel = mrs.length > 0 ? Math.round(mrs.reduce((s, r) => s + r.score, 0) / mrs.length) : 0; const pct = totalTests > 0 ? Math.round(totalPassed / totalTests * 100) : 0; const tokStr = totalTokens > 1000 ? `${(totalTokens/1000).toFixed(1)}K` : `${totalTokens}`; const row = [ model.padEnd(35), ...cols, `${totalPassed}/${totalTests}`.padEnd(8), tokStr.padEnd(7), `${(totalTime/1000).toFixed(0)}s`.padEnd(8), `${avgSpeed}`.padEnd(7), `${starsForScore(avgScoreModel)} ${avgScoreModel}p (${pct}%)`, ].join(' │ '); console.log(row); } // Tallenna JSON + HTML-raportti writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2)); const templatePath = join(__dirname, 'report-template.html'); if (existsSync(templatePath)) { const html = readFileSync(templatePath, 'utf-8').replace( '/*DATA_PLACEHOLDER*/[]', JSON.stringify(results) ); writeFileSync(`${OUTPUT_DIR}/report.html`, html); console.log(`\nRaportti: ${OUTPUT_DIR}/report.html`); } console.log(`JSON: ${OUTPUT_DIR}/results.json`); // Yhteenveto const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0); const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0); const failed = results.filter(r => r.error || r.testsTotal === 0); const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0; const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0); console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`); } main().catch(e => { console.error(e); process.exit(1); });