#!/usr/bin/env node
/**
 * Kipinä Model Benchmark
 *
 * Generates projects with different Ollama models and tests whether they work.
 *
 * Usage:
 *   node model-benchmark.mjs                               # all models, default scenario
 *   node model-benchmark.mjs --models qwen3:8b,qwen3:30b
 *   node model-benchmark.mjs --ollama http://host:11434
 *   node model-benchmark.mjs --scenarios all               # all scenarios
 */

import { execSync } from 'child_process';
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';

const __dirname = dirname(fileURLToPath(import.meta.url));

// === CLI arguments ===
const args = process.argv.slice(2);

/**
 * Look up the value that follows `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} fallback - Returned when the option is absent or has no value.
 * @returns {string} The option value, or `fallback`.
 */
function arg(name, fallback) {
  const i = args.indexOf(`--${name}`);
  if (i < 0) return fallback;
  const value = args[i + 1];
  // A following "--flag" is the next option, not this option's value.
  return value && !value.startsWith('--') ? value : fallback;
}

/**
 * Drop trailing slashes so that `${base}/api/...` never contains `//`.
 *
 * @param {string} url - Base URL as given by the user (may be empty).
 * @returns {string} The URL without trailing slashes.
 */
function trimTrailingSlash(url) {
  return url.replace(/\/+$/, '');
}

const OLLAMA_URL = trimTrailingSlash(
  arg('ollama', process.env.OLLAMA_URL || 'http://localhost:11434'),
);
const HUB_URL = trimTrailingSlash(arg('hub', '')); // Alternative: --hub https://kipina.studio
const FILTER_MODELS = arg('models', '');
const SCENARIO_FILTER = arg('scenarios', 'default');
const OUTPUT_DIR = arg('output', '/tmp/kipina-benchmark');
const MAX_FIX_ROUNDS = 2;

// === Thinking-tag cleanup (gemma4, qwen3/3.5, etc.) ===
/**
 * Remove the "thinking" sections that reasoning models emit before their answer.
 *
 * Paired blocks are removed first so that ordinary text in front of a block
 * survives. A closing `</think>` whose opening tag never appeared still drops
 * everything before it, and a bare gemma4 opening marker is still removed.
 *
 * NOTE(review): the gemma4 terminator is assumed to be `<channel|>` — confirm
 * against the model's chat template.
 *
 * @param {string} text - Raw completion text.
 * @returns {string} The text without thinking sections, trimmed.
 */
function stripThinking(text) {
  return text
    .replace(/<\|channel>thought[\s\S]*?<channel\|>/g, '') // gemma4: complete thought channel
    .replace(/<\|channel>thought/g, '') // gemma4: opening marker without a terminator
    .replace(/<think>[\s\S]*?<\/think>/g, '') // qwen3, qwen3.5: paired block
    .replace(/[\s\S]*?<\/think>/g, '') // closing tag whose opening tag was never emitted
    .trim();
}

/**
 * Throughput that stays finite when the duration is missing or zero.
 *
 * @param {number} count - Number of generated tokens.
 * @param {number} seconds - Elapsed generation time in seconds.
 * @returns {number} Tokens per second, or 0 when it cannot be computed.
 */
function tokensPerSecond(count, seconds) {
  return seconds > 0 && Number.isFinite(seconds) ? count / seconds : 0;
}

// === Ollama / Hub client ===

/**
 * Send one prompt to a model and return the cleaned answer with timing data.
 *
 * Uses the Hub route (`/api/v1/chat/completions`) when `HUB_URL` is set,
 * otherwise the Ollama route (`/api/chat`, non-streaming).
 *
 * @param {string} model - Model name.
 * @param {string} prompt - User prompt.
 * @param {string} [systemPrompt] - System prompt; omitted from the Ollama request when empty.
 * @param {number} [maxTokens=2048] - Generation limit sent to the backend.
 * @returns {Promise<{text: string, tokens: number, durationMs: number, tokPerSec: number}>}
 * @throws {Error} When the backend answers with a non-2xx status.
 */
async function ollamaChat(model, prompt, systemPrompt, maxTokens = 2048) {
  const start = Date.now();

  if (HUB_URL) {
    // Hub route: /api/v1/chat/completions
    const taskId = `bench-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    const resp = await fetch(`${HUB_URL}/api/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model,
        prompt,
        task_id: taskId,
        system_prompt: systemPrompt,
        max_tokens: maxTokens,
      }),
    });
    if (!resp.ok) throw new Error(`Hub HTTP ${resp.status}: ${await resp.text()}`);
    const data = await resp.json();
    const elapsed = Date.now() - start;
    const tokens = data.tokens_generated || 0;
    return {
      text: stripThinking(data.response || ''),
      tokens,
      durationMs: elapsed,
      // Prefer the rate reported by the Hub; otherwise derive it from wall-clock time.
      tokPerSec: data.tokens_per_sec || tokensPerSecond(tokens, elapsed / 1000),
    };
  }

  // Direct Ollama route: /api/chat
  const messages = [];
  if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
  messages.push({ role: 'user', content: prompt });

  const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      messages,
      stream: false,
      options: { num_predict: maxTokens, temperature: 0.7, top_k: 40, repeat_penalty: 1.15 },
    }),
  });
  if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}: ${await resp.text()}`);
  const data = await resp.json();
  const elapsed = Date.now() - start;

  const evalCount = data.eval_count || 0;
  // eval_duration is divided by 1e9 to get seconds; a missing value now yields
  // a rate of 0 instead of eval_count * 1e9.
  const tokPerSec = tokensPerSecond(evalCount, (data.eval_duration || 0) / 1e9);
  return {
    text: stripThinking(data.message?.content || ''),
    tokens: evalCount,
    durationMs: elapsed,
    tokPerSec,
  };
}

/**
 * List the model names known to the backend (Hub when `HUB_URL` is set, else Ollama).
 *
 * @returns {Promise<string[]>} Model names; entries without a name are skipped.
 * @throws {Error} When the backend answers with a non-2xx status.
 */
async function ollamaListModels() {
  const url = HUB_URL ? `${HUB_URL}/api/v1/ollama/tags` : `${OLLAMA_URL}/api/tags`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error(`Tags: HTTP ${resp.status}`);
  const data = await resp.json();
  return (data.models || []).map((m) => m?.name).filter(Boolean);
}
// === Prompts (copied from index.astro) ===
// The prompt texts are kept as single-spaced strings, as they appear in this file.
// NOTE(review): the numbered lists suggest they were once multi-line — confirm
// against index.astro before restoring line breaks.

/** System prompt: turns a short project idea into a structured requirements brief. */
const CLIENT_SYSTEM = [
  `You are a product owner who turns vague ideas into clear, actionable software requirements.`,
  `GIVEN a short project description from the user, produce a structured brief:`,
  `1. PROJECT NAME: a short, descriptive name`,
  `2. GOAL: one sentence explaining what the software does and who it's for`,
  `3. CORE FEATURES: numbered list of 3-8 concrete features (not vague wishes)`,
  `4. DATA MODEL: list the main entities and their key fields (include field types)`,
  `5. API ENDPOINTS: list the REST endpoints (method + path + purpose)`,
  `6. CONSTRAINTS: any technical constraints (e.g. "must use SQLite", "no auth needed")`,
  `RULES:`,
  `- Be specific: "User can filter todos by status" not "todo management"`,
  `- Use plain English, no code`,
  `- Maximum 400 words total`,
].join(' ');

/** System prompt: asks for a JSON data-model specification (entities, fields, relationships). */
const SPEC_SYSTEM = [
  `You are a software architect who designs database schemas for Python web applications.`,
  `THINK STEP BY STEP before outputting JSON:`,
  `1. What are the main ENTITIES (nouns) in this project?`,
  `2. What FIELDS does each entity need? (name, type, required?)`,
  `3. Which entities REFERENCE each other? (e.g. "a Book belongs to an Author" → Book has author_id)`,
  `4. Are there Date/DateTime fields? → add extra_imports`,
  `Then output ONLY valid JSON (no explanations before or after).`,
  `SCHEMA:`,
  `{"project_name":"short-name","description":"One sentence","entities":[{"name":"EntityName","table_name":"entity_names","fields":[{"name":"field_name","sa_type":"String(255)","py_type":"str","nullable":false,"default":null}]}],"relationships":[{"from":"ChildEntity","field":"parent_id","to":"ParentEntity","type":"many-to-one"}],"extra_imports":[]}`,
  `FIELD RULES:`,
  `- sa_type: String(N), Text, Integer, Date, DateTime, Boolean, Float`,
  `- py_type: str, int, float, bool, date, datetime — append " | None" if nullable`,
  `- Status fields: use String(20) with default value, NEVER Enum`,
  `- Every entity gets "id" automatically — do NOT add id or redundant ID fields`,
  `- Use snake_case for field names`,
  `RELATIONSHIP RULES:`,
  `- If entity A "belongs to" entity B → A has b_id field (Integer, nullable=false) + relationship entry`,
  `- EVERY _id field MUST have a matching relationship entry`,
  `- Parent entities must appear BEFORE children in the entities array`,
  `- If no relationships, set "relationships": []`,
  `AVOID: redundant ID fields, generic names, more than 7 fields or 3 entities, non-English entity/field names (ALWAYS English even if description is Finnish)`,
  `EXAMPLES (adapt, don't copy):`,
  `Todo app → Todo: title(str), description(Text|None), due_date(Date|None), status(String20="pending")`,
  `Blog → Author: name,email,bio(Text|None) / Post: title, content(Text), author_id→Author, published_at(DateTime|None), status(String20="draft")`,
].join(' ');

/** System prompt for the repair round: the model must answer with the corrected file only. */
const FIX_SYSTEM =
  'You are a Python code fixer. Return ONLY the corrected Python file. ' +
  'No markdown fences, no explanations — just valid Python code.';
// === Golden example ===
const GOLDEN_DIR = join(__dirname, 'golden-examples', 'todo');
const GOLDEN_FILES = ['models.py', 'schemas.py', 'main.py', 'test_main.py', 'pyproject.toml'];

/**
 * Build the reference-implementation text from the golden example files.
 *
 * Each file that exists in `GOLDEN_DIR` is added as a `=== <name> ===` section
 * holding its trimmed content.
 *
 * @returns {string} The reference block, or '' when the directory or all of
 *   its listed files are missing (a header with nothing under it would promise
 *   a reference that is not there).
 */
function loadGoldenExample() {
  if (!existsSync(GOLDEN_DIR)) return '';

  const sections = GOLDEN_FILES
    .map((name) => ({ name, path: join(GOLDEN_DIR, name) }))
    .filter(({ path }) => existsSync(path))
    .map(({ name, path }) => `=== ${name} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`);
  if (sections.length === 0) return '';

  const heading =
    '\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n';
  return heading + sections.join('');
}

const GOLDEN_EXAMPLE = loadGoldenExample();

/**
 * System prompt: asks for the five project files, separated by `=== <name> ===`
 * markers. Kept as a single-spaced string, as it appears in this file.
 */
const CODE_SYSTEM = [
  `You are a Python backend developer. Generate a complete FastAPI project with SQLAlchemy and SQLite.`,
  `Given the project requirements, JSON specification, and a REFERENCE IMPLEMENTATION, generate these 5 files:`,
  `1. models.py — SQLAlchemy 2.0: DeclarativeBase, Mapped, mapped_column (NOT legacy declarative_base)`,
  `2. schemas.py — Pydantic v2: ConfigDict(from_attributes=True) (NOT class Config)`,
  `3. main.py — FastAPI CRUD endpoints for each entity`,
  `4. test_main.py — Pytest with TestClient, separate test.db, unique test data per test`,
  `5. pyproject.toml — PEP 621 [project] format (NOT [tool.poetry])`,
  `OUTPUT FORMAT — use these exact markers to separate files:`,
  `=== models.py ===`,
  `=== schemas.py ===`,
  `=== main.py ===`,
  `=== test_main.py ===`,
  `=== pyproject.toml ===`,
  `DOCUMENTATION — every file must have a one-line module docstring. Classes get a one-line docstring. Keep it zensical: say what it IS, not what it does. No filler.`,
  `RULES:`,
  `- Follow the REFERENCE IMPLEMENTATION patterns exactly`,
  `- SQLAlchemy 2.0: DeclarativeBase + Mapped + mapped_column (not Column())`,
  `- Python type unions: str | None (not Optional[str])`,
  `- pyproject.toml: PEP 621 [project] format, requires-python = ">=3.14"`,
  `- Tests: unique descriptive data per test, NOT generic "test_title" strings`,
  `- Absolute imports only (from models import ..., from schemas import ...)`,
  `- NO markdown fences inside file content — just raw code`,
  `- Only test endpoints that exist in main.py — no extra tests`,
].join(' ');
// === File parser for the LLM response ===

/**
 * Remove a Markdown code fence wrapped around one file's content.
 *
 * Only a fence at the very start and/or very end is removed, and any language
 * tag is accepted (```py, ```python3, ```ini, ...), so a tag outside a fixed
 * list no longer leaves a stray first line behind.
 *
 * @param {string} content - Trimmed section text.
 * @returns {string} The content without the surrounding fence.
 */
function stripCodeFence(content) {
  const fenced = content.match(/^```[^\n]*\n([\s\S]*?)\n?```\s*$/);
  if (fenced) return fenced[1];
  // Unbalanced output: drop whichever fence line is present.
  return content.replace(/^```[^\n]*\n?/, '').replace(/\n?```\s*$/, '');
}

/**
 * Split a model response into files using the `=== name.ext ===` markers.
 *
 * @param {string} text - Full model response.
 * @returns {Object<string, string>} Map from file name to content ending in a
 *   newline. Sections without content are omitted; a later section with the
 *   same name replaces an earlier one.
 */
function parseGeneratedFiles(text) {
  const files = {};
  // With a capturing group, split() yields [preamble, name1, content1, name2, content2, ...].
  const sections = String(text ?? '').split(/===\s*(\S+\.(?:py|toml))\s*===/);
  for (let i = 1; i + 1 < sections.length; i += 2) {
    // The name comes from model output and is later used as a file path:
    // keep the base name only so it cannot point outside the project folder.
    const name = sections[i].split(/[\\/]/).pop();
    const content = stripCodeFence(sections[i + 1].trim()).trim();
    if (name && content) files[name] = `${content}\n`;
  }
  return files;
}

// === Validator ===
// NOTE(review): validateProjectCode() begins on this source line but is cut off
// mid-statement at the end of it, so it cannot be rewritten from this view. The
// visible opening is kept below, commented out and otherwise unchanged, so that
// it can be re-joined with its continuation:
//
// function validateProjectCode(files) {
//   const issues = [];
//   for (const [fname, code] of Object.entries(files)) {
//     if (!fname.endsWith('.py')) continue;
//     const lines = code.split('\n');
//     for (const line of lines) {
//       const m = line.match(/^from\s+\.(\w*)\s+import/);
//       if (m) issues.push(`ISSUE: ${fname}: relatiivinen import`);
//     }
//     for (const line of lines) {
//       const m = line.match(/^from\s+(models|schemas|main)\s+import\s+(.+)/);
//       if (!m) continue;
//       const srcCode = files[m[1] + '.py'];
//       if (!srcCode) { issues.push(`ISSUE: ${fname}: ${m[1]}.py puuttuu`); continue; }
//       const names = m[2].split(',').map(n => n.trim().split(/\s+as\s+/)[0].trim());
//       for (const name of names) {
//         if (name && !srcCode.includes(name)) issues.push(`ISSUE: ${fname}: "${name}" puuttuu ${m[1]}.py:stä`);
//       }
//     }
//     if (fname === 'schemas.py') {
//       if (/:\s*date\b/.test(code) && !/from datetime import/.test(code)) issues.push('ISSUE: schemas.py:
date-import puuttuu'); if (/:\s*datetime\b/.test(code) && !/from datetime import/.test(code)) issues.push('ISSUE: schemas.py: datetime-import puuttuu'); } for (let i = 0; i < lines.length; i++) { const line = lines[i]; if (/^\s*#/.test(line) || /^\s*$/.test(line)) continue; if (/(? !files[f]); if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; } // 4. Validointi + korjaussilmukka let issues = validateProjectCode(files); let fixRound = 0; while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) { fixRound++; console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`); const issuesByFile = {}; for (const issue of issues) { const m = issue.match(/^ISSUE:\s*(\S+?):/); const fname = m ? m[1] : 'unknown'; if (!issuesByFile[fname]) issuesByFile[fname] = []; issuesByFile[fname].push(issue); } for (const [fname, fIssues] of Object.entries(issuesByFile)) { if (!files[fname]) continue; const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``; const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048); timings.push(fixResp); if (fixResp.text) { files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n'; } } issues = validateProjectCode(files); } result.validationIssues = issues.length; result.fixRounds = fixRound; // Korjaa pyproject.toml jos malli generoi Poetry-muodon if (files['pyproject.toml'] && !files['pyproject.toml'].includes('[project]')) { const goldenPyproject = join(GOLDEN_DIR, 'pyproject.toml'); if (existsSync(goldenPyproject)) { const nameMatch = files['pyproject.toml'].match(/name\s*=\s*"([^"]+)"/); const name = nameMatch ? 
nameMatch[1] : 'generated-app'; files['pyproject.toml'] = readFileSync(goldenPyproject, 'utf-8').replace(/name = "[^"]+"/, `name = "${name}"`); } } // Kirjoita tiedostot levylle for (const [fn, content] of Object.entries(files)) writeFileSync(`${dir}/${fn}`, content); // 5. Pytest console.log(` [5/5] Pytest...`); try { const uvPath = process.env.HOME + '/.local/bin/uv'; const uv = existsSync(uvPath) ? uvPath : 'uv'; execSync(`cd "${dir}" && ${uv} sync 2>/dev/null`, { timeout: 60000, stdio: 'pipe' }); execSync(`cd "${dir}" && rm -f app.db test.db`, { stdio: 'pipe' }); const pytestOut = execSync(`cd "${dir}" && ${uv} run pytest test_main.py -v --tb=short 2>&1`, { timeout: 60000, encoding: 'utf-8' }); writeFileSync(`${dir}/_pytest.txt`, pytestOut); const passedMatch = pytestOut.match(/(\d+) passed/); const failedMatch = pytestOut.match(/(\d+) failed/); result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0; result.testsFailed = failedMatch ? parseInt(failedMatch[1]) : 0; result.testsTotal = result.testsPassed + result.testsFailed; } catch (e) { const output = e.stdout || e.stderr || e.message || ''; writeFileSync(`${dir}/_pytest.txt`, output); const passedMatch = output.match(/(\d+) passed/); const failedMatch = output.match(/(\d+) failed/); const errorMatch = output.match(/(\d+) error/); result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0; result.testsFailed = (failedMatch ? parseInt(failedMatch[1]) : 0) + (errorMatch ? parseInt(errorMatch[1]) : 0); result.testsTotal = result.testsPassed + result.testsFailed; if (result.testsTotal === 0) result.error = 'Pytest kaatui'; } } catch (e) { result.error = e.message; } // Yhteenveto result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0); result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0); result.avgTokPerSec = timings.length > 0 ? 
// NOTE(review): the source line this block starts on also carries the closing
// statements of runPipeline(), whose opening cannot be recovered from this
// view. They are kept here, commented out and otherwise unchanged, and must be
// re-attached to runPipeline():
//
//     timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
//   return result;
// }

// === Main ===

/** Inner width of the start-up banner (without the two border characters). */
const BANNER_WIDTH = 46;

/** Label shown in the result table for each verdict. */
const VERDICT_LABEL = { pass: '✓ PASS', partial: '◐ PARTIAL', fail: '✗ FAIL' };

/** Result table columns as [title, width] pairs. */
const RESULT_COLUMNS = [
  ['Malli', 40],
  ['Skenaario', 10],
  ['Speksi', 8],
  ['Testit', 10],
  ['Korjaus', 8],
  ['Aika', 8],
  ['tok/s', 8],
  ['Tulos', 9],
];

/** Centre `text` in a field of `width` characters. */
function centerText(text, width) {
  return text.padStart(Math.floor((width + text.length) / 2)).padEnd(width);
}

/** Pad each cell to its column width and join the cells with the column separator. */
function formatRow(cells) {
  return cells.map((cell, i) => String(cell).padEnd(RESULT_COLUMNS[i][1])).join(' │ ');
}

/**
 * Classify one pipeline result. The status line, the table and the summary all
 * use this single rule, so a run can no longer be PARTIAL in one place and
 * FAIL in another.
 *
 * @param {{error?: string, testsPassed: number, testsTotal: number}} r
 * @returns {'pass'|'partial'|'fail'}
 */
function verdictOf(r) {
  if (r.error || !(r.testsTotal > 0)) return 'fail';
  return r.testsPassed === r.testsTotal ? 'pass' : 'partial';
}

/**
 * Pick the scenarios to run: 'all', 'default' (the first one), or a
 * comma-separated list of scenario ids. Ids that match nothing fall back to
 * the default scenario with a warning.
 *
 * @param {string} filter - Value of `--scenarios`.
 * @returns {Array<{id: string}>} Scenarios to run.
 */
function selectScenarios(filter) {
  if (filter === 'all') return SCENARIOS;
  if (filter === 'default') return [SCENARIOS[0]];
  const ids = new Set(filter.split(',').map((s) => s.trim()).filter(Boolean));
  const picked = SCENARIOS.filter((s) => ids.has(s.id));
  if (picked.length > 0) return picked;
  console.warn(`Tuntematon skenaario "${filter}" — käytetään oletusta`);
  return [SCENARIOS[0]];
}

/**
 * Run the benchmark: list the models, run every model × scenario pair through
 * runPipeline(), print a result table and write results.json into OUTPUT_DIR.
 * Exits with status 1 when the backend is unreachable or no model is selected.
 */
async function main() {
  const bannerRule = '═'.repeat(BANNER_WIDTH);
  console.log(`╔${bannerRule}╗`);
  console.log(`║${centerText('Kipinä Model Benchmark', BANNER_WIDTH)}║`);
  console.log(`╚${bannerRule}╝`);
  console.log(HUB_URL ? `Hub: ${HUB_URL}` : `Ollama: ${OLLAMA_URL}`);

  // Fetch the models
  let models;
  try {
    models = await ollamaListModels();
  } catch (e) {
    const target = HUB_URL ? `Hubiin (${HUB_URL})` : `Ollamaan (${OLLAMA_URL})`;
    console.error(`Ei yhteyttä ${target}: ${e.message}`);
    process.exit(1);
  }
  if (FILTER_MODELS) {
    // Empty entries ("a,,b" or a trailing comma) would match every model name.
    const wanted = FILTER_MODELS.split(',').map((s) => s.trim()).filter(Boolean);
    models = models.filter((m) => wanted.some((f) => m.includes(f)));
  }
  console.log(`Mallit (${models.length}): ${models.join(', ')}`);
  if (models.length === 0) {
    console.error('Ei yhtään ajettavaa mallia (tarkista --models).');
    process.exit(1);
  }

  const scenarios = selectScenarios(SCENARIO_FILTER);
  console.log(`Skenaariot (${scenarios.length}): ${scenarios.map((s) => s.id).join(', ')}`);
  console.log(`Tulokset: ${OUTPUT_DIR}/`);
  console.log('');

  // Start from an empty output directory
  rmSync(OUTPUT_DIR, { recursive: true, force: true });
  mkdirSync(OUTPUT_DIR, { recursive: true });

  const results = [];
  // Runs are awaited one at a time, in model-then-scenario order.
  for (const model of models) {
    for (const scenario of scenarios) {
      console.log(`\n━━━ ${model} × ${scenario.id} ━━━`);
      const r = await runPipeline(model, scenario);
      results.push(r);

      const kind = verdictOf(r);
      const tally = `${r.testsPassed}/${r.testsTotal}`;
      let status;
      if (kind === 'pass') status = `✓ ${tally}`;
      else if (kind === 'partial') status = `◐ ${tally}`;
      else status = `✗ ${r.error || tally}`;
      console.log(`  → ${status} | ${(r.totalDurationMs / 1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s`);
    }
  }

  // === Result table ===
  // Every border is derived from the header width so the box always lines up.
  const header = formatRow(RESULT_COLUMNS.map(([title]) => title));
  const tableRule = '═'.repeat(header.length + 2);
  console.log(`\n\n╔${tableRule}╗`);
  console.log(`║ ${'TULOKSET'.padEnd(header.length)} ║`);
  console.log(`╠${tableRule}╣`);
  console.log(`║ ${header} ║`);
  console.log(`╠${tableRule}╣`);

  for (const r of results) {
    const row = formatRow([
      r.model,
      r.scenario,
      r.specOk ? `✓ ${r.specEntities}e` : '✗',
      r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` : '-',
      r.fixRounds > 0 ? `${r.fixRounds}×` : '-',
      `${(r.totalDurationMs / 1000).toFixed(0)}s`,
      `${r.avgTokPerSec.toFixed(0)}`,
      VERDICT_LABEL[verdictOf(r)],
    ]);
    console.log(`║ ${row} ║`);
  }
  console.log(`╚${tableRule}╝`);

  // Save JSON
  writeFileSync(`${OUTPUT_DIR}/results.json`, JSON.stringify(results, null, 2));
  console.log(`\nJSON: ${OUTPUT_DIR}/results.json`);

  // Summary
  const counts = { pass: 0, partial: 0, fail: 0 };
  for (const r of results) counts[verdictOf(r)] += 1;
  console.log(`\n✓ PASS: ${counts.pass} | ◐ PARTIAL: ${counts.partial} | ✗ FAIL: ${counts.fail} | Yhteensä: ${results.length}`);
}

main().catch((e) => {
  console.error(e);
  process.exit(1);
});