diff --git a/kipina-codebench/Dockerfile.cargo-test b/kipina-codebench/Dockerfile.cargo-test new file mode 100644 index 0000000..741b589 --- /dev/null +++ b/kipina-codebench/Dockerfile.cargo-test @@ -0,0 +1,4 @@ +FROM rust:1.87-slim +RUN apt-get update && apt-get install -y pkg-config libssl-dev && rm -rf /var/lib/apt/lists/* +WORKDIR /work +ENTRYPOINT ["sh", "-c", "cp -r /src/* . && cargo test 2>&1"] diff --git a/kipina-codebench/benchmark.mjs b/kipina-codebench/benchmark.mjs index bb09d8b..a8a3e3f 100644 --- a/kipina-codebench/benchmark.mjs +++ b/kipina-codebench/benchmark.mjs @@ -33,6 +33,7 @@ const TIMESTAMP = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 16); const OUTPUT_DIR = arg('output', `/tmp/kipina-benchmark/${TIMESTAMP}`); const RESULTS_DIR = join(__dirname, 'results'); const THINK_MODE = args.includes('--think'); +const LANG = arg('lang', 'python'); // python | rust const MAX_FIX_ROUNDS = 2; // === Promptien lataus tiedostoista === @@ -43,18 +44,32 @@ function loadPrompt(name) { } const CLIENT_SYSTEM = loadPrompt('client'); const SPEC_SYSTEM = loadPrompt('spec'); -const CODE_SYSTEM = loadPrompt('code'); +const CODE_SYSTEM = loadPrompt(LANG === 'rust' ? 'code-rs' : 'code'); const FIX_SYSTEM = loadPrompt('fix'); -// === Kultaisten esimerkkien lataus === +// === Kultaisten esimerkkien lataus (kielen mukaan) === const GOLDEN_DIR = join(__dirname, 'golden-examples'); -const GOLDEN_PY_FILES = ['models.py', 'schemas.py', 'main.py', 'test_main.py']; +const LANG_CONFIG = { + python: { + goldenDir: 'todo', + files: ['models.py', 'schemas.py', 'main.py', 'test_main.py'], + required: ['models.py', 'schemas.py', 'main.py', 'test_main.py'], + dockerImage: 'kipina-pytest', + }, + rust: { + goldenDir: 'todo-rs', + files: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'], + required: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'], + dockerImage: 'kipina-cargo-test', + }, +}; +const LCONF = LANG_CONFIG[LANG] || LANG_CONFIG.python; function loadGoldenExample() { - const todoDir = join(GOLDEN_DIR, 'todo'); + const todoDir = join(GOLDEN_DIR, LCONF.goldenDir); if (!existsSync(todoDir)) return ''; - let example = '\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n'; - for (const f of GOLDEN_PY_FILES) { + let example = `\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n`; + for (const f of LCONF.files) { const path = join(todoDir, f); if (existsSync(path)) example += `=== ${f} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`; } @@ -129,14 +144,40 @@ async function ollamaListModels() { return (data.models || []).map(m => m.name); } +// === Testitulosten parsinta (pytest + cargo test) === +function parseTestOutput(output) { + // Pytest: "6 passed", "2 failed", "1 error" + const pyPassed = output.match(/(\d+) passed/); + const pyFailed = output.match(/(\d+) failed/); + const pyError = output.match(/(\d+) error/); + if (pyPassed || pyFailed) { + const passed = pyPassed ? parseInt(pyPassed[1]) : 0; + const failed = (pyFailed ? parseInt(pyFailed[1]) : 0) + (pyError ? parseInt(pyError[1]) : 0); + return { testsPassed: passed, testsFailed: failed, testsTotal: passed + failed }; + } + // Cargo test: "test result: ok. 10 passed; 0 failed;" + const cargoMatch = output.match(/test result: \w+\.\s*(\d+) passed;\s*(\d+) failed/); + if (cargoMatch) { + const passed = parseInt(cargoMatch[1]); + const failed = parseInt(cargoMatch[2]); + return { testsPassed: passed, testsFailed: failed, testsTotal: passed + failed }; + } + // Cargo compilation error: count "error[E" occurrences + const compileErrors = (output.match(/error\[E\d+\]/g) || []).length; + if (compileErrors > 0) { + return { testsPassed: 0, testsFailed: compileErrors, testsTotal: compileErrors }; + } + return { testsPassed: 0, testsFailed: 0, testsTotal: 0 }; +} + // === Tiedostoparseri LLM-vastauksesta === function parseGeneratedFiles(text) { const files = {}; - const sections = text.split(/===\s*(\S+\.(?:py|toml))\s*===/); + const sections = text.split(/===\s*(\S+\.(?:py|toml|rs))\s*===/); for (let i = 1; i < sections.length - 1; i += 2) { const name = sections[i]; let content = sections[i + 1].trim(); - content = content.replace(/^```(?:python|toml)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim(); + content = content.replace(/^```(?:python|toml|rust)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim(); if (content) files[name] = content + '\n'; } return files; @@ -251,72 +292,70 @@ async function runPipeline(model, scenario) { // 3. LLM-koodigenerointi console.log(` [3/5] Koodigenerointi (LLM)...`); - const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all 4 files. Follow the reference implementation patterns exactly.`; + const fileCount = LCONF.required.length; + const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`; result.promptChars = CODE_SYSTEM.length + codePrompt.length; result.promptTokensEst = Math.round(result.promptChars / 4); - const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, 8192); + const codeTokens = LANG === 'rust' ? 12288 : 8192; + const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, codeTokens); timings.push(codeResp); writeFileSync(`${dir}/_code_raw.txt`, codeResp.text); const files = parseGeneratedFiles(codeResp.text); - const required = ['models.py', 'schemas.py', 'main.py', 'test_main.py']; - const missing = required.filter(f => !files[f]); + const missing = LCONF.required.filter(f => !files[f]); if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; } - // 4. Validointi + korjaussilmukka - let issues = validateProjectCode(files); + // 4. Validointi + korjaussilmukka (Python-spesifi) let fixRound = 0; - while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) { - fixRound++; - console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`); - const issuesByFile = {}; - for (const issue of issues) { - const m = issue.match(/^ISSUE:\s*(\S+?):/); - const fname = m ? m[1] : 'unknown'; - if (!issuesByFile[fname]) issuesByFile[fname] = []; - issuesByFile[fname].push(issue); - } - for (const [fname, fIssues] of Object.entries(issuesByFile)) { - if (!files[fname]) continue; - const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``; - const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048); - timings.push(fixResp); - if (fixResp.text) { - files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n'; + if (LANG === 'python') { + let issues = validateProjectCode(files); + while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) { + fixRound++; + console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`); + const issuesByFile = {}; + for (const issue of issues) { + const m = issue.match(/^ISSUE:\s*(\S+?):/); + const fname = m ? m[1] : 'unknown'; + if (!issuesByFile[fname]) issuesByFile[fname] = []; + issuesByFile[fname].push(issue); } + for (const [fname, fIssues] of Object.entries(issuesByFile)) { + if (!files[fname]) continue; + const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``; + const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048); + timings.push(fixResp); + if (fixResp.text) { + files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n'; + } + } + issues = validateProjectCode(files); } - issues = validateProjectCode(files); + result.validationIssues = issues.length; } - result.validationIssues = issues.length; result.fixRounds = fixRound; - // Kirjoita LLM:n generoimat Python-tiedostot + // Kirjoita LLM:n generoimat tiedostot (luo src/ ja tests/ alihakemistot tarvittaessa) for (const [fn, content] of Object.entries(files)) { - if (fn.endsWith('.py')) writeFileSync(`${dir}/${fn}`, content); + const filePath = join(dir, fn); + mkdirSync(dirname(filePath), { recursive: true }); + writeFileSync(filePath, content); } - // 5. Pytest Docker-kontissa (kipina-pytest image) - console.log(` [5/5] Pytest (Docker)...`); + // 5. Testit Docker-kontissa + const testLabel = LANG === 'rust' ? 'Cargo test (Docker)' : 'Pytest (Docker)'; + console.log(` [5/5] ${testLabel}...`); + const dockerTimeout = LANG === 'rust' ? 300000 : 120000; try { - const pytestOut = execSync( - `docker run --rm -v "${dir}:/src:ro" kipina-pytest 2>&1`, - { timeout: 120000, encoding: 'utf-8' } + const testOut = execSync( + `docker run --rm -v "${dir}:/src:ro" ${LCONF.dockerImage} 2>&1`, + { timeout: dockerTimeout, encoding: 'utf-8' } ); - writeFileSync(`${dir}/_pytest.txt`, pytestOut); - const passedMatch = pytestOut.match(/(\d+) passed/); - const failedMatch = pytestOut.match(/(\d+) failed/); - result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0; - result.testsFailed = failedMatch ? parseInt(failedMatch[1]) : 0; - result.testsTotal = result.testsPassed + result.testsFailed; + writeFileSync(`${dir}/_testout.txt`, testOut); + Object.assign(result, parseTestOutput(testOut)); } catch (e) { const output = e.stdout || e.stderr || e.message || ''; - writeFileSync(`${dir}/_pytest.txt`, output); - const passedMatch = output.match(/(\d+) passed/); - const failedMatch = output.match(/(\d+) failed/); - const errorMatch = output.match(/(\d+) error/); - result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0; - result.testsFailed = (failedMatch ? parseInt(failedMatch[1]) : 0) + (errorMatch ? parseInt(errorMatch[1]) : 0); - result.testsTotal = result.testsPassed + result.testsFailed; - if (result.testsTotal === 0) result.error = 'Pytest kaatui'; + writeFileSync(`${dir}/_testout.txt`, output); + Object.assign(result, parseTestOutput(output)); + if (result.testsTotal === 0) result.error = 'Testit kaatuivat'; } } catch (e) { result.error = e.message; @@ -337,7 +376,7 @@ async function main() { console.log('╔══════════════════════════════════════════════╗'); console.log('║ Kipinä CodeBench ║'); console.log('╚══════════════════════════════════════════════╝'); - console.log(`Ollama: ${OLLAMA_URL}${THINK_MODE ? ' 🧠 thinking ON (3× tokens)' : ''}`); + console.log(`Ollama: ${OLLAMA_URL} 📝 ${LANG}${THINK_MODE ? ' 🧠 thinking ON' : ''}`); // Haetaan mallit let models; diff --git a/kipina-codebench/golden-examples/todo-rs/.gitignore b/kipina-codebench/golden-examples/todo-rs/.gitignore new file mode 100644 index 0000000..2f7896d --- /dev/null +++ b/kipina-codebench/golden-examples/todo-rs/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/kipina-codebench/prompts/code-rs.md b/kipina-codebench/prompts/code-rs.md new file mode 100644 index 0000000..f63cdb7 --- /dev/null +++ b/kipina-codebench/prompts/code-rs.md @@ -0,0 +1,46 @@ +You are a Rust backend developer. Generate an Axum web project with SQLx and SQLite. + +Given the project requirements, JSON specification, and a REFERENCE IMPLEMENTATION, generate these files: + +1. Cargo.toml — axum 0.8, tokio, serde/serde_json, sqlx (sqlite, runtime-tokio), tower-http +2. src/models.rs — Structs with Serialize, Deserialize, FromRow derives +3. src/handlers.rs — Async handler functions for each CRUD endpoint +4. src/lib.rs — Public app() function returning Router, init_db() for table creation +5. src/main.rs — Binary entry point, connect to SQLite, bind to port +6. tests/api_test.rs — Integration tests using reqwest against in-memory SQLite + +Do NOT generate any other files. + +OUTPUT FORMAT — use these exact markers to separate files: + +=== Cargo.toml === + + +=== src/models.rs === + + +=== src/handlers.rs === + + +=== src/lib.rs === + + +=== src/main.rs === + + +=== tests/api_test.rs === + + +DOCUMENTATION — every file starts with //! one-line module doc. Structs get /// one-line doc. Zensical: say what it IS, not what it does. + +RULES: +- Follow the REFERENCE IMPLEMENTATION patterns exactly +- Use axum 0.8 API: Router, Json, Path, State, StatusCode +- State is SqlitePool wrapped in axum::extract::State +- Handlers return (StatusCode, Json) or StatusCode +- POST returns 201, DELETE returns 204, GET missing returns 404 +- sqlx::query_as for reads, sqlx::query for writes +- Tests: each test spawns isolated server with in-memory SQLite on random port +- Tests: unique descriptive data, NOT generic "test" strings +- NO markdown fences inside file content — just raw code +- Edition 2024 in Cargo.toml diff --git a/kipina-codebench/results/2026-04-14T06-49.html b/kipina-codebench/results/2026-04-14T06-49.html new file mode 100644 index 0000000..e3e7563 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T06-49.html @@ -0,0 +1,183 @@ + + + + + +Kipina Model Benchmark + + + + +

Kipina Model Benchmark

+
+ +
+ +

Mallikohtainen yhteenveto

+
+ +

Kaikki tulokset

+
+ + + + diff --git a/kipina-codebench/results/2026-04-14T06-49.json b/kipina-codebench/results/2026-04-14T06-49.json new file mode 100644 index 0000000..8d72498 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T06-49.json @@ -0,0 +1,422 @@ +[ + { + "model": "qwen3.5:9b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 3, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 65901, + "totalTokens": 5056, + "avgTokPerSec": 82.99139473832963, + "promptChars": 12334, + "promptTokensEst": 3084, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "qwen3.5:9b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 1, + "fixRounds": 2, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 74087, + "totalTokens": 5645, + "avgTokPerSec": 83.57073831360164, + "promptChars": 10757, + "promptTokensEst": 2689, + "score": 20, + "stars": "★☆☆☆☆", + "error": null + }, + { + "model": "qwen3.5:9b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 49830, + "totalTokens": 3803, + "avgTokPerSec": 83.26266260763309, + "promptChars": 10826, + "promptTokensEst": 2707, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "gemma4:e4b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 57032, + "totalTokens": 4924, + "avgTokPerSec": 106.02334905805122, + "promptChars": 11313, + "promptTokensEst": 2828, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "gemma4:e4b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 7, + "testsPassed": 5, + "testsFailed": 2, + "totalDurationMs": 54307, + "totalTokens": 5060, + "avgTokPerSec": 106.89447491163497, + "promptChars": 11225, + "promptTokensEst": 2806, + "score": 83, + "stars": "★★★★☆", + "error": null + }, + { + "model": "gemma4:e4b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 11, + "testsPassed": 2, + "testsFailed": 9, + "totalDurationMs": 57080, + "totalTokens": 5310, + "avgTokPerSec": 106.64914988130955, + "promptChars": 11791, + "promptTokensEst": 2948, + "score": 51, + "stars": "★★★☆☆", + "error": null + }, + { + "model": "qwen2.5-coder:3b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 3, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 22377, + "totalTokens": 3534, + "avgTokPerSec": 201.24475679283708, + "promptChars": 11479, + "promptTokensEst": 2870, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "qwen2.5-coder:3b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 8, + "fixRounds": 2, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 44520, + "totalTokens": 7495, + "avgTokPerSec": 201.87149050701015, + "promptChars": 11886, + "promptTokensEst": 2972, + "score": 20, + "stars": "★☆☆☆☆", + "error": null + }, + { + "model": "qwen2.5-coder:3b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 20136, + "totalTokens": 3338, + "avgTokPerSec": 200.86152095722105, + "promptChars": 11228, + "promptTokensEst": 2807, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "qwen2.5-coder:7b", + "scenario": "todo", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui" + }, + { + "model": "qwen2.5-coder:7b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 20012, + "totalTokens": 2119, + "avgTokPerSec": 122.7557304112134, + "promptChars": 10342, + "promptTokensEst": 2586, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "qwen2.5-coder:7b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 26133, + "totalTokens": 2715, + "avgTokPerSec": 121.94987205993503, + "promptChars": 11193, + "promptTokensEst": 2798, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "qwen3:14b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 44757, + "totalTokens": 2156, + "avgTokPerSec": 60.77636586631207, + "promptChars": 9635, + "promptTokensEst": 2409, + "score": 100, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:14b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 7, + "testsPassed": 7, + "testsFailed": 0, + "totalDurationMs": 41166, + "totalTokens": 2282, + "avgTokPerSec": 61.14821289733007, + "promptChars": 9575, + "promptTokensEst": 2394, + "score": 100, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:14b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 12, + "testsPassed": 12, + "testsFailed": 0, + "totalDurationMs": 66478, + "totalTokens": 3681, + "avgTokPerSec": 60.493817783668725, + "promptChars": 10500, + "promptTokensEst": 2625, + "score": 100, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 7, + "testsPassed": 7, + "testsFailed": 0, + "totalDurationMs": 29801, + "totalTokens": 2249, + "avgTokPerSec": 98.5661742189331, + "promptChars": 9615, + "promptTokensEst": 2404, + "score": 100, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 8, + "testsPassed": 6, + "testsFailed": 2, + "totalDurationMs": 22974, + "totalTokens": 2050, + "avgTokPerSec": 101.2398768597589, + "promptChars": 9273, + "promptTokensEst": 2318, + "score": 85, + "stars": "★★★★☆", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 12, + "testsPassed": 12, + "testsFailed": 0, + "totalDurationMs": 39335, + "totalTokens": 3537, + "avgTokPerSec": 100.10984073540648, + "promptChars": 10525, + "promptTokensEst": 2631, + "score": 100, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:4b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 58668, + "totalTokens": 7134, + "avgTokPerSec": 141.76822189196028, + "promptChars": 15202, + "promptTokensEst": 3801, + "score": 100, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:4b", + "scenario": "users", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui" + }, + { + "model": "qwen3:4b", + "scenario": "blog", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui" + } +] \ No newline at end of file diff --git a/kipina-codebench/results/2026-04-14T07-13.html b/kipina-codebench/results/2026-04-14T07-13.html new file mode 100644 index 0000000..fa868ed --- /dev/null +++ b/kipina-codebench/results/2026-04-14T07-13.html @@ -0,0 +1,183 @@ + + + + + +Kipina Model Benchmark + + + + +

Kipina Model Benchmark

+
+ +
+ +

Mallikohtainen yhteenveto

+
+ +

Kaikki tulokset

+
+ + + + diff --git a/kipina-codebench/results/2026-04-14T07-13.json b/kipina-codebench/results/2026-04-14T07-13.json new file mode 100644 index 0000000..2d4fe52 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T07-13.json @@ -0,0 +1,122 @@ +[ + { + "model": "qwen3:14b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 186642, + "totalTokens": 10237, + "avgTokPerSec": 59.06411550065281, + "promptChars": 10576, + "promptTokensEst": 2644, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "qwen3:14b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 121848, + "totalTokens": 6735, + "avgTokPerSec": 59.85231850668119, + "promptChars": 9684, + "promptTokensEst": 2421, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "qwen3:14b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 11, + "testsPassed": 9, + "testsFailed": 2, + "totalDurationMs": 83491, + "totalTokens": 4677, + "avgTokPerSec": 60.222832434869694, + "promptChars": 10423, + "promptTokensEst": 2606, + "score": 89, + "stars": "★★★★☆", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 56288, + "totalTokens": 5235, + "avgTokPerSec": 99.60027546406452, + "promptChars": 9307, + "promptTokensEst": 2327, + "score": 100, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 5, + "testsFailed": 1, + "totalDurationMs": 59639, + "totalTokens": 5526, + "avgTokPerSec": 99.6742208632186, + "promptChars": 9158, + "promptTokensEst": 2290, + "score": 90, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 11, + "testsPassed": 10, + "testsFailed": 1, + "totalDurationMs": 131793, + "totalTokens": 11779, + "avgTokPerSec": 97.17878362853351, + "promptChars": 10390, + "promptTokensEst": 2598, + "score": 95, + "stars": "★★★★★", + "error": null + } +] \ No newline at end of file diff --git a/kipina-codebench/results/2026-04-14T07-18.html b/kipina-codebench/results/2026-04-14T07-18.html new file mode 100644 index 0000000..2485e40 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T07-18.html @@ -0,0 +1,183 @@ + + + + + +Kipina Model Benchmark + + + + +

Kipina Model Benchmark

+
+ +
+ +

Mallikohtainen yhteenveto

+
+ +

Kaikki tulokset

+
+ + + + diff --git a/kipina-codebench/results/2026-04-14T07-18.json b/kipina-codebench/results/2026-04-14T07-18.json new file mode 100644 index 0000000..91453b3 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T07-18.json @@ -0,0 +1,122 @@ +[ + { + "model": "qwen3:14b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 66903, + "totalTokens": 5454, + "avgTokPerSec": 86.45918994499432, + "promptChars": 9985, + "promptTokensEst": 2496, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "qwen3:14b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 87618, + "totalTokens": 7150, + "avgTokPerSec": 87.21782190501095, + "promptChars": 9922, + "promptTokensEst": 2481, + "score": 40, + "stars": "★★☆☆☆", + "error": null + }, + { + "model": "qwen3:14b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 9, + "testsPassed": 5, + "testsFailed": 4, + "totalDurationMs": 78398, + "totalTokens": 6427, + "avgTokPerSec": 85.52353711143463, + "promptChars": 10737, + "promptTokensEst": 2684, + "score": 73, + "stars": "★★★★☆", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 8, + "testsPassed": 7, + "testsFailed": 1, + "totalDurationMs": 82750, + "totalTokens": 10054, + "avgTokPerSec": 139.90690936146032, + "promptChars": 9360, + "promptTokensEst": 2340, + "score": 93, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 32233, + "totalTokens": 4404, + "avgTokPerSec": 143.4997404058814, + "promptChars": 9310, + "promptTokensEst": 2328, + "score": 100, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 88563, + "totalTokens": 11575, + "avgTokPerSec": 141.54675017528362, + "promptChars": 10567, + "promptTokensEst": 2642, + "score": 40, + "stars": "★★☆☆☆", + "error": null + } +] \ No newline at end of file