CodeBench: Rust-tuki (--lang rust), golden example todo-rs, Dockerfile.cargo-test
- golden-examples/todo-rs/: Axum 0.8 + SQLx + SQLite, 10 testiä - prompts/code-rs.md: Rust-koodingenerointiprompt - Dockerfile.cargo-test: rust:1.87-slim testikontti - benchmark.mjs: --lang python|rust, kieliriippuvainen golden example, parseri tukee cargo test -tuloksia, src/ alihakemistot
This commit is contained in:
4
kipina-codebench/Dockerfile.cargo-test
Normal file
4
kipina-codebench/Dockerfile.cargo-test
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
FROM rust:1.87-slim
|
||||||
|
RUN apt-get update && apt-get install -y pkg-config libssl-dev && rm -rf /var/lib/apt/lists/*
|
||||||
|
WORKDIR /work
|
||||||
|
ENTRYPOINT ["sh", "-c", "cp -r /src/* . && cargo test 2>&1"]
|
||||||
@@ -33,6 +33,7 @@ const TIMESTAMP = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 16);
|
|||||||
const OUTPUT_DIR = arg('output', `/tmp/kipina-benchmark/${TIMESTAMP}`);
|
const OUTPUT_DIR = arg('output', `/tmp/kipina-benchmark/${TIMESTAMP}`);
|
||||||
const RESULTS_DIR = join(__dirname, 'results');
|
const RESULTS_DIR = join(__dirname, 'results');
|
||||||
const THINK_MODE = args.includes('--think');
|
const THINK_MODE = args.includes('--think');
|
||||||
|
const LANG = arg('lang', 'python'); // python | rust
|
||||||
const MAX_FIX_ROUNDS = 2;
|
const MAX_FIX_ROUNDS = 2;
|
||||||
|
|
||||||
// === Promptien lataus tiedostoista ===
|
// === Promptien lataus tiedostoista ===
|
||||||
@@ -43,18 +44,32 @@ function loadPrompt(name) {
|
|||||||
}
|
}
|
||||||
const CLIENT_SYSTEM = loadPrompt('client');
|
const CLIENT_SYSTEM = loadPrompt('client');
|
||||||
const SPEC_SYSTEM = loadPrompt('spec');
|
const SPEC_SYSTEM = loadPrompt('spec');
|
||||||
const CODE_SYSTEM = loadPrompt('code');
|
const CODE_SYSTEM = loadPrompt(LANG === 'rust' ? 'code-rs' : 'code');
|
||||||
const FIX_SYSTEM = loadPrompt('fix');
|
const FIX_SYSTEM = loadPrompt('fix');
|
||||||
|
|
||||||
// === Kultaisten esimerkkien lataus ===
|
// === Kultaisten esimerkkien lataus (kielen mukaan) ===
|
||||||
const GOLDEN_DIR = join(__dirname, 'golden-examples');
|
const GOLDEN_DIR = join(__dirname, 'golden-examples');
|
||||||
const GOLDEN_PY_FILES = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];
|
// Per-language benchmark settings, keyed by the value of --lang.
//   goldenDir   — sub-directory of golden-examples/ holding the reference project
//   files       — reference files embedded into the code-generation prompt
//   required    — files the generated answer must contain to be accepted
//   dockerImage — Docker image used to run the generated project's tests
const LANG_CONFIG = {
  python: {
    goldenDir: 'todo',
    files: ['models.py', 'schemas.py', 'main.py', 'test_main.py'],
    required: ['models.py', 'schemas.py', 'main.py', 'test_main.py'],
    dockerImage: 'kipina-pytest',
  },
  rust: {
    goldenDir: 'todo-rs',
    files: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'],
    required: ['Cargo.toml', 'src/models.rs', 'src/handlers.rs', 'src/lib.rs', 'src/main.rs', 'tests/api_test.rs'],
    dockerImage: 'kipina-cargo-test',
  },
};

// Settings for the selected language; unknown --lang values fall back to Python.
// The own-property check keeps inherited keys (e.g. --lang constructor) from
// resolving to an Object.prototype member instead of a real config entry.
const LCONF = Object.prototype.hasOwnProperty.call(LANG_CONFIG, LANG)
  ? LANG_CONFIG[LANG]
  : LANG_CONFIG.python;
|
||||||
|
|
||||||
function loadGoldenExample() {
|
function loadGoldenExample() {
|
||||||
const todoDir = join(GOLDEN_DIR, 'todo');
|
const todoDir = join(GOLDEN_DIR, LCONF.goldenDir);
|
||||||
if (!existsSync(todoDir)) return '';
|
if (!existsSync(todoDir)) return '';
|
||||||
let example = '\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n';
|
let example = `\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n`;
|
||||||
for (const f of GOLDEN_PY_FILES) {
|
for (const f of LCONF.files) {
|
||||||
const path = join(todoDir, f);
|
const path = join(todoDir, f);
|
||||||
if (existsSync(path)) example += `=== ${f} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`;
|
if (existsSync(path)) example += `=== ${f} ===\n${readFileSync(path, 'utf-8').trim()}\n\n`;
|
||||||
}
|
}
|
||||||
@@ -129,14 +144,40 @@ async function ollamaListModels() {
|
|||||||
return (data.models || []).map(m => m.name);
|
return (data.models || []).map(m => m.name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// === Testitulosten parsinta (pytest + cargo test) ===
|
||||||
|
/**
 * Extracts pass/fail counts from the captured output of a test run.
 *
 * Three output shapes are recognised, in this order:
 *   1. cargo test summaries ("test result: ok. 10 passed; 0 failed;") —
 *      every summary line is summed, because cargo prints one per test binary.
 *   2. pytest summaries ("6 passed", "2 failed", "1 error") — errors are
 *      counted as failures, but only when a passed/failed count is present.
 *   3. Rust compilation errors — each "error[E....]" counts as one failure.
 *
 * @param {string} output - Combined stdout/stderr text of the test command.
 * @returns {{testsPassed: number, testsFailed: number, testsTotal: number}}
 *   All zeros when nothing recognisable is found.
 */
function parseTestOutput(output) {
  const text = String(output ?? '');

  // Cargo must be checked before pytest: the pytest patterns below also match
  // the "N passed; M failed" part of a cargo summary, and would only ever see
  // the first summary line (often the empty "0 passed; 0 failed" of a target
  // without unit tests).
  const cargoSummaries = [...text.matchAll(/test result: \w+\.\s*(\d+) passed;\s*(\d+) failed/g)];
  if (cargoSummaries.length > 0) {
    let passed = 0;
    let failed = 0;
    for (const [, passedText, failedText] of cargoSummaries) {
      passed += Number.parseInt(passedText, 10);
      failed += Number.parseInt(failedText, 10);
    }
    return { testsPassed: passed, testsFailed: failed, testsTotal: passed + failed };
  }

  // Pytest: "6 passed", "2 failed", "1 error"
  const pyPassed = text.match(/(\d+) passed/);
  const pyFailed = text.match(/(\d+) failed/);
  const pyError = text.match(/(\d+) error/);
  if (pyPassed || pyFailed) {
    const passed = pyPassed ? Number.parseInt(pyPassed[1], 10) : 0;
    const failed = (pyFailed ? Number.parseInt(pyFailed[1], 10) : 0)
      + (pyError ? Number.parseInt(pyError[1], 10) : 0);
    return { testsPassed: passed, testsFailed: failed, testsTotal: passed + failed };
  }

  // Cargo compilation error: no tests ran, so report each "error[E....]" as a failure.
  const compileErrors = (text.match(/error\[E\d+\]/g) || []).length;
  if (compileErrors > 0) {
    return { testsPassed: 0, testsFailed: compileErrors, testsTotal: compileErrors };
  }

  return { testsPassed: 0, testsFailed: 0, testsTotal: 0 };
}
|
||||||
|
|
||||||
// === Tiedostoparseri LLM-vastauksesta ===
|
// === Tiedostoparseri LLM-vastauksesta ===
|
||||||
function parseGeneratedFiles(text) {
|
function parseGeneratedFiles(text) {
|
||||||
const files = {};
|
const files = {};
|
||||||
const sections = text.split(/===\s*(\S+\.(?:py|toml))\s*===/);
|
const sections = text.split(/===\s*(\S+\.(?:py|toml|rs))\s*===/);
|
||||||
for (let i = 1; i < sections.length - 1; i += 2) {
|
for (let i = 1; i < sections.length - 1; i += 2) {
|
||||||
const name = sections[i];
|
const name = sections[i];
|
||||||
let content = sections[i + 1].trim();
|
let content = sections[i + 1].trim();
|
||||||
content = content.replace(/^```(?:python|toml)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim();
|
content = content.replace(/^```(?:python|toml|rust)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim();
|
||||||
if (content) files[name] = content + '\n';
|
if (content) files[name] = content + '\n';
|
||||||
}
|
}
|
||||||
return files;
|
return files;
|
||||||
@@ -251,72 +292,70 @@ async function runPipeline(model, scenario) {
|
|||||||
|
|
||||||
// 3. LLM-koodigenerointi
|
// 3. LLM-koodigenerointi
|
||||||
console.log(` [3/5] Koodigenerointi (LLM)...`);
|
console.log(` [3/5] Koodigenerointi (LLM)...`);
|
||||||
const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all 4 files. Follow the reference implementation patterns exactly.`;
|
const fileCount = LCONF.required.length;
|
||||||
|
const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`;
|
||||||
result.promptChars = CODE_SYSTEM.length + codePrompt.length;
|
result.promptChars = CODE_SYSTEM.length + codePrompt.length;
|
||||||
result.promptTokensEst = Math.round(result.promptChars / 4);
|
result.promptTokensEst = Math.round(result.promptChars / 4);
|
||||||
const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, 8192);
|
const codeTokens = LANG === 'rust' ? 12288 : 8192;
|
||||||
|
const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, codeTokens);
|
||||||
timings.push(codeResp);
|
timings.push(codeResp);
|
||||||
writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
|
writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);
|
||||||
const files = parseGeneratedFiles(codeResp.text);
|
const files = parseGeneratedFiles(codeResp.text);
|
||||||
const required = ['models.py', 'schemas.py', 'main.py', 'test_main.py'];
|
const missing = LCONF.required.filter(f => !files[f]);
|
||||||
const missing = required.filter(f => !files[f]);
|
|
||||||
if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; }
|
if (missing.length > 0) { result.error = `Puuttuvat: ${missing.join(', ')}`; return result; }
|
||||||
|
|
||||||
// 4. Validointi + korjaussilmukka
|
// 4. Validointi + korjaussilmukka (Python-spesifi)
|
||||||
let issues = validateProjectCode(files);
|
|
||||||
let fixRound = 0;
|
let fixRound = 0;
|
||||||
while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
|
if (LANG === 'python') {
|
||||||
fixRound++;
|
let issues = validateProjectCode(files);
|
||||||
console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
|
while (issues.length > 0 && fixRound < MAX_FIX_ROUNDS) {
|
||||||
const issuesByFile = {};
|
fixRound++;
|
||||||
for (const issue of issues) {
|
console.log(` [4/5] Korjauskierros ${fixRound} (${issues.length} ongelmaa)...`);
|
||||||
const m = issue.match(/^ISSUE:\s*(\S+?):/);
|
const issuesByFile = {};
|
||||||
const fname = m ? m[1] : 'unknown';
|
for (const issue of issues) {
|
||||||
if (!issuesByFile[fname]) issuesByFile[fname] = [];
|
const m = issue.match(/^ISSUE:\s*(\S+?):/);
|
||||||
issuesByFile[fname].push(issue);
|
const fname = m ? m[1] : 'unknown';
|
||||||
}
|
if (!issuesByFile[fname]) issuesByFile[fname] = [];
|
||||||
for (const [fname, fIssues] of Object.entries(issuesByFile)) {
|
issuesByFile[fname].push(issue);
|
||||||
if (!files[fname]) continue;
|
|
||||||
const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
|
|
||||||
const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
|
|
||||||
timings.push(fixResp);
|
|
||||||
if (fixResp.text) {
|
|
||||||
files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
|
|
||||||
}
|
}
|
||||||
|
for (const [fname, fIssues] of Object.entries(issuesByFile)) {
|
||||||
|
if (!files[fname]) continue;
|
||||||
|
const fixPrompt = `Fix the following issues in this Python file. Return ONLY the complete corrected file, no explanations.\n\nISSUES:\n${fIssues.join('\n')}\n\nCURRENT FILE (${fname}):\n\`\`\`python\n${files[fname]}\`\`\``;
|
||||||
|
const fixResp = await ollamaChat(model, fixPrompt, FIX_SYSTEM, 2048);
|
||||||
|
timings.push(fixResp);
|
||||||
|
if (fixResp.text) {
|
||||||
|
files[fname] = fixResp.text.replace(/^```(?:python)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim() + '\n';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
issues = validateProjectCode(files);
|
||||||
}
|
}
|
||||||
issues = validateProjectCode(files);
|
result.validationIssues = issues.length;
|
||||||
}
|
}
|
||||||
result.validationIssues = issues.length;
|
|
||||||
result.fixRounds = fixRound;
|
result.fixRounds = fixRound;
|
||||||
|
|
||||||
// Kirjoita LLM:n generoimat Python-tiedostot
|
// Kirjoita LLM:n generoimat tiedostot (luo src/ ja tests/ alihakemistot tarvittaessa)
|
||||||
for (const [fn, content] of Object.entries(files)) {
|
for (const [fn, content] of Object.entries(files)) {
|
||||||
if (fn.endsWith('.py')) writeFileSync(`${dir}/${fn}`, content);
|
const filePath = join(dir, fn);
|
||||||
|
mkdirSync(dirname(filePath), { recursive: true });
|
||||||
|
writeFileSync(filePath, content);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 5. Pytest Docker-kontissa (kipina-pytest image)
|
// 5. Testit Docker-kontissa
|
||||||
console.log(` [5/5] Pytest (Docker)...`);
|
const testLabel = LANG === 'rust' ? 'Cargo test (Docker)' : 'Pytest (Docker)';
|
||||||
|
console.log(` [5/5] ${testLabel}...`);
|
||||||
|
const dockerTimeout = LANG === 'rust' ? 300000 : 120000;
|
||||||
try {
|
try {
|
||||||
const pytestOut = execSync(
|
const testOut = execSync(
|
||||||
`docker run --rm -v "${dir}:/src:ro" kipina-pytest 2>&1`,
|
`docker run --rm -v "${dir}:/src:ro" ${LCONF.dockerImage} 2>&1`,
|
||||||
{ timeout: 120000, encoding: 'utf-8' }
|
{ timeout: dockerTimeout, encoding: 'utf-8' }
|
||||||
);
|
);
|
||||||
writeFileSync(`${dir}/_pytest.txt`, pytestOut);
|
writeFileSync(`${dir}/_testout.txt`, testOut);
|
||||||
const passedMatch = pytestOut.match(/(\d+) passed/);
|
Object.assign(result, parseTestOutput(testOut));
|
||||||
const failedMatch = pytestOut.match(/(\d+) failed/);
|
|
||||||
result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0;
|
|
||||||
result.testsFailed = failedMatch ? parseInt(failedMatch[1]) : 0;
|
|
||||||
result.testsTotal = result.testsPassed + result.testsFailed;
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
const output = e.stdout || e.stderr || e.message || '';
|
const output = e.stdout || e.stderr || e.message || '';
|
||||||
writeFileSync(`${dir}/_pytest.txt`, output);
|
writeFileSync(`${dir}/_testout.txt`, output);
|
||||||
const passedMatch = output.match(/(\d+) passed/);
|
Object.assign(result, parseTestOutput(output));
|
||||||
const failedMatch = output.match(/(\d+) failed/);
|
if (result.testsTotal === 0) result.error = 'Testit kaatuivat';
|
||||||
const errorMatch = output.match(/(\d+) error/);
|
|
||||||
result.testsPassed = passedMatch ? parseInt(passedMatch[1]) : 0;
|
|
||||||
result.testsFailed = (failedMatch ? parseInt(failedMatch[1]) : 0) + (errorMatch ? parseInt(errorMatch[1]) : 0);
|
|
||||||
result.testsTotal = result.testsPassed + result.testsFailed;
|
|
||||||
if (result.testsTotal === 0) result.error = 'Pytest kaatui';
|
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
result.error = e.message;
|
result.error = e.message;
|
||||||
@@ -337,7 +376,7 @@ async function main() {
|
|||||||
console.log('╔══════════════════════════════════════════════╗');
|
console.log('╔══════════════════════════════════════════════╗');
|
||||||
console.log('║ Kipinä CodeBench ║');
|
console.log('║ Kipinä CodeBench ║');
|
||||||
console.log('╚══════════════════════════════════════════════╝');
|
console.log('╚══════════════════════════════════════════════╝');
|
||||||
console.log(`Ollama: ${OLLAMA_URL}${THINK_MODE ? ' 🧠 thinking ON (3× tokens)' : ''}`);
|
console.log(`Ollama: ${OLLAMA_URL} 📝 ${LANG}${THINK_MODE ? ' 🧠 thinking ON' : ''}`);
|
||||||
|
|
||||||
// Haetaan mallit
|
// Haetaan mallit
|
||||||
let models;
|
let models;
|
||||||
|
|||||||
1
kipina-codebench/golden-examples/todo-rs/.gitignore
vendored
Normal file
1
kipina-codebench/golden-examples/todo-rs/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
target/
|
||||||
46
kipina-codebench/prompts/code-rs.md
Normal file
46
kipina-codebench/prompts/code-rs.md
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
You are a Rust backend developer. Generate an Axum web project with SQLx and SQLite.
|
||||||
|
|
||||||
|
Given the project requirements, JSON specification, and a REFERENCE IMPLEMENTATION, generate these files:
|
||||||
|
|
||||||
|
1. Cargo.toml — axum 0.8, tokio, serde/serde_json, sqlx (sqlite, runtime-tokio), tower-http
|
||||||
|
2. src/models.rs — Structs with Serialize, Deserialize, FromRow derives
|
||||||
|
3. src/handlers.rs — Async handler functions for each CRUD endpoint
|
||||||
|
4. src/lib.rs — Public app() function returning Router, init_db() for table creation
|
||||||
|
5. src/main.rs — Binary entry point, connect to SQLite, bind to port
|
||||||
|
6. tests/api_test.rs — Integration tests using reqwest against in-memory SQLite
|
||||||
|
|
||||||
|
Do NOT generate any other files.
|
||||||
|
|
||||||
|
OUTPUT FORMAT — use these exact markers to separate files:
|
||||||
|
|
||||||
|
=== Cargo.toml ===
|
||||||
|
<toml content>
|
||||||
|
|
||||||
|
=== src/models.rs ===
|
||||||
|
<rust code>
|
||||||
|
|
||||||
|
=== src/handlers.rs ===
|
||||||
|
<rust code>
|
||||||
|
|
||||||
|
=== src/lib.rs ===
|
||||||
|
<rust code>
|
||||||
|
|
||||||
|
=== src/main.rs ===
|
||||||
|
<rust code>
|
||||||
|
|
||||||
|
=== tests/api_test.rs ===
|
||||||
|
<rust code>
|
||||||
|
|
||||||
|
DOCUMENTATION — every Rust source file (.rs) starts with a //! one-line module doc; Cargo.toml gets no doc comment. Structs get a /// one-line doc. Zensical: say what it IS, not what it does.
|
||||||
|
|
||||||
|
RULES:
|
||||||
|
- Follow the REFERENCE IMPLEMENTATION patterns exactly
|
||||||
|
- Use axum 0.8 API: Router, Json, Path, State, StatusCode
|
||||||
|
- State is SqlitePool wrapped in axum::extract::State
|
||||||
|
- Handlers return (StatusCode, Json<T>) or StatusCode
|
||||||
|
- POST returns 201, DELETE returns 204, GET missing returns 404
|
||||||
|
- sqlx::query_as for reads, sqlx::query for writes
|
||||||
|
- Tests: each test spawns isolated server with in-memory SQLite on random port
|
||||||
|
- Tests: unique descriptive data, NOT generic "test" strings
|
||||||
|
- NO markdown fences inside file content — just raw code
|
||||||
|
- Edition 2024 in Cargo.toml
|
||||||
183
kipina-codebench/results/2026-04-14T06-49.html
Normal file
183
kipina-codebench/results/2026-04-14T06-49.html
Normal file
File diff suppressed because one or more lines are too long
422
kipina-codebench/results/2026-04-14T06-49.json
Normal file
422
kipina-codebench/results/2026-04-14T06-49.json
Normal file
@@ -0,0 +1,422 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"model": "qwen3.5:9b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 3,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 65901,
|
||||||
|
"totalTokens": 5056,
|
||||||
|
"avgTokPerSec": 82.99139473832963,
|
||||||
|
"promptChars": 12334,
|
||||||
|
"promptTokensEst": 3084,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3.5:9b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 1,
|
||||||
|
"fixRounds": 2,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 74087,
|
||||||
|
"totalTokens": 5645,
|
||||||
|
"avgTokPerSec": 83.57073831360164,
|
||||||
|
"promptChars": 10757,
|
||||||
|
"promptTokensEst": 2689,
|
||||||
|
"score": 20,
|
||||||
|
"stars": "★☆☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3.5:9b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 49830,
|
||||||
|
"totalTokens": 3803,
|
||||||
|
"avgTokPerSec": 83.26266260763309,
|
||||||
|
"promptChars": 10826,
|
||||||
|
"promptTokensEst": 2707,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "gemma4:e4b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 57032,
|
||||||
|
"totalTokens": 4924,
|
||||||
|
"avgTokPerSec": 106.02334905805122,
|
||||||
|
"promptChars": 11313,
|
||||||
|
"promptTokensEst": 2828,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "gemma4:e4b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 7,
|
||||||
|
"testsPassed": 5,
|
||||||
|
"testsFailed": 2,
|
||||||
|
"totalDurationMs": 54307,
|
||||||
|
"totalTokens": 5060,
|
||||||
|
"avgTokPerSec": 106.89447491163497,
|
||||||
|
"promptChars": 11225,
|
||||||
|
"promptTokensEst": 2806,
|
||||||
|
"score": 83,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "gemma4:e4b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 11,
|
||||||
|
"testsPassed": 2,
|
||||||
|
"testsFailed": 9,
|
||||||
|
"totalDurationMs": 57080,
|
||||||
|
"totalTokens": 5310,
|
||||||
|
"avgTokPerSec": 106.64914988130955,
|
||||||
|
"promptChars": 11791,
|
||||||
|
"promptTokensEst": 2948,
|
||||||
|
"score": 51,
|
||||||
|
"stars": "★★★☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen2.5-coder:3b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 3,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 22377,
|
||||||
|
"totalTokens": 3534,
|
||||||
|
"avgTokPerSec": 201.24475679283708,
|
||||||
|
"promptChars": 11479,
|
||||||
|
"promptTokensEst": 2870,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen2.5-coder:3b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 8,
|
||||||
|
"fixRounds": 2,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 44520,
|
||||||
|
"totalTokens": 7495,
|
||||||
|
"avgTokPerSec": 201.87149050701015,
|
||||||
|
"promptChars": 11886,
|
||||||
|
"promptTokensEst": 2972,
|
||||||
|
"score": 20,
|
||||||
|
"stars": "★☆☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen2.5-coder:3b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 20136,
|
||||||
|
"totalTokens": 3338,
|
||||||
|
"avgTokPerSec": 200.86152095722105,
|
||||||
|
"promptChars": 11228,
|
||||||
|
"promptTokensEst": 2807,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen2.5-coder:7b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": false,
|
||||||
|
"specEntities": 0,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 0,
|
||||||
|
"totalTokens": 0,
|
||||||
|
"avgTokPerSec": 0,
|
||||||
|
"promptChars": 0,
|
||||||
|
"promptTokensEst": 0,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "",
|
||||||
|
"error": "JSON-speksi epäonnistui"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen2.5-coder:7b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 20012,
|
||||||
|
"totalTokens": 2119,
|
||||||
|
"avgTokPerSec": 122.7557304112134,
|
||||||
|
"promptChars": 10342,
|
||||||
|
"promptTokensEst": 2586,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen2.5-coder:7b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 26133,
|
||||||
|
"totalTokens": 2715,
|
||||||
|
"avgTokPerSec": 121.94987205993503,
|
||||||
|
"promptChars": 11193,
|
||||||
|
"promptTokensEst": 2798,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 44757,
|
||||||
|
"totalTokens": 2156,
|
||||||
|
"avgTokPerSec": 60.77636586631207,
|
||||||
|
"promptChars": 9635,
|
||||||
|
"promptTokensEst": 2409,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 7,
|
||||||
|
"testsPassed": 7,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 41166,
|
||||||
|
"totalTokens": 2282,
|
||||||
|
"avgTokPerSec": 61.14821289733007,
|
||||||
|
"promptChars": 9575,
|
||||||
|
"promptTokensEst": 2394,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 12,
|
||||||
|
"testsPassed": 12,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 66478,
|
||||||
|
"totalTokens": 3681,
|
||||||
|
"avgTokPerSec": 60.493817783668725,
|
||||||
|
"promptChars": 10500,
|
||||||
|
"promptTokensEst": 2625,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 7,
|
||||||
|
"testsPassed": 7,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 29801,
|
||||||
|
"totalTokens": 2249,
|
||||||
|
"avgTokPerSec": 98.5661742189331,
|
||||||
|
"promptChars": 9615,
|
||||||
|
"promptTokensEst": 2404,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 8,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 2,
|
||||||
|
"totalDurationMs": 22974,
|
||||||
|
"totalTokens": 2050,
|
||||||
|
"avgTokPerSec": 101.2398768597589,
|
||||||
|
"promptChars": 9273,
|
||||||
|
"promptTokensEst": 2318,
|
||||||
|
"score": 85,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 12,
|
||||||
|
"testsPassed": 12,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 39335,
|
||||||
|
"totalTokens": 3537,
|
||||||
|
"avgTokPerSec": 100.10984073540648,
|
||||||
|
"promptChars": 10525,
|
||||||
|
"promptTokensEst": 2631,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:4b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 58668,
|
||||||
|
"totalTokens": 7134,
|
||||||
|
"avgTokPerSec": 141.76822189196028,
|
||||||
|
"promptChars": 15202,
|
||||||
|
"promptTokensEst": 3801,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:4b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": false,
|
||||||
|
"specEntities": 0,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 0,
|
||||||
|
"totalTokens": 0,
|
||||||
|
"avgTokPerSec": 0,
|
||||||
|
"promptChars": 0,
|
||||||
|
"promptTokensEst": 0,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "",
|
||||||
|
"error": "JSON-speksi epäonnistui"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:4b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": false,
|
||||||
|
"specEntities": 0,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 0,
|
||||||
|
"totalTokens": 0,
|
||||||
|
"avgTokPerSec": 0,
|
||||||
|
"promptChars": 0,
|
||||||
|
"promptTokensEst": 0,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "",
|
||||||
|
"error": "JSON-speksi epäonnistui"
|
||||||
|
}
|
||||||
|
]
|
||||||
183
kipina-codebench/results/2026-04-14T07-13.html
Normal file
183
kipina-codebench/results/2026-04-14T07-13.html
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="fi">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Kipina Model Benchmark</title>
|
||||||
|
<style>
|
||||||
|
:root { --bg: #0d1117; --card: #161b22; --border: #30363d; --text: #e6edf3; --dim: #8b949e; --green: #3fb950; --yellow: #d29922; --red: #f85149; --blue: #58a6ff; }
|
||||||
|
* { box-sizing: border-box; margin: 0; padding: 0; }
|
||||||
|
body { font-family: -apple-system, 'Segoe UI', Helvetica, Arial, sans-serif; background: var(--bg); color: var(--text); padding: 2rem; max-width: 1400px; margin: 0 auto; }
|
||||||
|
h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
|
||||||
|
.meta { color: var(--dim); font-size: 0.85rem; margin-bottom: 2rem; }
|
||||||
|
.cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
|
||||||
|
.card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 1rem; }
|
||||||
|
.card .label { color: var(--dim); font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
|
||||||
|
.card .value { font-size: 1.8rem; font-weight: 600; margin-top: 0.25rem; }
|
||||||
|
.card .sub { color: var(--dim); font-size: 0.8rem; margin-top: 0.25rem; }
|
||||||
|
table { width: 100%; border-collapse: collapse; background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; margin-bottom: 2rem; }
|
||||||
|
th { background: #1c2128; text-align: left; padding: 0.6rem 0.8rem; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--dim); cursor: pointer; user-select: none; white-space: nowrap; }
|
||||||
|
th:hover { color: var(--text); }
|
||||||
|
th.sorted-asc::after { content: ' ▲'; }
|
||||||
|
th.sorted-desc::after { content: ' ▼'; }
|
||||||
|
td { padding: 0.5rem 0.8rem; border-top: 1px solid var(--border); font-size: 0.85rem; white-space: nowrap; }
|
||||||
|
tr:hover td { background: #1c2128; }
|
||||||
|
.pass { color: var(--green); }
|
||||||
|
.partial { color: var(--yellow); }
|
||||||
|
.fail { color: var(--red); }
|
||||||
|
.stars { letter-spacing: 1px; }
|
||||||
|
.bar { display: inline-block; height: 8px; border-radius: 4px; vertical-align: middle; }
|
||||||
|
.bar-bg { background: var(--border); }
|
||||||
|
.bar-fill { background: var(--green); }
|
||||||
|
.bar-partial { background: var(--yellow); }
|
||||||
|
.model-name { font-weight: 600; }
|
||||||
|
h2 { font-size: 1.1rem; margin-bottom: 1rem; color: var(--dim); }
|
||||||
|
.summary-table th:first-child, .summary-table td:first-child { min-width: 200px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>Kipina Model Benchmark</h1>
|
||||||
|
<div class="meta" id="meta"></div>
|
||||||
|
|
||||||
|
<div class="cards" id="cards"></div>
|
||||||
|
|
||||||
|
<h2>Mallikohtainen yhteenveto</h2>
|
||||||
|
<table class="summary-table" id="summary-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<h2>Kaikki tulokset</h2>
|
||||||
|
<table id="results-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
const RAW = [{"model":"qwen3:14b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":186642,"totalTokens":10237,"avgTokPerSec":59.06411550065281,"promptChars":10576,"promptTokensEst":2644,"score":40,"stars":"★★☆☆☆","error":null},{"model":"qwen3:14b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":121848,"totalTokens":6735,"avgTokPerSec":59.85231850668119,"promptChars":9684,"promptTokensEst":2421,"score":40,"stars":"★★☆☆☆","error":null},{"model":"qwen3:14b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":11,"testsPassed":9,"testsFailed":2,"totalDurationMs":83491,"totalTokens":4677,"avgTokPerSec":60.222832434869694,"promptChars":10423,"promptTokensEst":2606,"score":89,"stars":"★★★★☆","error":null},{"model":"qwen3:8b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":6,"testsPassed":6,"testsFailed":0,"totalDurationMs":56288,"totalTokens":5235,"avgTokPerSec":99.60027546406452,"promptChars":9307,"promptTokensEst":2327,"score":100,"stars":"★★★★★","error":null},{"model":"qwen3:8b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":6,"testsPassed":5,"testsFailed":1,"totalDurationMs":59639,"totalTokens":5526,"avgTokPerSec":99.6742208632186,"promptChars":9158,"promptTokensEst":2290,"score":90,"stars":"★★★★★","error":null},{"model":"qwen3:8b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":11,"testsPassed":10,"testsFailed":1,"totalDurationMs":131793,"totalTokens":11779,"avgTokPerSec":97.17878362853351,"promptChars":10390,"promptTokensEst":2598,"score":95,"stars":"★★★★★","error":null}];
|
||||||
|
|
||||||
|
// Convert a numeric score into a five-character star rating string.
// Thresholds: 90+ five stars, 70+ four, 50+ three, 25+ two, above 0 one, otherwise none.
const starsFor = (score) => {
  if (score >= 90) return '★★★★★';
  if (score >= 70) return '★★★★☆';
  if (score >= 50) return '★★★☆☆';
  if (score >= 25) return '★★☆☆☆';
  if (score > 0) return '★☆☆☆☆';
  return '☆☆☆☆☆';
};
|
||||||
|
/**
 * Recompute the score for a result row that carries no stored score.
 *
 * Points: 10 if the spec was accepted, 10 if the run produced no error (or
 * still executed tests), up to 60 for the share of passing tests, and 20
 * minus 10 per fix round (never below 0). The total is capped at 100.
 *
 * @param {object} r - Result row; reads error, specOk, testsTotal, testsPassed and fixRounds.
 * @returns {number} Score capped at 100; 0 for an errored run that executed no tests.
 */
function calcScore(r) {
  // Missing or non-numeric counters count as 0 so the score can never be NaN.
  const total = Number(r.testsTotal) || 0;
  const passed = Number(r.testsPassed) || 0;

  if (r.error && total === 0) return 0;

  let s = 0;
  if (r.specOk) s += 10;
  if (!r.error || total > 0) s += 10;
  if (total > 0) s += Math.round((passed / total) * 60);
  s += Math.max(0, 20 - (r.fixRounds || 0) * 10);
  return Math.min(100, s);
}
|
||||||
|
// Build DATA from the embedded RAW rows: fill in score, stars and the
// prompt-token estimate when a row lacks them. Each row is copied first so
// the RAW objects themselves are left untouched.
const DATA = RAW.map((raw) => {
  const row = {
    ...raw,
    // Counters and timings default to 0 so the sums and toFixed() calls that
    // consume DATA never operate on undefined/null.
    testsTotal: raw.testsTotal ?? 0,
    testsPassed: raw.testsPassed ?? 0,
    fixRounds: raw.fixRounds ?? 0,
    totalTokens: raw.totalTokens ?? 0,
    totalDurationMs: raw.totalDurationMs ?? 0,
    avgTokPerSec: raw.avgTokPerSec ?? 0,
  };
  if (row.score == null) row.score = calcScore(row);
  if (!row.stars) row.stars = starsFor(row.score);
  // Rough estimate: four characters per token.
  if (!row.promptTokensEst) row.promptTokensEst = row.promptChars ? Math.round(row.promptChars / 4) : 0;
  return row;
});
|
||||||
|
// CSS class for a result row: 'pass' when every test passed without an error,
// 'partial' when at least one test passed, 'fail' otherwise.
const cls = (r) => {
  const ranTests = r.testsTotal > 0;
  if (ranTests && !r.error && r.testsPassed === r.testsTotal) return 'pass';
  if (ranTests && r.testsPassed > 0) return 'partial';
  return 'fail';
};
|
||||||
|
/**
 * Render a small progress bar followed by a "passed/total" label.
 *
 * @param {number} passed - Number of passing tests.
 * @param {number} total - Number of tests run.
 * @param {number} [w=80] - Width of the bar background in pixels.
 * @returns {string} HTML snippet, or '-' when there is no positive test count.
 */
const pctBar = (passed, total, w = 80) => {
  // Covers 0 as well as undefined/null/NaN, which would otherwise render NaN widths.
  if (!(total > 0)) return '-';
  const done = Number(passed) || 0;
  // Clamp so inconsistent counts cannot draw the fill outside the background.
  const pct = Math.min(100, Math.max(0, (done / total) * 100));
  const c = pct === 100 ? 'bar-fill' : 'bar-partial';
  return `<span class="bar bar-bg" style="width:${w}px"><span class="bar ${c}" style="width:${Math.round(pct / 100 * w)}px"></span></span> ${done}/${total}`;
};
|
||||||
|
|
||||||
|
// Escape a value for interpolation into the innerHTML templates below.
const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

// Meta line: date, number of runs and summed run duration in minutes.
// NOTE(review): the date comes from new Date(), i.e. the moment the page is
// opened, not from the result rows.
const totalTime = DATA.reduce((acc, r) => acc + r.totalDurationMs, 0);
document.getElementById('meta').textContent = `${new Date().toLocaleDateString('fi-FI')} — ${DATA.length} ajoa — ${(totalTime/1000/60).toFixed(1)} min`;

// Cards
const models = [...new Set(DATA.map((r) => r.model))];
const scenarios = [...new Set(DATA.map((r) => r.scenario))];
const avgScore = DATA.length ? Math.round(DATA.reduce((acc, r) => acc + r.score, 0) / DATA.length) : 0;
const totalPassed = DATA.reduce((acc, r) => acc + r.testsPassed, 0);
const totalTests = DATA.reduce((acc, r) => acc + r.testsTotal, 0);
const passRate = totalTests ? Math.round(totalPassed / totalTests * 100) : 0;

// Per-model aggregates, computed once and shared by the cards and the summary table.
const modelStats = models.map((model) => {
  const runs = DATA.filter((r) => r.model === model);
  const sum = (pick) => runs.reduce((acc, r) => acc + pick(r), 0);
  return {
    model,
    runs,
    passed: sum((r) => r.testsPassed),
    total: sum((r) => r.testsTotal),
    tokens: sum((r) => r.totalTokens),
    timeMs: sum((r) => r.totalDurationMs),
    speed: Math.round(sum((r) => r.avgTokPerSec) / runs.length),
    avg: Math.round(sum((r) => r.score) / runs.length),
  };
});
// Sort copies so modelStats keeps the order in which models first appear in DATA.
const bestModel = [...modelStats].sort((a, b) => b.avg - a.avg)[0];
const fastestModel = [...modelStats].sort((a, b) => b.speed - a.speed)[0];

document.getElementById('cards').innerHTML = `
  <div class="card"><div class="label">Keskiarvo</div><div class="value">${starsFor(avgScore)}</div><div class="sub">${avgScore} pistetta</div></div>
  <div class="card"><div class="label">Testien läpäisy</div><div class="value">${passRate}%</div><div class="sub">${totalPassed}/${totalTests} testiä</div></div>
  <div class="card"><div class="label">Paras malli</div><div class="value" style="font-size:1.2rem">${escapeHtml(bestModel?.model || '-')}</div><div class="sub">${bestModel?.avg || 0}p</div></div>
  <div class="card"><div class="label">Nopein</div><div class="value" style="font-size:1.2rem">${escapeHtml(fastestModel?.model || '-')}</div><div class="sub">${fastestModel?.speed || 0} tok/s</div></div>
  <div class="card"><div class="label">Malleja</div><div class="value">${models.length}</div><div class="sub">${scenarios.length} skenaariota</div></div>
  <div class="card"><div class="label">Kokonaisaika</div><div class="value">${(totalTime/1000/60).toFixed(1)}</div><div class="sub">minuuttia</div></div>
`;

// Summary table: one row per model, one column per scenario, best average first.
const sumHead = document.querySelector('#summary-table thead');
const sumBody = document.querySelector('#summary-table tbody');
sumHead.innerHTML = `<tr><th>Malli</th>${scenarios.map((s) => `<th>${escapeHtml(s)}</th>`).join('')}<th>Yht.</th><th>Out tok</th><th>Aika</th><th>tok/s</th><th>Pisteet</th></tr>`;

const modelRows = modelStats.map((stat) => {
  const scenCols = scenarios.map((s) => {
    // Only the first run of a model for a given scenario is shown.
    const r = stat.runs.find((run) => run.scenario === s);
    if (!r) return '<td>-</td>';
    return `<td class="${cls(r)}">${pctBar(r.testsPassed, r.testsTotal, 60)} <span style="color:var(--dim)">${(r.totalDurationMs/1000).toFixed(0)}s</span></td>`;
  }).join('');
  return {
    avg: stat.avg,
    html: `<tr><td class="model-name">${escapeHtml(stat.model)}</td>${scenCols}<td>${pctBar(stat.passed, stat.total)}</td><td>${(stat.tokens/1000).toFixed(1)}K</td><td>${(stat.timeMs/1000).toFixed(0)}s</td><td>${stat.speed}</td><td><span class="stars">${starsFor(stat.avg)}</span> ${stat.avg}p</td></tr>`,
  };
}).sort((a, b) => b.avg - a.avg);
sumBody.innerHTML = modelRows.map((row) => row.html).join('');
|
||||||
|
|
||||||
|
// Results table: header cells carry their column index in data-col so the
// click handler can tell which column to sort by.
const resHead = document.querySelector('#results-table thead');
const resBody = document.querySelector('#results-table tbody');
const resCols = [
  'Malli',
  'Skenaario',
  'Speksi',
  'Testit',
  'Korjaus',
  'Ctx',
  'Out tok',
  'Aika',
  'tok/s',
  'Pisteet',
];
resHead.innerHTML = `<tr>${resCols.map((label, index) => `<th data-col="${index}">${label}</th>`).join('')}</tr>`;

// Current sort state: column index 9 is the last column of resCols.
let sortCol = 9;
let sortAsc = false;
|
||||||
|
/**
 * Re-render the body of the results table, sorted by the current
 * sortCol/sortAsc state, and mark the active header cell with a
 * sorted-asc / sorted-desc class.
 */
function renderResults() {
  // Escape text before it is interpolated into innerHTML.
  const esc = (value) => String(value).replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

  // One sort-key extractor per column, in the same order as the header cells.
  const sortKeys = [
    (r) => r.model,
    (r) => r.scenario,
    (r) => r.specEntities,
    (r) => r.testsPassed / Math.max(r.testsTotal, 1),
    (r) => r.fixRounds,
    (r) => r.promptTokensEst,
    (r) => r.totalTokens,
    (r) => r.totalDurationMs,
    (r) => r.avgTokPerSec,
    (r) => r.score,
  ];
  const keyOf = sortKeys[sortCol];
  const direction = sortAsc ? 1 : -1;

  // Sort a copy so DATA keeps its original order.
  const sorted = [...DATA].sort((a, b) => {
    const x = keyOf(a);
    const y = keyOf(b);
    const cmp = typeof x === 'string' ? x.localeCompare(y) : x - y;
    return direction * cmp;
  });

  resBody.innerHTML = sorted.map((r) => {
    const c = cls(r);
    return `<tr>
      <td class="model-name">${esc(r.model)}</td>
      <td>${esc(r.scenario)}</td>
      <td>${r.specOk ? `✓ ${esc(r.specEntities)}e` : '<span class="fail">✗</span>'}</td>
      <td class="${c}">${pctBar(r.testsPassed, r.testsTotal)}</td>
      <td>${r.fixRounds > 0 ? r.fixRounds + '×' : '-'}</td>
      <td>${r.promptTokensEst > 0 ? '~' + (r.promptTokensEst / 1000).toFixed(1) + 'K' : '-'}</td>
      <td>${r.totalTokens > 0 ? (r.totalTokens / 1000).toFixed(1) + 'K' : '-'}</td>
      <td>${(r.totalDurationMs / 1000).toFixed(0)}s</td>
      <td>${Number(r.avgTokPerSec ?? 0).toFixed(0)}</td>
      <td><span class="stars">${esc(r.stars)}</span> ${esc(r.score)}p</td>
    </tr>`;
  }).join('');

  document.querySelectorAll('#results-table th').forEach((th, i) => {
    th.className = i === sortCol ? (sortAsc ? 'sorted-asc' : 'sorted-desc') : '';
  });
}
|
||||||
|
// Clicking a header cell sorts by that column; clicking the active column
// again flips the direction. A newly selected column starts descending.
document.querySelector('#results-table thead').addEventListener('click', (e) => {
  // Resolve the header cell even if the click landed on something nested inside it.
  const th = e.target.closest('th');
  const col = Number.parseInt(th?.dataset.col ?? '', 10);
  if (Number.isNaN(col)) return;

  if (sortCol === col) {
    sortAsc = !sortAsc;
  } else {
    sortCol = col;
    sortAsc = false;
  }
  renderResults();
});

// Initial render with the default sort state.
renderResults();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
122
kipina-codebench/results/2026-04-14T07-13.json
Normal file
122
kipina-codebench/results/2026-04-14T07-13.json
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 186642,
|
||||||
|
"totalTokens": 10237,
|
||||||
|
"avgTokPerSec": 59.06411550065281,
|
||||||
|
"promptChars": 10576,
|
||||||
|
"promptTokensEst": 2644,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 121848,
|
||||||
|
"totalTokens": 6735,
|
||||||
|
"avgTokPerSec": 59.85231850668119,
|
||||||
|
"promptChars": 9684,
|
||||||
|
"promptTokensEst": 2421,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 11,
|
||||||
|
"testsPassed": 9,
|
||||||
|
"testsFailed": 2,
|
||||||
|
"totalDurationMs": 83491,
|
||||||
|
"totalTokens": 4677,
|
||||||
|
"avgTokPerSec": 60.222832434869694,
|
||||||
|
"promptChars": 10423,
|
||||||
|
"promptTokensEst": 2606,
|
||||||
|
"score": 89,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 56288,
|
||||||
|
"totalTokens": 5235,
|
||||||
|
"avgTokPerSec": 99.60027546406452,
|
||||||
|
"promptChars": 9307,
|
||||||
|
"promptTokensEst": 2327,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 5,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 59639,
|
||||||
|
"totalTokens": 5526,
|
||||||
|
"avgTokPerSec": 99.6742208632186,
|
||||||
|
"promptChars": 9158,
|
||||||
|
"promptTokensEst": 2290,
|
||||||
|
"score": 90,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 11,
|
||||||
|
"testsPassed": 10,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 131793,
|
||||||
|
"totalTokens": 11779,
|
||||||
|
"avgTokPerSec": 97.17878362853351,
|
||||||
|
"promptChars": 10390,
|
||||||
|
"promptTokensEst": 2598,
|
||||||
|
"score": 95,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
}
|
||||||
|
]
|
||||||
183
kipina-codebench/results/2026-04-14T07-18.html
Normal file
183
kipina-codebench/results/2026-04-14T07-18.html
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="fi">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Kipina Model Benchmark</title>
|
||||||
|
<style>
|
||||||
|
:root { --bg: #0d1117; --card: #161b22; --border: #30363d; --text: #e6edf3; --dim: #8b949e; --green: #3fb950; --yellow: #d29922; --red: #f85149; --blue: #58a6ff; }
|
||||||
|
* { box-sizing: border-box; margin: 0; padding: 0; }
|
||||||
|
body { font-family: -apple-system, 'Segoe UI', Helvetica, Arial, sans-serif; background: var(--bg); color: var(--text); padding: 2rem; max-width: 1400px; margin: 0 auto; }
|
||||||
|
h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
|
||||||
|
.meta { color: var(--dim); font-size: 0.85rem; margin-bottom: 2rem; }
|
||||||
|
.cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
|
||||||
|
.card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 1rem; }
|
||||||
|
.card .label { color: var(--dim); font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
|
||||||
|
.card .value { font-size: 1.8rem; font-weight: 600; margin-top: 0.25rem; }
|
||||||
|
.card .sub { color: var(--dim); font-size: 0.8rem; margin-top: 0.25rem; }
|
||||||
|
table { width: 100%; border-collapse: collapse; background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; margin-bottom: 2rem; }
|
||||||
|
th { background: #1c2128; text-align: left; padding: 0.6rem 0.8rem; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--dim); cursor: pointer; user-select: none; white-space: nowrap; }
|
||||||
|
th:hover { color: var(--text); }
|
||||||
|
th.sorted-asc::after { content: ' ▲'; }
|
||||||
|
th.sorted-desc::after { content: ' ▼'; }
|
||||||
|
td { padding: 0.5rem 0.8rem; border-top: 1px solid var(--border); font-size: 0.85rem; white-space: nowrap; }
|
||||||
|
tr:hover td { background: #1c2128; }
|
||||||
|
.pass { color: var(--green); }
|
||||||
|
.partial { color: var(--yellow); }
|
||||||
|
.fail { color: var(--red); }
|
||||||
|
.stars { letter-spacing: 1px; }
|
||||||
|
.bar { display: inline-block; height: 8px; border-radius: 4px; vertical-align: middle; }
|
||||||
|
.bar-bg { background: var(--border); }
|
||||||
|
.bar-fill { background: var(--green); }
|
||||||
|
.bar-partial { background: var(--yellow); }
|
||||||
|
.model-name { font-weight: 600; }
|
||||||
|
h2 { font-size: 1.1rem; margin-bottom: 1rem; color: var(--dim); }
|
||||||
|
.summary-table th:first-child, .summary-table td:first-child { min-width: 200px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>Kipina Model Benchmark</h1>
|
||||||
|
<div class="meta" id="meta"></div>
|
||||||
|
|
||||||
|
<div class="cards" id="cards"></div>
|
||||||
|
|
||||||
|
<h2>Mallikohtainen yhteenveto</h2>
|
||||||
|
<table class="summary-table" id="summary-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<h2>Kaikki tulokset</h2>
|
||||||
|
<table id="results-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
const RAW = [{"model":"qwen3:14b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":66903,"totalTokens":5454,"avgTokPerSec":86.45918994499432,"promptChars":9985,"promptTokensEst":2496,"score":40,"stars":"★★☆☆☆","error":null},{"model":"qwen3:14b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":87618,"totalTokens":7150,"avgTokPerSec":87.21782190501095,"promptChars":9922,"promptTokensEst":2481,"score":40,"stars":"★★☆☆☆","error":null},{"model":"qwen3:14b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":9,"testsPassed":5,"testsFailed":4,"totalDurationMs":78398,"totalTokens":6427,"avgTokPerSec":85.52353711143463,"promptChars":10737,"promptTokensEst":2684,"score":73,"stars":"★★★★☆","error":null},{"model":"qwen3:8b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":8,"testsPassed":7,"testsFailed":1,"totalDurationMs":82750,"totalTokens":10054,"avgTokPerSec":139.90690936146032,"promptChars":9360,"promptTokensEst":2340,"score":93,"stars":"★★★★★","error":null},{"model":"qwen3:8b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":6,"testsPassed":6,"testsFailed":0,"totalDurationMs":32233,"totalTokens":4404,"avgTokPerSec":143.4997404058814,"promptChars":9310,"promptTokensEst":2328,"score":100,"stars":"★★★★★","error":null},{"model":"qwen3:8b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":0,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":88563,"totalTokens":11575,"avgTokPerSec":141.54675017528362,"promptChars":10567,"promptTokensEst":2642,"score":40,"stars":"★★☆☆☆","error":null}];
|
||||||
|
|
||||||
|
// Convert a numeric score into a five-character star rating string.
// Thresholds: 90+ five stars, 70+ four, 50+ three, 25+ two, above 0 one, otherwise none.
const starsFor = (score) => {
  if (score >= 90) return '★★★★★';
  if (score >= 70) return '★★★★☆';
  if (score >= 50) return '★★★☆☆';
  if (score >= 25) return '★★☆☆☆';
  if (score > 0) return '★☆☆☆☆';
  return '☆☆☆☆☆';
};
|
||||||
|
/**
 * Recompute the score for a result row that carries no stored score.
 *
 * Points: 10 if the spec was accepted, 10 if the run produced no error (or
 * still executed tests), up to 60 for the share of passing tests, and 20
 * minus 10 per fix round (never below 0). The total is capped at 100.
 *
 * @param {object} r - Result row; reads error, specOk, testsTotal, testsPassed and fixRounds.
 * @returns {number} Score capped at 100; 0 for an errored run that executed no tests.
 */
function calcScore(r) {
  // Missing or non-numeric counters count as 0 so the score can never be NaN.
  const total = Number(r.testsTotal) || 0;
  const passed = Number(r.testsPassed) || 0;

  if (r.error && total === 0) return 0;

  let s = 0;
  if (r.specOk) s += 10;
  if (!r.error || total > 0) s += 10;
  if (total > 0) s += Math.round((passed / total) * 60);
  s += Math.max(0, 20 - (r.fixRounds || 0) * 10);
  return Math.min(100, s);
}
|
||||||
|
// Build DATA from the embedded RAW rows: fill in score, stars and the
// prompt-token estimate when a row lacks them. Each row is copied first so
// the RAW objects themselves are left untouched.
const DATA = RAW.map((raw) => {
  const row = {
    ...raw,
    // Counters and timings default to 0 so the sums and toFixed() calls that
    // consume DATA never operate on undefined/null.
    testsTotal: raw.testsTotal ?? 0,
    testsPassed: raw.testsPassed ?? 0,
    fixRounds: raw.fixRounds ?? 0,
    totalTokens: raw.totalTokens ?? 0,
    totalDurationMs: raw.totalDurationMs ?? 0,
    avgTokPerSec: raw.avgTokPerSec ?? 0,
  };
  if (row.score == null) row.score = calcScore(row);
  if (!row.stars) row.stars = starsFor(row.score);
  // Rough estimate: four characters per token.
  if (!row.promptTokensEst) row.promptTokensEst = row.promptChars ? Math.round(row.promptChars / 4) : 0;
  return row;
});
|
||||||
|
// CSS class for a result row: 'pass' when every test passed without an error,
// 'partial' when at least one test passed, 'fail' otherwise.
const cls = (r) => {
  const ranTests = r.testsTotal > 0;
  if (ranTests && !r.error && r.testsPassed === r.testsTotal) return 'pass';
  if (ranTests && r.testsPassed > 0) return 'partial';
  return 'fail';
};
|
||||||
|
/**
 * Render a small progress bar followed by a "passed/total" label.
 *
 * @param {number} passed - Number of passing tests.
 * @param {number} total - Number of tests run.
 * @param {number} [w=80] - Width of the bar background in pixels.
 * @returns {string} HTML snippet, or '-' when there is no positive test count.
 */
const pctBar = (passed, total, w = 80) => {
  // Covers 0 as well as undefined/null/NaN, which would otherwise render NaN widths.
  if (!(total > 0)) return '-';
  const done = Number(passed) || 0;
  // Clamp so inconsistent counts cannot draw the fill outside the background.
  const pct = Math.min(100, Math.max(0, (done / total) * 100));
  const c = pct === 100 ? 'bar-fill' : 'bar-partial';
  return `<span class="bar bar-bg" style="width:${w}px"><span class="bar ${c}" style="width:${Math.round(pct / 100 * w)}px"></span></span> ${done}/${total}`;
};
|
||||||
|
|
||||||
|
// Escape a value for interpolation into the innerHTML templates below.
const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

// Meta line: date, number of runs and summed run duration in minutes.
// NOTE(review): the date comes from new Date(), i.e. the moment the page is
// opened, not from the result rows.
const totalTime = DATA.reduce((acc, r) => acc + r.totalDurationMs, 0);
document.getElementById('meta').textContent = `${new Date().toLocaleDateString('fi-FI')} — ${DATA.length} ajoa — ${(totalTime/1000/60).toFixed(1)} min`;

// Cards
const models = [...new Set(DATA.map((r) => r.model))];
const scenarios = [...new Set(DATA.map((r) => r.scenario))];
const avgScore = DATA.length ? Math.round(DATA.reduce((acc, r) => acc + r.score, 0) / DATA.length) : 0;
const totalPassed = DATA.reduce((acc, r) => acc + r.testsPassed, 0);
const totalTests = DATA.reduce((acc, r) => acc + r.testsTotal, 0);
const passRate = totalTests ? Math.round(totalPassed / totalTests * 100) : 0;

// Per-model aggregates, computed once and shared by the cards and the summary table.
const modelStats = models.map((model) => {
  const runs = DATA.filter((r) => r.model === model);
  const sum = (pick) => runs.reduce((acc, r) => acc + pick(r), 0);
  return {
    model,
    runs,
    passed: sum((r) => r.testsPassed),
    total: sum((r) => r.testsTotal),
    tokens: sum((r) => r.totalTokens),
    timeMs: sum((r) => r.totalDurationMs),
    speed: Math.round(sum((r) => r.avgTokPerSec) / runs.length),
    avg: Math.round(sum((r) => r.score) / runs.length),
  };
});
// Sort copies so modelStats keeps the order in which models first appear in DATA.
const bestModel = [...modelStats].sort((a, b) => b.avg - a.avg)[0];
const fastestModel = [...modelStats].sort((a, b) => b.speed - a.speed)[0];

document.getElementById('cards').innerHTML = `
  <div class="card"><div class="label">Keskiarvo</div><div class="value">${starsFor(avgScore)}</div><div class="sub">${avgScore} pistetta</div></div>
  <div class="card"><div class="label">Testien läpäisy</div><div class="value">${passRate}%</div><div class="sub">${totalPassed}/${totalTests} testiä</div></div>
  <div class="card"><div class="label">Paras malli</div><div class="value" style="font-size:1.2rem">${escapeHtml(bestModel?.model || '-')}</div><div class="sub">${bestModel?.avg || 0}p</div></div>
  <div class="card"><div class="label">Nopein</div><div class="value" style="font-size:1.2rem">${escapeHtml(fastestModel?.model || '-')}</div><div class="sub">${fastestModel?.speed || 0} tok/s</div></div>
  <div class="card"><div class="label">Malleja</div><div class="value">${models.length}</div><div class="sub">${scenarios.length} skenaariota</div></div>
  <div class="card"><div class="label">Kokonaisaika</div><div class="value">${(totalTime/1000/60).toFixed(1)}</div><div class="sub">minuuttia</div></div>
`;

// Summary table: one row per model, one column per scenario, best average first.
const sumHead = document.querySelector('#summary-table thead');
const sumBody = document.querySelector('#summary-table tbody');
sumHead.innerHTML = `<tr><th>Malli</th>${scenarios.map((s) => `<th>${escapeHtml(s)}</th>`).join('')}<th>Yht.</th><th>Out tok</th><th>Aika</th><th>tok/s</th><th>Pisteet</th></tr>`;

const modelRows = modelStats.map((stat) => {
  const scenCols = scenarios.map((s) => {
    // Only the first run of a model for a given scenario is shown.
    const r = stat.runs.find((run) => run.scenario === s);
    if (!r) return '<td>-</td>';
    return `<td class="${cls(r)}">${pctBar(r.testsPassed, r.testsTotal, 60)} <span style="color:var(--dim)">${(r.totalDurationMs/1000).toFixed(0)}s</span></td>`;
  }).join('');
  return {
    avg: stat.avg,
    html: `<tr><td class="model-name">${escapeHtml(stat.model)}</td>${scenCols}<td>${pctBar(stat.passed, stat.total)}</td><td>${(stat.tokens/1000).toFixed(1)}K</td><td>${(stat.timeMs/1000).toFixed(0)}s</td><td>${stat.speed}</td><td><span class="stars">${starsFor(stat.avg)}</span> ${stat.avg}p</td></tr>`,
  };
}).sort((a, b) => b.avg - a.avg);
sumBody.innerHTML = modelRows.map((row) => row.html).join('');
|
||||||
|
|
||||||
|
// Results table: header cells carry their column index in data-col so the
// click handler can tell which column to sort by.
const resHead = document.querySelector('#results-table thead');
const resBody = document.querySelector('#results-table tbody');
const resCols = [
  'Malli',
  'Skenaario',
  'Speksi',
  'Testit',
  'Korjaus',
  'Ctx',
  'Out tok',
  'Aika',
  'tok/s',
  'Pisteet',
];
resHead.innerHTML = `<tr>${resCols.map((label, index) => `<th data-col="${index}">${label}</th>`).join('')}</tr>`;

// Current sort state: column index 9 is the last column of resCols.
let sortCol = 9;
let sortAsc = false;
|
||||||
|
/**
 * Re-render the body of the results table, sorted by the current
 * sortCol/sortAsc state, and mark the active header cell with a
 * sorted-asc / sorted-desc class.
 */
function renderResults() {
  // Escape text before it is interpolated into innerHTML.
  const esc = (value) => String(value).replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

  // One sort-key extractor per column, in the same order as the header cells.
  const sortKeys = [
    (r) => r.model,
    (r) => r.scenario,
    (r) => r.specEntities,
    (r) => r.testsPassed / Math.max(r.testsTotal, 1),
    (r) => r.fixRounds,
    (r) => r.promptTokensEst,
    (r) => r.totalTokens,
    (r) => r.totalDurationMs,
    (r) => r.avgTokPerSec,
    (r) => r.score,
  ];
  const keyOf = sortKeys[sortCol];
  const direction = sortAsc ? 1 : -1;

  // Sort a copy so DATA keeps its original order.
  const sorted = [...DATA].sort((a, b) => {
    const x = keyOf(a);
    const y = keyOf(b);
    const cmp = typeof x === 'string' ? x.localeCompare(y) : x - y;
    return direction * cmp;
  });

  resBody.innerHTML = sorted.map((r) => {
    const c = cls(r);
    return `<tr>
      <td class="model-name">${esc(r.model)}</td>
      <td>${esc(r.scenario)}</td>
      <td>${r.specOk ? `✓ ${esc(r.specEntities)}e` : '<span class="fail">✗</span>'}</td>
      <td class="${c}">${pctBar(r.testsPassed, r.testsTotal)}</td>
      <td>${r.fixRounds > 0 ? r.fixRounds + '×' : '-'}</td>
      <td>${r.promptTokensEst > 0 ? '~' + (r.promptTokensEst / 1000).toFixed(1) + 'K' : '-'}</td>
      <td>${r.totalTokens > 0 ? (r.totalTokens / 1000).toFixed(1) + 'K' : '-'}</td>
      <td>${(r.totalDurationMs / 1000).toFixed(0)}s</td>
      <td>${Number(r.avgTokPerSec ?? 0).toFixed(0)}</td>
      <td><span class="stars">${esc(r.stars)}</span> ${esc(r.score)}p</td>
    </tr>`;
  }).join('');

  document.querySelectorAll('#results-table th').forEach((th, i) => {
    th.className = i === sortCol ? (sortAsc ? 'sorted-asc' : 'sorted-desc') : '';
  });
}
|
||||||
|
// Clicking a header cell sorts by that column; clicking the active column
// again flips the direction. A newly selected column starts descending.
document.querySelector('#results-table thead').addEventListener('click', (e) => {
  // Resolve the header cell even if the click landed on something nested inside it.
  const th = e.target.closest('th');
  const col = Number.parseInt(th?.dataset.col ?? '', 10);
  if (Number.isNaN(col)) return;

  if (sortCol === col) {
    sortAsc = !sortAsc;
  } else {
    sortCol = col;
    sortAsc = false;
  }
  renderResults();
});

// Initial render with the default sort state.
renderResults();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
122
kipina-codebench/results/2026-04-14T07-18.json
Normal file
122
kipina-codebench/results/2026-04-14T07-18.json
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 66903,
|
||||||
|
"totalTokens": 5454,
|
||||||
|
"avgTokPerSec": 86.45918994499432,
|
||||||
|
"promptChars": 9985,
|
||||||
|
"promptTokensEst": 2496,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 87618,
|
||||||
|
"totalTokens": 7150,
|
||||||
|
"avgTokPerSec": 87.21782190501095,
|
||||||
|
"promptChars": 9922,
|
||||||
|
"promptTokensEst": 2481,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 9,
|
||||||
|
"testsPassed": 5,
|
||||||
|
"testsFailed": 4,
|
||||||
|
"totalDurationMs": 78398,
|
||||||
|
"totalTokens": 6427,
|
||||||
|
"avgTokPerSec": 85.52353711143463,
|
||||||
|
"promptChars": 10737,
|
||||||
|
"promptTokensEst": 2684,
|
||||||
|
"score": 73,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 8,
|
||||||
|
"testsPassed": 7,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 82750,
|
||||||
|
"totalTokens": 10054,
|
||||||
|
"avgTokPerSec": 139.90690936146032,
|
||||||
|
"promptChars": 9360,
|
||||||
|
"promptTokensEst": 2340,
|
||||||
|
"score": 93,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 32233,
|
||||||
|
"totalTokens": 4404,
|
||||||
|
"avgTokPerSec": 143.4997404058814,
|
||||||
|
"promptChars": 9310,
|
||||||
|
"promptTokensEst": 2328,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 88563,
|
||||||
|
"totalTokens": 11575,
|
||||||
|
"avgTokPerSec": 141.54675017528362,
|
||||||
|
"promptChars": 10567,
|
||||||
|
"promptTokensEst": 2642,
|
||||||
|
"score": 40,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null
|
||||||
|
}
|
||||||
|
]
|
||||||
Reference in New Issue
Block a user