CodeBench: mallikohtaiset promptiprofiilit (profiles.json)
- profiles.json: malli → profiili → prompti -mappaus
- code-small.md: tiivistetty prompti pienille malleille (8b, 4b)
- benchmark valitsee automaattisesti oikean promptin mallin perusteella
- qwen3-coder:30b → code.md (large), qwen3:8b → code-small.md (small)
This commit is contained in:
@@ -46,9 +46,24 @@ function loadPrompt(name) {
|
|||||||
}
|
}
|
||||||
const CLIENT_SYSTEM = loadPrompt('client');
|
const CLIENT_SYSTEM = loadPrompt('client');
|
||||||
const SPEC_SYSTEM = loadPrompt('spec');
|
const SPEC_SYSTEM = loadPrompt('spec');
|
||||||
const CODE_SYSTEM = loadPrompt(LANG === 'rust' ? 'code-rs' : 'code');
|
|
||||||
const FIX_SYSTEM = loadPrompt('fix');
|
const FIX_SYSTEM = loadPrompt('fix');
|
||||||
|
|
||||||
|
// === Mallikohtaiset profiilit ===
|
||||||
|
// Model-specific prompt profiles, parsed from profiles.json in this module's directory.
// The code below reads its `models`, `profiles` and `default_profile` keys.
const PROFILES = JSON.parse(readFileSync(join(__dirname, 'profiles.json'), 'utf-8'));
|
||||||
|
/**
 * Pick the code-generation system prompt for a model based on PROFILES.
 *
 * The profile comes from the model's own entry, falling back to
 * `PROFILES.default_profile`. The prompt name comes from the model's entry,
 * then from the profile's entry, then defaults to 'code'.
 *
 * @param {string} model - Model identifier used as a key into `PROFILES.models`.
 * @returns {{system: string, promptName: string, profile: string}} The trimmed
 *   prompt text, the name of the prompt file actually used, and the profile.
 */
function getCodePromptForModel(model) {
  const entry = PROFILES.models[model];
  const profile = entry?.profile || PROFILES.default_profile;
  const basePrompt = entry?.prompt || PROFILES.profiles[profile]?.prompt || 'code';
  const langSuffix = LANG === 'rust' ? '-rs' : '';
  const promptFile = (name) => join(__dirname, 'prompts', `${name}.md`);

  // Try the language-specific variant first (e.g. code-small-rs), then the
  // base prompt (code-small), then the generic code prompt in the same order.
  const firstExisting = [
    `${basePrompt}${langSuffix}`,
    basePrompt,
    `code${langSuffix}`,
    'code',
  ].find((name) => existsSync(promptFile(name)));

  if (firstExisting !== undefined) {
    return {
      system: readFileSync(promptFile(firstExisting), 'utf-8').trim(),
      promptName: firstExisting,
      profile,
    };
  }

  // No candidate file exists on disk: defer to loadPrompt and report the 'large' profile.
  return { system: loadPrompt('code'), promptName: 'code', profile: 'large' };
}
|
||||||
|
|
||||||
// === Kultaisten esimerkkien lataus (kielen mukaan) ===
|
// === Kultaisten esimerkkien lataus (kielen mukaan) ===
|
||||||
const GOLDEN_DIR = join(__dirname, 'golden-examples');
|
const GOLDEN_DIR = join(__dirname, 'golden-examples');
|
||||||
const LANG_CONFIG = {
|
const LANG_CONFIG = {
|
||||||
@@ -281,6 +296,7 @@ async function runPipeline(model, scenario) {
|
|||||||
error: null,
|
error: null,
|
||||||
};
|
};
|
||||||
const timings = [];
|
const timings = [];
|
||||||
|
const { system: CODE_SYSTEM, promptName, profile } = getCodePromptForModel(model);
|
||||||
const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
|
const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
|
||||||
mkdirSync(dir, { recursive: true });
|
mkdirSync(dir, { recursive: true });
|
||||||
|
|
||||||
@@ -444,6 +460,8 @@ async function runPipeline(model, scenario) {
|
|||||||
result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
|
result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
|
||||||
result.score = scoreResult(result);
|
result.score = scoreResult(result);
|
||||||
result.stars = starsForScore(result.score);
|
result.stars = starsForScore(result.score);
|
||||||
|
result.profile = profile;
|
||||||
|
result.promptName = promptName;
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|||||||
47
kipina-codebench/profiles.json
Normal file
47
kipina-codebench/profiles.json
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
{
|
||||||
|
"models": {
|
||||||
|
"qwen3-coder:30b": {
|
||||||
|
"profile": "large",
|
||||||
|
"prompt": "code",
|
||||||
|
"golden": "todo.md",
|
||||||
|
"notes": "Pääkooderi. 97p, 188 tok/s. Noudattaa pitkiä sääntölistoja."
|
||||||
|
},
|
||||||
|
"qwen3:8b": {
|
||||||
|
"profile": "small",
|
||||||
|
"prompt": "code-small",
|
||||||
|
"golden": "todo.md",
|
||||||
|
"notes": "Kevyt vaihtoehto. Todo/users 100p, blog heikko. Lyhyt prompti toimii paremmin."
|
||||||
|
},
|
||||||
|
"qwen3:14b": {
|
||||||
|
"profile": "large",
|
||||||
|
"prompt": "code",
|
||||||
|
"golden": "todo.md",
|
||||||
|
"notes": "Poistettu käytöstä. Ei lisäarvoa 30b:hen verrattuna."
|
||||||
|
},
|
||||||
|
"codestral:22b": {
|
||||||
|
"profile": "large",
|
||||||
|
"prompt": "code",
|
||||||
|
"golden": "todo.md",
|
||||||
|
"notes": "Mistral-varamalli. 88p, 44 tok/s."
|
||||||
|
},
|
||||||
|
"qwen3:4b": {
|
||||||
|
"profile": "small",
|
||||||
|
"prompt": "code-small",
|
||||||
|
"golden": "todo.md",
|
||||||
|
"notes": "Minimaali. Vain todo toimii."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"profiles": {
|
||||||
|
"large": {
|
||||||
|
"prompt": "code",
|
||||||
|
"golden": "todo.md",
|
||||||
|
"description": "Täysi prompti + säännöt. Malleille >=14B."
|
||||||
|
},
|
||||||
|
"small": {
|
||||||
|
"prompt": "code-small",
|
||||||
|
"golden": "todo.md",
|
||||||
|
"description": "Tiivistetty prompti. Malleille <=8B."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"default_profile": "large"
|
||||||
|
}
|
||||||
22
kipina-codebench/prompts/code-small.md
Normal file
22
kipina-codebench/prompts/code-small.md
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
Generate a FastAPI project with SQLAlchemy and SQLite. Follow the REFERENCE IMPLEMENTATION exactly.
|
||||||
|
|
||||||
|
Generate these 4 files with === markers:
|
||||||
|
|
||||||
|
=== models.py ===
|
||||||
|
=== schemas.py ===
|
||||||
|
=== main.py ===
|
||||||
|
=== test_main.py ===
|
||||||
|
|
||||||
|
Key patterns (copy from reference):
|
||||||
|
- class Base(DeclarativeBase): pass
|
||||||
|
- Mapped[str] = mapped_column(String(255))
|
||||||
|
- Mapped[str | None] = mapped_column(Text, default=None)
|
||||||
|
- model_config = ConfigDict(from_attributes=True)
|
||||||
|
- model_dump() not dict()
|
||||||
|
- POST 201, GET list, GET by id 404, PUT, DELETE 204
|
||||||
|
|
||||||
|
CRITICAL:
|
||||||
|
- Use ONLY fields from the JSON spec — no created_at or extra fields
|
||||||
|
- Generate EXACTLY 6 tests per entity: create, list, get_by_id, not_found, update, delete
|
||||||
|
- No search, filter, or other extra tests
|
||||||
|
- No markdown fences in output
|
||||||
183
kipina-codebench/results/2026-04-14T09-43.html
Normal file
183
kipina-codebench/results/2026-04-14T09-43.html
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="fi">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Kipina Model Benchmark</title>
|
||||||
|
<style>
|
||||||
|
:root { --bg: #0d1117; --card: #161b22; --border: #30363d; --text: #e6edf3; --dim: #8b949e; --green: #3fb950; --yellow: #d29922; --red: #f85149; --blue: #58a6ff; }
|
||||||
|
* { box-sizing: border-box; margin: 0; padding: 0; }
|
||||||
|
body { font-family: -apple-system, 'Segoe UI', Helvetica, Arial, sans-serif; background: var(--bg); color: var(--text); padding: 2rem; max-width: 1400px; margin: 0 auto; }
|
||||||
|
h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
|
||||||
|
.meta { color: var(--dim); font-size: 0.85rem; margin-bottom: 2rem; }
|
||||||
|
.cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
|
||||||
|
.card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 1rem; }
|
||||||
|
.card .label { color: var(--dim); font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
|
||||||
|
.card .value { font-size: 1.8rem; font-weight: 600; margin-top: 0.25rem; }
|
||||||
|
.card .sub { color: var(--dim); font-size: 0.8rem; margin-top: 0.25rem; }
|
||||||
|
table { width: 100%; border-collapse: collapse; background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; margin-bottom: 2rem; }
|
||||||
|
th { background: #1c2128; text-align: left; padding: 0.6rem 0.8rem; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--dim); cursor: pointer; user-select: none; white-space: nowrap; }
|
||||||
|
th:hover { color: var(--text); }
|
||||||
|
th.sorted-asc::after { content: ' ▲'; }
|
||||||
|
th.sorted-desc::after { content: ' ▼'; }
|
||||||
|
td { padding: 0.5rem 0.8rem; border-top: 1px solid var(--border); font-size: 0.85rem; white-space: nowrap; }
|
||||||
|
tr:hover td { background: #1c2128; }
|
||||||
|
.pass { color: var(--green); }
|
||||||
|
.partial { color: var(--yellow); }
|
||||||
|
.fail { color: var(--red); }
|
||||||
|
.stars { letter-spacing: 1px; }
|
||||||
|
.bar { display: inline-block; height: 8px; border-radius: 4px; vertical-align: middle; }
|
||||||
|
.bar-bg { background: var(--border); }
|
||||||
|
.bar-fill { background: var(--green); }
|
||||||
|
.bar-partial { background: var(--yellow); }
|
||||||
|
.model-name { font-weight: 600; }
|
||||||
|
h2 { font-size: 1.1rem; margin-bottom: 1rem; color: var(--dim); }
|
||||||
|
.summary-table th:first-child, .summary-table td:first-child { min-width: 200px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>Kipina Model Benchmark</h1>
|
||||||
|
<div class="meta" id="meta"></div>
|
||||||
|
|
||||||
|
<div class="cards" id="cards"></div>
|
||||||
|
|
||||||
|
<h2>Mallikohtainen yhteenveto</h2>
|
||||||
|
<table class="summary-table" id="summary-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<h2>Kaikki tulokset</h2>
|
||||||
|
<table id="results-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
const RAW = [{"model":"qwen3-coder:30b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":6,"testsPassed":6,"testsFailed":0,"totalDurationMs":21688,"totalTokens":2243,"avgTokPerSec":121.7719614197307,"promptChars":11588,"promptTokensEst":2897,"score":100,"stars":"★★★★★","error":null}];
|
||||||
|
|
||||||
|
// Map a numeric score to a five-character star rating (thresholds 90 / 70 / 50 / 25 / >0).
const starsFor = (s) => {
  if (s >= 90) return '★★★★★';
  if (s >= 70) return '★★★★☆';
  if (s >= 50) return '★★★☆☆';
  if (s >= 25) return '★★☆☆☆';
  if (s > 0) return '★☆☆☆☆';
  return '☆☆☆☆☆';
};
|
||||||
|
/**
 * Compute a score (capped at 100) for one benchmark result record.
 *
 * Components: 10 for a valid spec, 10 when the run had no error or produced
 * tests, up to 60 proportional to the test pass ratio, and 20 minus 10 per
 * fix round (never below 0). A run with an error and zero tests scores 0.
 *
 * @param {object} r - Result record (reads error, testsTotal, testsPassed, specOk, fixRounds).
 * @returns {number} The score.
 */
function calcScore(r) {
  if (r.error && r.testsTotal === 0) return 0;

  const specPoints = r.specOk ? 10 : 0;
  const runPoints = !r.error || r.testsTotal > 0 ? 10 : 0;
  const testPoints = r.testsTotal > 0 ? Math.round((r.testsPassed / r.testsTotal) * 60) : 0;
  const fixPoints = Math.max(0, 20 - (r.fixRounds || 0) * 10);

  return Math.min(100, specPoints + runPoints + testPoints + fixPoints);
}
|
||||||
|
// Laske pisteet jos puuttuvat
|
||||||
|
const DATA = RAW.map(r => {
|
||||||
|
if (r.score == null) r.score = calcScore(r);
|
||||||
|
if (!r.stars) r.stars = starsFor(r.score);
|
||||||
|
if (!r.promptTokensEst) r.promptTokensEst = r.promptChars ? Math.round(r.promptChars / 4) : 0;
|
||||||
|
return r;
|
||||||
|
});
|
||||||
|
// CSS class for a result: 'pass' when error-free and every test passed,
// 'partial' when at least one test passed, otherwise 'fail'.
const cls = (r) => {
  const hasTests = r.testsTotal > 0;
  if (!r.error && r.testsPassed === r.testsTotal && hasTests) return 'pass';
  if (hasTests && r.testsPassed > 0) return 'partial';
  return 'fail';
};
|
||||||
|
/**
 * Build the HTML for a small progress bar followed by a "passed/total" label.
 *
 * @param {number} passed - Number of passed tests.
 * @param {number} total - Total number of tests; 0 yields a plain '-'.
 * @param {number} [w=80] - Bar width in pixels.
 * @returns {string} HTML markup, or '-' when there are no tests.
 */
const pctBar = (passed, total, w = 80) => {
  if (total === 0) return '-';

  const percent = (passed / total) * 100;
  // Green fill only for a full pass; anything else uses the partial colour.
  const fillClass = percent === 100 ? 'bar-fill' : 'bar-partial';
  const fillWidth = Math.round((percent / 100) * w);
  const outerOpen = `<span class="bar bar-bg" style="width:${w}px">`;
  const inner = `<span class="bar ${fillClass}" style="width:${fillWidth}px"></span>`;

  return `${outerOpen}${inner}</span> ${passed}/${total}`;
};
|
||||||
|
|
||||||
|
// Meta
|
||||||
|
const totalTime = DATA.reduce((s,r) => s + r.totalDurationMs, 0);
|
||||||
|
document.getElementById('meta').textContent = `${new Date().toLocaleDateString('fi-FI')} — ${DATA.length} ajoa — ${(totalTime/1000/60).toFixed(1)} min`;
|
||||||
|
|
||||||
|
// Cards
|
||||||
|
const models = [...new Set(DATA.map(r => r.model))];
|
||||||
|
const scenarios = [...new Set(DATA.map(r => r.scenario))];
|
||||||
|
const avgScore = DATA.length ? Math.round(DATA.reduce((s,r) => s + r.score, 0) / DATA.length) : 0;
|
||||||
|
const totalPassed = DATA.reduce((s,r) => s + r.testsPassed, 0);
|
||||||
|
const totalTests = DATA.reduce((s,r) => s + r.testsTotal, 0);
|
||||||
|
const passRate = totalTests ? Math.round(totalPassed/totalTests*100) : 0;
|
||||||
|
const bestModel = models.map(m => {
|
||||||
|
const mrs = DATA.filter(r => r.model === m);
|
||||||
|
return { model: m, avg: Math.round(mrs.reduce((s,r) => s + r.score, 0) / mrs.length) };
|
||||||
|
}).sort((a,b) => b.avg - a.avg)[0];
|
||||||
|
const fastestModel = models.map(m => {
|
||||||
|
const mrs = DATA.filter(r => r.model === m);
|
||||||
|
return { model: m, speed: Math.round(mrs.reduce((s,r) => s + r.avgTokPerSec, 0) / mrs.length) };
|
||||||
|
}).sort((a,b) => b.speed - a.speed)[0];
|
||||||
|
|
||||||
|
document.getElementById('cards').innerHTML = `
|
||||||
|
<div class="card"><div class="label">Keskiarvo</div><div class="value">${starsFor(avgScore)}</div><div class="sub">${avgScore} pistetta</div></div>
|
||||||
|
<div class="card"><div class="label">Testien läpäisy</div><div class="value">${passRate}%</div><div class="sub">${totalPassed}/${totalTests} testiä</div></div>
|
||||||
|
<div class="card"><div class="label">Paras malli</div><div class="value" style="font-size:1.2rem">${bestModel?.model || '-'}</div><div class="sub">${bestModel?.avg || 0}p</div></div>
|
||||||
|
<div class="card"><div class="label">Nopein</div><div class="value" style="font-size:1.2rem">${fastestModel?.model || '-'}</div><div class="sub">${fastestModel?.speed || 0} tok/s</div></div>
|
||||||
|
<div class="card"><div class="label">Malleja</div><div class="value">${models.length}</div><div class="sub">${scenarios.length} skenaariota</div></div>
|
||||||
|
<div class="card"><div class="label">Kokonaisaika</div><div class="value">${(totalTime/1000/60).toFixed(1)}</div><div class="sub">minuuttia</div></div>
|
||||||
|
`;
|
||||||
|
|
||||||
|
// Summary table
|
||||||
|
const sumHead = document.querySelector('#summary-table thead');
|
||||||
|
const sumBody = document.querySelector('#summary-table tbody');
|
||||||
|
sumHead.innerHTML = '<tr><th>Malli</th>' + scenarios.map(s => `<th>${s}</th>`).join('') + '<th>Yht.</th><th>Out tok</th><th>Aika</th><th>tok/s</th><th>Pisteet</th></tr>';
|
||||||
|
|
||||||
|
const modelRows = models.map(m => {
|
||||||
|
const mrs = DATA.filter(r => r.model === m);
|
||||||
|
const tp = mrs.reduce((s,r) => s + r.testsPassed, 0);
|
||||||
|
const tt = mrs.reduce((s,r) => s + r.testsTotal, 0);
|
||||||
|
const tok = mrs.reduce((s,r) => s + r.totalTokens, 0);
|
||||||
|
const time = mrs.reduce((s,r) => s + r.totalDurationMs, 0);
|
||||||
|
const speed = Math.round(mrs.reduce((s,r) => s + r.avgTokPerSec, 0) / mrs.length);
|
||||||
|
const avg = Math.round(mrs.reduce((s,r) => s + r.score, 0) / mrs.length);
|
||||||
|
const scenCols = scenarios.map(s => {
|
||||||
|
const r = mrs.find(r => r.scenario === s);
|
||||||
|
if (!r) return '<td>-</td>';
|
||||||
|
return `<td class="${cls(r)}">${pctBar(r.testsPassed, r.testsTotal, 60)} <span style="color:var(--dim)">${(r.totalDurationMs/1000).toFixed(0)}s</span></td>`;
|
||||||
|
}).join('');
|
||||||
|
return { avg, html: `<tr><td class="model-name">${m}</td>${scenCols}<td>${pctBar(tp, tt)}</td><td>${(tok/1000).toFixed(1)}K</td><td>${(time/1000).toFixed(0)}s</td><td>${speed}</td><td><span class="stars">${starsFor(avg)}</span> ${avg}p</td></tr>` };
|
||||||
|
}).sort((a,b) => b.avg - a.avg);
|
||||||
|
sumBody.innerHTML = modelRows.map(r => r.html).join('');
|
||||||
|
|
||||||
|
// Results table
|
||||||
|
const resHead = document.querySelector('#results-table thead');
|
||||||
|
const resBody = document.querySelector('#results-table tbody');
|
||||||
|
const resCols = ['Malli','Skenaario','Speksi','Testit','Korjaus','Ctx','Out tok','Aika','tok/s','Pisteet'];
|
||||||
|
resHead.innerHTML = '<tr>' + resCols.map((c,i) => `<th data-col="${i}">${c}</th>`).join('') + '</tr>';
|
||||||
|
|
||||||
|
let sortCol = 9, sortAsc = false;
|
||||||
|
/**
 * Render the "all results" table body sorted by the current `sortCol` /
 * `sortAsc` state, then mark the active header cell with the matching
 * sort-direction class.
 *
 * String columns are compared with localeCompare, all others numerically.
 */
function renderResults() {
  // One accessor per column, in header order (index === data-col). Built once
  // per render instead of allocating ten value pairs on every comparison.
  const sortKeys = [
    (r) => r.model,
    (r) => r.scenario,
    (r) => r.specEntities,
    (r) => r.testsPassed / Math.max(r.testsTotal, 1), // avoid dividing by zero
    (r) => r.fixRounds,
    (r) => r.promptTokensEst,
    (r) => r.totalTokens,
    (r) => r.totalDurationMs,
    (r) => r.avgTokPerSec,
    (r) => r.score,
  ];
  const keyOf = sortKeys[sortCol];

  // Sort a copy so DATA keeps its original order.
  const sorted = [...DATA].sort((a, b) => {
    const x = keyOf(a);
    const y = keyOf(b);
    const cmp = typeof x === 'string' ? x.localeCompare(y) : x - y;
    return sortAsc ? cmp : -cmp;
  });

  resBody.innerHTML = sorted.map((r) => {
    const c = cls(r);
    return `<tr>
      <td class="model-name">${r.model}</td>
      <td>${r.scenario}</td>
      <td>${r.specOk ? `✓ ${r.specEntities}e` : '<span class="fail">✗</span>'}</td>
      <td class="${c}">${pctBar(r.testsPassed, r.testsTotal)}</td>
      <td>${r.fixRounds > 0 ? r.fixRounds + '×' : '-'}</td>
      <td>${r.promptTokensEst > 0 ? '~'+(r.promptTokensEst/1000).toFixed(1)+'K' : '-'}</td>
      <td>${r.totalTokens > 0 ? (r.totalTokens/1000).toFixed(1)+'K' : '-'}</td>
      <td>${(r.totalDurationMs/1000).toFixed(0)}s</td>
      <td>${r.avgTokPerSec.toFixed(0)}</td>
      <td><span class="stars">${r.stars}</span> ${r.score}p</td>
    </tr>`;
  }).join('');

  // Only the active column carries a sort-direction class; all others are cleared.
  document.querySelectorAll('#results-table th').forEach((th, i) => {
    th.className = i === sortCol ? (sortAsc ? 'sorted-asc' : 'sorted-desc') : '';
  });
}
|
||||||
|
// Re-sort the results table when a header cell is clicked: clicking the active
// column flips the direction, clicking another column selects it (descending first).
document.querySelector('#results-table thead').addEventListener('click', (e) => {
  // Explicit radix and non-coercing NaN check; a click target without
  // data-col parses to NaN and is ignored.
  const col = Number.parseInt(e.target.dataset.col, 10);
  if (Number.isNaN(col)) return;

  if (sortCol === col) {
    sortAsc = !sortAsc;
  } else {
    sortCol = col;
    sortAsc = false;
  }
  renderResults();
});
|
||||||
|
renderResults();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
22
kipina-codebench/results/2026-04-14T09-43.json
Normal file
22
kipina-codebench/results/2026-04-14T09-43.json
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 21688,
|
||||||
|
"totalTokens": 2243,
|
||||||
|
"avgTokPerSec": 121.7719614197307,
|
||||||
|
"promptChars": 11588,
|
||||||
|
"promptTokensEst": 2897,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
}
|
||||||
|
]
|
||||||
183
kipina-codebench/results/2026-04-14T09-44.html
Normal file
183
kipina-codebench/results/2026-04-14T09-44.html
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="fi">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Kipina Model Benchmark</title>
|
||||||
|
<style>
|
||||||
|
:root { --bg: #0d1117; --card: #161b22; --border: #30363d; --text: #e6edf3; --dim: #8b949e; --green: #3fb950; --yellow: #d29922; --red: #f85149; --blue: #58a6ff; }
|
||||||
|
* { box-sizing: border-box; margin: 0; padding: 0; }
|
||||||
|
body { font-family: -apple-system, 'Segoe UI', Helvetica, Arial, sans-serif; background: var(--bg); color: var(--text); padding: 2rem; max-width: 1400px; margin: 0 auto; }
|
||||||
|
h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
|
||||||
|
.meta { color: var(--dim); font-size: 0.85rem; margin-bottom: 2rem; }
|
||||||
|
.cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
|
||||||
|
.card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 1rem; }
|
||||||
|
.card .label { color: var(--dim); font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
|
||||||
|
.card .value { font-size: 1.8rem; font-weight: 600; margin-top: 0.25rem; }
|
||||||
|
.card .sub { color: var(--dim); font-size: 0.8rem; margin-top: 0.25rem; }
|
||||||
|
table { width: 100%; border-collapse: collapse; background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; margin-bottom: 2rem; }
|
||||||
|
th { background: #1c2128; text-align: left; padding: 0.6rem 0.8rem; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--dim); cursor: pointer; user-select: none; white-space: nowrap; }
|
||||||
|
th:hover { color: var(--text); }
|
||||||
|
th.sorted-asc::after { content: ' ▲'; }
|
||||||
|
th.sorted-desc::after { content: ' ▼'; }
|
||||||
|
td { padding: 0.5rem 0.8rem; border-top: 1px solid var(--border); font-size: 0.85rem; white-space: nowrap; }
|
||||||
|
tr:hover td { background: #1c2128; }
|
||||||
|
.pass { color: var(--green); }
|
||||||
|
.partial { color: var(--yellow); }
|
||||||
|
.fail { color: var(--red); }
|
||||||
|
.stars { letter-spacing: 1px; }
|
||||||
|
.bar { display: inline-block; height: 8px; border-radius: 4px; vertical-align: middle; }
|
||||||
|
.bar-bg { background: var(--border); }
|
||||||
|
.bar-fill { background: var(--green); }
|
||||||
|
.bar-partial { background: var(--yellow); }
|
||||||
|
.model-name { font-weight: 600; }
|
||||||
|
h2 { font-size: 1.1rem; margin-bottom: 1rem; color: var(--dim); }
|
||||||
|
.summary-table th:first-child, .summary-table td:first-child { min-width: 200px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>Kipina Model Benchmark</h1>
|
||||||
|
<div class="meta" id="meta"></div>
|
||||||
|
|
||||||
|
<div class="cards" id="cards"></div>
|
||||||
|
|
||||||
|
<h2>Mallikohtainen yhteenveto</h2>
|
||||||
|
<table class="summary-table" id="summary-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<h2>Kaikki tulokset</h2>
|
||||||
|
<table id="results-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
const RAW = [{"model":"qwen3:8b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":6,"testsPassed":6,"testsFailed":0,"totalDurationMs":23521,"totalTokens":2090,"avgTokPerSec":100.94324085271073,"promptChars":10962,"promptTokensEst":2741,"score":100,"stars":"★★★★★","error":null},{"model":"qwen3:8b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":1,"testsTotal":6,"testsPassed":6,"testsFailed":0,"totalDurationMs":33680,"totalTokens":3003,"avgTokPerSec":100.52754588753601,"promptChars":10171,"promptTokensEst":2543,"score":90,"stars":"★★★★★","error":null},{"model":"qwen3:8b","scenario":"blog","reqOk":true,"specOk":false,"specEntities":0,"validationIssues":0,"fixRounds":0,"testsTotal":0,"testsPassed":0,"testsFailed":0,"totalDurationMs":0,"totalTokens":0,"avgTokPerSec":0,"promptChars":0,"promptTokensEst":0,"score":0,"stars":"","error":"JSON-speksi epäonnistui"}];
|
||||||
|
|
||||||
|
// Map a numeric score to a five-character star rating (thresholds 90 / 70 / 50 / 25 / >0).
const starsFor = (s) => {
  if (s >= 90) return '★★★★★';
  if (s >= 70) return '★★★★☆';
  if (s >= 50) return '★★★☆☆';
  if (s >= 25) return '★★☆☆☆';
  if (s > 0) return '★☆☆☆☆';
  return '☆☆☆☆☆';
};
|
||||||
|
/**
 * Compute a score (capped at 100) for one benchmark result record.
 *
 * Components: 10 for a valid spec, 10 when the run had no error or produced
 * tests, up to 60 proportional to the test pass ratio, and 20 minus 10 per
 * fix round (never below 0). A run with an error and zero tests scores 0.
 *
 * @param {object} r - Result record (reads error, testsTotal, testsPassed, specOk, fixRounds).
 * @returns {number} The score.
 */
function calcScore(r) {
  if (r.error && r.testsTotal === 0) return 0;

  const specPoints = r.specOk ? 10 : 0;
  const runPoints = !r.error || r.testsTotal > 0 ? 10 : 0;
  const testPoints = r.testsTotal > 0 ? Math.round((r.testsPassed / r.testsTotal) * 60) : 0;
  const fixPoints = Math.max(0, 20 - (r.fixRounds || 0) * 10);

  return Math.min(100, specPoints + runPoints + testPoints + fixPoints);
}
|
||||||
|
// Laske pisteet jos puuttuvat
|
||||||
|
const DATA = RAW.map(r => {
|
||||||
|
if (r.score == null) r.score = calcScore(r);
|
||||||
|
if (!r.stars) r.stars = starsFor(r.score);
|
||||||
|
if (!r.promptTokensEst) r.promptTokensEst = r.promptChars ? Math.round(r.promptChars / 4) : 0;
|
||||||
|
return r;
|
||||||
|
});
|
||||||
|
// CSS class for a result: 'pass' when error-free and every test passed,
// 'partial' when at least one test passed, otherwise 'fail'.
const cls = (r) => {
  const hasTests = r.testsTotal > 0;
  if (!r.error && r.testsPassed === r.testsTotal && hasTests) return 'pass';
  if (hasTests && r.testsPassed > 0) return 'partial';
  return 'fail';
};
|
||||||
|
/**
 * Build the HTML for a small progress bar followed by a "passed/total" label.
 *
 * @param {number} passed - Number of passed tests.
 * @param {number} total - Total number of tests; 0 yields a plain '-'.
 * @param {number} [w=80] - Bar width in pixels.
 * @returns {string} HTML markup, or '-' when there are no tests.
 */
const pctBar = (passed, total, w = 80) => {
  if (total === 0) return '-';

  const percent = (passed / total) * 100;
  // Green fill only for a full pass; anything else uses the partial colour.
  const fillClass = percent === 100 ? 'bar-fill' : 'bar-partial';
  const fillWidth = Math.round((percent / 100) * w);
  const outerOpen = `<span class="bar bar-bg" style="width:${w}px">`;
  const inner = `<span class="bar ${fillClass}" style="width:${fillWidth}px"></span>`;

  return `${outerOpen}${inner}</span> ${passed}/${total}`;
};
|
||||||
|
|
||||||
|
// Meta
|
||||||
|
const totalTime = DATA.reduce((s,r) => s + r.totalDurationMs, 0);
|
||||||
|
document.getElementById('meta').textContent = `${new Date().toLocaleDateString('fi-FI')} — ${DATA.length} ajoa — ${(totalTime/1000/60).toFixed(1)} min`;
|
||||||
|
|
||||||
|
// Cards
|
||||||
|
const models = [...new Set(DATA.map(r => r.model))];
|
||||||
|
const scenarios = [...new Set(DATA.map(r => r.scenario))];
|
||||||
|
const avgScore = DATA.length ? Math.round(DATA.reduce((s,r) => s + r.score, 0) / DATA.length) : 0;
|
||||||
|
const totalPassed = DATA.reduce((s,r) => s + r.testsPassed, 0);
|
||||||
|
const totalTests = DATA.reduce((s,r) => s + r.testsTotal, 0);
|
||||||
|
const passRate = totalTests ? Math.round(totalPassed/totalTests*100) : 0;
|
||||||
|
const bestModel = models.map(m => {
|
||||||
|
const mrs = DATA.filter(r => r.model === m);
|
||||||
|
return { model: m, avg: Math.round(mrs.reduce((s,r) => s + r.score, 0) / mrs.length) };
|
||||||
|
}).sort((a,b) => b.avg - a.avg)[0];
|
||||||
|
const fastestModel = models.map(m => {
|
||||||
|
const mrs = DATA.filter(r => r.model === m);
|
||||||
|
return { model: m, speed: Math.round(mrs.reduce((s,r) => s + r.avgTokPerSec, 0) / mrs.length) };
|
||||||
|
}).sort((a,b) => b.speed - a.speed)[0];
|
||||||
|
|
||||||
|
document.getElementById('cards').innerHTML = `
|
||||||
|
<div class="card"><div class="label">Keskiarvo</div><div class="value">${starsFor(avgScore)}</div><div class="sub">${avgScore} pistetta</div></div>
|
||||||
|
<div class="card"><div class="label">Testien läpäisy</div><div class="value">${passRate}%</div><div class="sub">${totalPassed}/${totalTests} testiä</div></div>
|
||||||
|
<div class="card"><div class="label">Paras malli</div><div class="value" style="font-size:1.2rem">${bestModel?.model || '-'}</div><div class="sub">${bestModel?.avg || 0}p</div></div>
|
||||||
|
<div class="card"><div class="label">Nopein</div><div class="value" style="font-size:1.2rem">${fastestModel?.model || '-'}</div><div class="sub">${fastestModel?.speed || 0} tok/s</div></div>
|
||||||
|
<div class="card"><div class="label">Malleja</div><div class="value">${models.length}</div><div class="sub">${scenarios.length} skenaariota</div></div>
|
||||||
|
<div class="card"><div class="label">Kokonaisaika</div><div class="value">${(totalTime/1000/60).toFixed(1)}</div><div class="sub">minuuttia</div></div>
|
||||||
|
`;
|
||||||
|
|
||||||
|
// Summary table
|
||||||
|
const sumHead = document.querySelector('#summary-table thead');
|
||||||
|
const sumBody = document.querySelector('#summary-table tbody');
|
||||||
|
sumHead.innerHTML = '<tr><th>Malli</th>' + scenarios.map(s => `<th>${s}</th>`).join('') + '<th>Yht.</th><th>Out tok</th><th>Aika</th><th>tok/s</th><th>Pisteet</th></tr>';
|
||||||
|
|
||||||
|
const modelRows = models.map(m => {
|
||||||
|
const mrs = DATA.filter(r => r.model === m);
|
||||||
|
const tp = mrs.reduce((s,r) => s + r.testsPassed, 0);
|
||||||
|
const tt = mrs.reduce((s,r) => s + r.testsTotal, 0);
|
||||||
|
const tok = mrs.reduce((s,r) => s + r.totalTokens, 0);
|
||||||
|
const time = mrs.reduce((s,r) => s + r.totalDurationMs, 0);
|
||||||
|
const speed = Math.round(mrs.reduce((s,r) => s + r.avgTokPerSec, 0) / mrs.length);
|
||||||
|
const avg = Math.round(mrs.reduce((s,r) => s + r.score, 0) / mrs.length);
|
||||||
|
const scenCols = scenarios.map(s => {
|
||||||
|
const r = mrs.find(r => r.scenario === s);
|
||||||
|
if (!r) return '<td>-</td>';
|
||||||
|
return `<td class="${cls(r)}">${pctBar(r.testsPassed, r.testsTotal, 60)} <span style="color:var(--dim)">${(r.totalDurationMs/1000).toFixed(0)}s</span></td>`;
|
||||||
|
}).join('');
|
||||||
|
return { avg, html: `<tr><td class="model-name">${m}</td>${scenCols}<td>${pctBar(tp, tt)}</td><td>${(tok/1000).toFixed(1)}K</td><td>${(time/1000).toFixed(0)}s</td><td>${speed}</td><td><span class="stars">${starsFor(avg)}</span> ${avg}p</td></tr>` };
|
||||||
|
}).sort((a,b) => b.avg - a.avg);
|
||||||
|
sumBody.innerHTML = modelRows.map(r => r.html).join('');
|
||||||
|
|
||||||
|
// Results table
|
||||||
|
const resHead = document.querySelector('#results-table thead');
|
||||||
|
const resBody = document.querySelector('#results-table tbody');
|
||||||
|
const resCols = ['Malli','Skenaario','Speksi','Testit','Korjaus','Ctx','Out tok','Aika','tok/s','Pisteet'];
|
||||||
|
resHead.innerHTML = '<tr>' + resCols.map((c,i) => `<th data-col="${i}">${c}</th>`).join('') + '</tr>';
|
||||||
|
|
||||||
|
let sortCol = 9, sortAsc = false;
|
||||||
|
/**
 * Render the "all results" table body sorted by the current `sortCol` /
 * `sortAsc` state, then mark the active header cell with the matching
 * sort-direction class.
 *
 * String columns are compared with localeCompare, all others numerically.
 */
function renderResults() {
  // One accessor per column, in header order (index === data-col). Built once
  // per render instead of allocating ten value pairs on every comparison.
  const sortKeys = [
    (r) => r.model,
    (r) => r.scenario,
    (r) => r.specEntities,
    (r) => r.testsPassed / Math.max(r.testsTotal, 1), // avoid dividing by zero
    (r) => r.fixRounds,
    (r) => r.promptTokensEst,
    (r) => r.totalTokens,
    (r) => r.totalDurationMs,
    (r) => r.avgTokPerSec,
    (r) => r.score,
  ];
  const keyOf = sortKeys[sortCol];

  // Sort a copy so DATA keeps its original order.
  const sorted = [...DATA].sort((a, b) => {
    const x = keyOf(a);
    const y = keyOf(b);
    const cmp = typeof x === 'string' ? x.localeCompare(y) : x - y;
    return sortAsc ? cmp : -cmp;
  });

  resBody.innerHTML = sorted.map((r) => {
    const c = cls(r);
    return `<tr>
      <td class="model-name">${r.model}</td>
      <td>${r.scenario}</td>
      <td>${r.specOk ? `✓ ${r.specEntities}e` : '<span class="fail">✗</span>'}</td>
      <td class="${c}">${pctBar(r.testsPassed, r.testsTotal)}</td>
      <td>${r.fixRounds > 0 ? r.fixRounds + '×' : '-'}</td>
      <td>${r.promptTokensEst > 0 ? '~'+(r.promptTokensEst/1000).toFixed(1)+'K' : '-'}</td>
      <td>${r.totalTokens > 0 ? (r.totalTokens/1000).toFixed(1)+'K' : '-'}</td>
      <td>${(r.totalDurationMs/1000).toFixed(0)}s</td>
      <td>${r.avgTokPerSec.toFixed(0)}</td>
      <td><span class="stars">${r.stars}</span> ${r.score}p</td>
    </tr>`;
  }).join('');

  // Only the active column carries a sort-direction class; all others are cleared.
  document.querySelectorAll('#results-table th').forEach((th, i) => {
    th.className = i === sortCol ? (sortAsc ? 'sorted-asc' : 'sorted-desc') : '';
  });
}
|
||||||
|
// Re-sort the results table when a header cell is clicked: clicking the active
// column flips the direction, clicking another column selects it (descending first).
document.querySelector('#results-table thead').addEventListener('click', (e) => {
  // Explicit radix and non-coercing NaN check; a click target without
  // data-col parses to NaN and is ignored.
  const col = Number.parseInt(e.target.dataset.col, 10);
  if (Number.isNaN(col)) return;

  if (sortCol === col) {
    sortAsc = !sortAsc;
  } else {
    sortCol = col;
    sortAsc = false;
  }
  renderResults();
});
|
||||||
|
renderResults();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
62
kipina-codebench/results/2026-04-14T09-44.json
Normal file
62
kipina-codebench/results/2026-04-14T09-44.json
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 23521,
|
||||||
|
"totalTokens": 2090,
|
||||||
|
"avgTokPerSec": 100.94324085271073,
|
||||||
|
"promptChars": 10962,
|
||||||
|
"promptTokensEst": 2741,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 1,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 33680,
|
||||||
|
"totalTokens": 3003,
|
||||||
|
"avgTokPerSec": 100.52754588753601,
|
||||||
|
"promptChars": 10171,
|
||||||
|
"promptTokensEst": 2543,
|
||||||
|
"score": 90,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": false,
|
||||||
|
"specEntities": 0,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 0,
|
||||||
|
"totalTokens": 0,
|
||||||
|
"avgTokPerSec": 0,
|
||||||
|
"promptChars": 0,
|
||||||
|
"promptTokensEst": 0,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "",
|
||||||
|
"error": "JSON-speksi epäonnistui"
|
||||||
|
}
|
||||||
|
]
|
||||||
183
kipina-codebench/results/2026-04-14T09-47.html
Normal file
183
kipina-codebench/results/2026-04-14T09-47.html
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="fi">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Kipina Model Benchmark</title>
|
||||||
|
<style>
|
||||||
|
:root { --bg: #0d1117; --card: #161b22; --border: #30363d; --text: #e6edf3; --dim: #8b949e; --green: #3fb950; --yellow: #d29922; --red: #f85149; --blue: #58a6ff; }
|
||||||
|
* { box-sizing: border-box; margin: 0; padding: 0; }
|
||||||
|
body { font-family: -apple-system, 'Segoe UI', Helvetica, Arial, sans-serif; background: var(--bg); color: var(--text); padding: 2rem; max-width: 1400px; margin: 0 auto; }
|
||||||
|
h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
|
||||||
|
.meta { color: var(--dim); font-size: 0.85rem; margin-bottom: 2rem; }
|
||||||
|
.cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
|
||||||
|
.card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 1rem; }
|
||||||
|
.card .label { color: var(--dim); font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
|
||||||
|
.card .value { font-size: 1.8rem; font-weight: 600; margin-top: 0.25rem; }
|
||||||
|
.card .sub { color: var(--dim); font-size: 0.8rem; margin-top: 0.25rem; }
|
||||||
|
table { width: 100%; border-collapse: collapse; background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; margin-bottom: 2rem; }
|
||||||
|
th { background: #1c2128; text-align: left; padding: 0.6rem 0.8rem; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--dim); cursor: pointer; user-select: none; white-space: nowrap; }
|
||||||
|
th:hover { color: var(--text); }
|
||||||
|
th.sorted-asc::after { content: ' ▲'; }
|
||||||
|
th.sorted-desc::after { content: ' ▼'; }
|
||||||
|
td { padding: 0.5rem 0.8rem; border-top: 1px solid var(--border); font-size: 0.85rem; white-space: nowrap; }
|
||||||
|
tr:hover td { background: #1c2128; }
|
||||||
|
.pass { color: var(--green); }
|
||||||
|
.partial { color: var(--yellow); }
|
||||||
|
.fail { color: var(--red); }
|
||||||
|
.stars { letter-spacing: 1px; }
|
||||||
|
.bar { display: inline-block; height: 8px; border-radius: 4px; vertical-align: middle; }
|
||||||
|
.bar-bg { background: var(--border); }
|
||||||
|
.bar-fill { background: var(--green); }
|
||||||
|
.bar-partial { background: var(--yellow); }
|
||||||
|
.model-name { font-weight: 600; }
|
||||||
|
h2 { font-size: 1.1rem; margin-bottom: 1rem; color: var(--dim); }
|
||||||
|
.summary-table th:first-child, .summary-table td:first-child { min-width: 200px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>Kipina Model Benchmark</h1>
|
||||||
|
<div class="meta" id="meta"></div>
|
||||||
|
|
||||||
|
<div class="cards" id="cards"></div>
|
||||||
|
|
||||||
|
<h2>Mallikohtainen yhteenveto</h2>
|
||||||
|
<table class="summary-table" id="summary-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<h2>Kaikki tulokset</h2>
|
||||||
|
<table id="results-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
const RAW = [{"model":"qwen3:8b","scenario":"todo","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":3,"testsTotal":8,"testsPassed":6,"testsFailed":2,"totalDurationMs":97470,"totalTokens":8786,"avgTokPerSec":97.96636139685832,"promptChars":11290,"promptTokensEst":2823,"score":65,"stars":"★★★☆☆","error":null},{"model":"qwen3:8b","scenario":"users","reqOk":true,"specOk":true,"specEntities":1,"validationIssues":0,"fixRounds":0,"testsTotal":6,"testsPassed":6,"testsFailed":0,"totalDurationMs":18951,"totalTokens":1666,"avgTokPerSec":101.807593927545,"promptChars":10293,"promptTokensEst":2573,"score":100,"stars":"★★★★★","error":null},{"model":"qwen3:8b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":3,"testsTotal":1,"testsPassed":0,"testsFailed":1,"totalDurationMs":126005,"totalTokens":11056,"avgTokPerSec":96.6373549161171,"promptChars":11878,"promptTokensEst":2970,"score":20,"stars":"★☆☆☆☆","error":"Syntaksivirhe"}];
|
||||||
|
|
||||||
|
/**
 * Turn a numeric score into a five-character star rating.
 * Thresholds: >= 90 five stars, >= 70 four, >= 50 three, >= 25 two,
 * anything above 0 one star, otherwise no filled stars.
 */
const starsFor = (s) => {
  if (s >= 90) return '★★★★★';
  if (s >= 70) return '★★★★☆';
  if (s >= 50) return '★★★☆☆';
  if (s >= 25) return '★★☆☆☆';
  if (s > 0) return '★☆☆☆☆';
  return '☆☆☆☆☆';
};
|
||||||
|
/**
 * Compute the benchmark score (capped at 100) for one run record.
 *
 * Points: 10 for a valid spec, 10 when the run has no error or produced
 * at least one test, up to 60 in proportion to passed tests, and up to
 * 20 for fix rounds (10 deducted per round, never below 0).
 * A run that errored without producing any test scores 0.
 */
function calcScore(r) {
  if (r.error && r.testsTotal === 0) {
    return 0;
  }
  const hasTests = r.testsTotal > 0;
  const specPoints = r.specOk ? 10 : 0;
  const runPoints = !r.error || hasTests ? 10 : 0;
  const testPoints = hasTests ? Math.round((r.testsPassed / r.testsTotal) * 60) : 0;
  const fixPoints = Math.max(0, 20 - (r.fixRounds || 0) * 10);
  return Math.min(100, specPoints + runPoints + testPoints + fixPoints);
}
|
||||||
|
// Laske pisteet jos puuttuvat
|
||||||
|
// Normalized run records: fill in the score, the star rating and the
// prompt-token estimate (prompt characters / 4) wherever they are missing.
// Each record is shallow-copied first so RAW itself is left unmodified.
const DATA = RAW.map((raw) => {
  const r = { ...raw };
  if (r.score == null) r.score = calcScore(r);
  // An empty stars string counts as missing and is derived from the score.
  if (!r.stars) r.stars = starsFor(r.score);
  if (!r.promptTokensEst) r.promptTokensEst = r.promptChars ? Math.round(r.promptChars / 4) : 0;
  return r;
});
|
||||||
|
/**
 * Pick the CSS class for a run: 'pass' when it has no error and every
 * test passed, 'partial' when at least one test passed, else 'fail'.
 */
const cls = (r) => {
  const ranTests = r.testsTotal > 0;
  if (!r.error && r.testsPassed === r.testsTotal && ranTests) return 'pass';
  if (ranTests && r.testsPassed > 0) return 'partial';
  return 'fail';
};
|
||||||
|
/**
 * Build the HTML for a small progress bar followed by "passed/total".
 *
 * @param {number} passed - number of passed tests
 * @param {number} total - number of tests; 0 renders a plain '-'
 * @param {number} [w=80] - bar width in pixels
 * @returns {string} HTML snippet
 */
const pctBar = (passed, total, w = 80) => {
  if (total === 0) {
    return '-';
  }
  const percent = passed / total * 100;
  // The fully-passed bar uses the fill colour; anything else is "partial".
  const fillClass = percent === 100 ? 'bar-fill' : 'bar-partial';
  const fillWidth = Math.round(percent / 100 * w);
  const inner = `<span class="bar ${fillClass}" style="width:${fillWidth}px"></span>`;
  const outer = `<span class="bar bar-bg" style="width:${w}px">${inner}</span>`;
  return `${outer} ${passed}/${total}`;
};
|
||||||
|
|
||||||
|
// Meta
|
||||||
|
const totalTime = DATA.reduce((s,r) => s + r.totalDurationMs, 0);
|
||||||
|
document.getElementById('meta').textContent = `${new Date().toLocaleDateString('fi-FI')} — ${DATA.length} ajoa — ${(totalTime/1000/60).toFixed(1)} min`;
|
||||||
|
|
||||||
|
// Cards
|
||||||
|
// Aggregate figures shown in the overview cards.
const models = Array.from(new Set(DATA.map((run) => run.model)));
const scenarios = Array.from(new Set(DATA.map((run) => run.scenario)));

// Sum one numeric field over a list of run records.
const sumField = (rows, field) => rows.reduce((acc, row) => acc + row[field], 0);

const avgScore = DATA.length ? Math.round(sumField(DATA, 'score') / DATA.length) : 0;
const totalPassed = sumField(DATA, 'testsPassed');
const totalTests = sumField(DATA, 'testsTotal');
const passRate = totalTests ? Math.round(totalPassed / totalTests * 100) : 0;

// Rank models by the rounded mean of `field` over their runs, highest
// first; the mean is stored on each entry under the key `label`.
const rankModelsBy = (field, label) => models
  .map((name) => {
    const runs = DATA.filter((run) => run.model === name);
    return { model: name, [label]: Math.round(sumField(runs, field) / runs.length) };
  })
  .sort((a, b) => b[label] - a[label]);

// Both are undefined when DATA is empty.
const bestModel = rankModelsBy('score', 'avg')[0];
const fastestModel = rankModelsBy('avgTokPerSec', 'speed')[0];
|
||||||
|
|
||||||
|
document.getElementById('cards').innerHTML = `
|
||||||
|
<div class="card"><div class="label">Keskiarvo</div><div class="value">${starsFor(avgScore)}</div><div class="sub">${avgScore} pistetta</div></div>
|
||||||
|
<div class="card"><div class="label">Testien läpäisy</div><div class="value">${passRate}%</div><div class="sub">${totalPassed}/${totalTests} testiä</div></div>
|
||||||
|
<div class="card"><div class="label">Paras malli</div><div class="value" style="font-size:1.2rem">${bestModel?.model || '-'}</div><div class="sub">${bestModel?.avg || 0}p</div></div>
|
||||||
|
<div class="card"><div class="label">Nopein</div><div class="value" style="font-size:1.2rem">${fastestModel?.model || '-'}</div><div class="sub">${fastestModel?.speed || 0} tok/s</div></div>
|
||||||
|
<div class="card"><div class="label">Malleja</div><div class="value">${models.length}</div><div class="sub">${scenarios.length} skenaariota</div></div>
|
||||||
|
<div class="card"><div class="label">Kokonaisaika</div><div class="value">${(totalTime/1000/60).toFixed(1)}</div><div class="sub">minuuttia</div></div>
|
||||||
|
`;
|
||||||
|
|
||||||
|
// Summary table
|
||||||
|
const sumHead = document.querySelector('#summary-table thead');
|
||||||
|
const sumBody = document.querySelector('#summary-table tbody');
|
||||||
|
sumHead.innerHTML = '<tr><th>Malli</th>' + scenarios.map(s => `<th>${s}</th>`).join('') + '<th>Yht.</th><th>Out tok</th><th>Aika</th><th>tok/s</th><th>Pisteet</th></tr>';
|
||||||
|
|
||||||
|
// One summary row per model, ordered by average score (highest first).
const modelRows = models
  .map((name) => {
    const runs = DATA.filter((run) => run.model === name);
    // Sum a numeric field over this model's runs.
    const total = (field) => runs.reduce((acc, run) => acc + run[field], 0);
    const speed = Math.round(total('avgTokPerSec') / runs.length);
    const avg = Math.round(total('score') / runs.length);

    // One cell per scenario; only the first matching run of a scenario is
    // shown, and a scenario the model never ran renders as '-'.
    const scenarioCells = scenarios.map((scenario) => {
      const run = runs.find((candidate) => candidate.scenario === scenario);
      if (!run) return '<td>-</td>';
      const seconds = (run.totalDurationMs / 1000).toFixed(0);
      return `<td class="${cls(run)}">${pctBar(run.testsPassed, run.testsTotal, 60)} <span style="color:var(--dim)">${seconds}s</span></td>`;
    });

    const cells = [
      `<td class="model-name">${name}</td>`,
      ...scenarioCells,
      `<td>${pctBar(total('testsPassed'), total('testsTotal'))}</td>`,
      `<td>${(total('totalTokens') / 1000).toFixed(1)}K</td>`,
      `<td>${(total('totalDurationMs') / 1000).toFixed(0)}s</td>`,
      `<td>${speed}</td>`,
      `<td><span class="stars">${starsFor(avg)}</span> ${avg}p</td>`,
    ];
    return { avg, html: `<tr>${cells.join('')}</tr>` };
  })
  .sort((a, b) => b.avg - a.avg);

sumBody.innerHTML = modelRows.map((row) => row.html).join('');
|
||||||
|
|
||||||
|
// Results table
|
||||||
|
const resHead = document.querySelector('#results-table thead');
|
||||||
|
const resBody = document.querySelector('#results-table tbody');
|
||||||
|
const resCols = ['Malli','Skenaario','Speksi','Testit','Korjaus','Ctx','Out tok','Aika','tok/s','Pisteet'];
|
||||||
|
resHead.innerHTML = '<tr>' + resCols.map((c,i) => `<th data-col="${i}">${c}</th>`).join('') + '</tr>';
|
||||||
|
|
||||||
|
let sortCol = 9, sortAsc = false;
|
||||||
|
/**
 * Re-render the body of the results table, sorted by the current
 * `sortCol` / `sortAsc` state, and refresh the sort-indicator classes on
 * the header cells.
 */
function renderResults() {
  // One key extractor per column, in header order. The extractor for the
  // active column is picked once per render instead of building all ten
  // value pairs again for every comparison.
  const sortKeys = [
    (r) => r.model,
    (r) => r.scenario,
    (r) => r.specEntities,
    // Pass ratio; Math.max guards against division by zero.
    (r) => r.testsPassed / Math.max(r.testsTotal, 1),
    (r) => r.fixRounds,
    (r) => r.promptTokensEst,
    (r) => r.totalTokens,
    (r) => r.totalDurationMs,
    (r) => r.avgTokPerSec,
    (r) => r.score,
  ];
  const keyOf = sortKeys[sortCol];

  // Sort a copy so DATA keeps its original order.
  const sorted = [...DATA].sort((a, b) => {
    const left = keyOf(a);
    const right = keyOf(b);
    const cmp = typeof left === 'string' ? left.localeCompare(right) : left - right;
    return sortAsc ? cmp : -cmp;
  });

  resBody.innerHTML = sorted.map((r) => {
    const c = cls(r);
    return `<tr>
      <td class="model-name">${r.model}</td>
      <td>${r.scenario}</td>
      <td>${r.specOk ? `✓ ${r.specEntities}e` : '<span class="fail">✗</span>'}</td>
      <td class="${c}">${pctBar(r.testsPassed, r.testsTotal)}</td>
      <td>${r.fixRounds > 0 ? `${r.fixRounds}×` : '-'}</td>
      <td>${r.promptTokensEst > 0 ? `~${(r.promptTokensEst / 1000).toFixed(1)}K` : '-'}</td>
      <td>${r.totalTokens > 0 ? `${(r.totalTokens / 1000).toFixed(1)}K` : '-'}</td>
      <td>${(r.totalDurationMs / 1000).toFixed(0)}s</td>
      <td>${r.avgTokPerSec.toFixed(0)}</td>
      <td><span class="stars">${r.stars}</span> ${r.score}p</td>
    </tr>`;
  }).join('');

  // Mark only the active column header with its sort direction.
  document.querySelectorAll('#results-table th').forEach((th, i) => {
    th.className = i === sortCol ? (sortAsc ? 'sorted-asc' : 'sorted-desc') : '';
  });
}
|
||||||
|
// Header click toggles sorting: clicking the active column flips the
// direction, clicking any other column selects it, descending first.
document.querySelector('#results-table thead').addEventListener('click', (e) => {
  // Resolve the header cell even when a nested element inside it was hit.
  const th = e.target.closest('th');
  if (!th) return;
  // Explicit radix and the non-coercing NaN check; a header cell without
  // data-col yields NaN and is ignored.
  const col = Number.parseInt(th.dataset.col, 10);
  if (Number.isNaN(col)) return;
  if (sortCol === col) {
    sortAsc = !sortAsc;
  } else {
    sortCol = col;
    sortAsc = false;
  }
  renderResults();
});
|
||||||
|
renderResults();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
62
kipina-codebench/results/2026-04-14T09-47.json
Normal file
62
kipina-codebench/results/2026-04-14T09-47.json
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 8,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 2,
|
||||||
|
"totalDurationMs": 97470,
|
||||||
|
"totalTokens": 8786,
|
||||||
|
"avgTokPerSec": 97.96636139685832,
|
||||||
|
"promptChars": 11290,
|
||||||
|
"promptTokensEst": 2823,
|
||||||
|
"score": 65,
|
||||||
|
"stars": "★★★☆☆",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 18951,
|
||||||
|
"totalTokens": 1666,
|
||||||
|
"avgTokPerSec": 101.807593927545,
|
||||||
|
"promptChars": 10293,
|
||||||
|
"promptTokensEst": 2573,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 126005,
|
||||||
|
"totalTokens": 11056,
|
||||||
|
"avgTokPerSec": 96.6373549161171,
|
||||||
|
"promptChars": 11878,
|
||||||
|
"promptTokensEst": 2970,
|
||||||
|
"score": 20,
|
||||||
|
"stars": "★☆☆☆☆",
|
||||||
|
"error": "Syntaksivirhe"
|
||||||
|
}
|
||||||
|
]
|
||||||
183
kipina-codebench/results/2026-04-14T09-52.html
Normal file
183
kipina-codebench/results/2026-04-14T09-52.html
Normal file
File diff suppressed because one or more lines are too long
947
kipina-codebench/results/2026-04-14T09-52.json
Normal file
947
kipina-codebench/results/2026-04-14T09-52.json
Normal file
@@ -0,0 +1,947 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 25444,
|
||||||
|
"totalTokens": 2661,
|
||||||
|
"avgTokPerSec": 122.06801173056196,
|
||||||
|
"promptChars": 11849,
|
||||||
|
"promptTokensEst": 2962,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 24447,
|
||||||
|
"totalTokens": 2537,
|
||||||
|
"avgTokPerSec": 121.11837170891442,
|
||||||
|
"promptChars": 11045,
|
||||||
|
"promptTokensEst": 2761,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 11,
|
||||||
|
"testsPassed": 11,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 38071,
|
||||||
|
"totalTokens": 3965,
|
||||||
|
"avgTokPerSec": 120.37309655579647,
|
||||||
|
"promptChars": 12702,
|
||||||
|
"promptTokensEst": 3176,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 38459,
|
||||||
|
"totalTokens": 2106,
|
||||||
|
"avgTokPerSec": 60.889088461567745,
|
||||||
|
"promptChars": 10951,
|
||||||
|
"promptTokensEst": 2738,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 35959,
|
||||||
|
"totalTokens": 1966,
|
||||||
|
"avgTokPerSec": 60.9684885562545,
|
||||||
|
"promptChars": 10698,
|
||||||
|
"promptTokensEst": 2675,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 13,
|
||||||
|
"testsPassed": 2,
|
||||||
|
"testsFailed": 11,
|
||||||
|
"totalDurationMs": 269370,
|
||||||
|
"totalTokens": 14361,
|
||||||
|
"avgTokPerSec": 57.79069860126629,
|
||||||
|
"promptChars": 11838,
|
||||||
|
"promptTokensEst": 2960,
|
||||||
|
"score": 29,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 23199,
|
||||||
|
"totalTokens": 2054,
|
||||||
|
"avgTokPerSec": 101.09280595816365,
|
||||||
|
"promptChars": 10854,
|
||||||
|
"promptTokensEst": 2714,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 1,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 72665,
|
||||||
|
"totalTokens": 6586,
|
||||||
|
"avgTokPerSec": 99.40636298490288,
|
||||||
|
"promptChars": 10157,
|
||||||
|
"promptTokensEst": 2539,
|
||||||
|
"score": 20,
|
||||||
|
"stars": "★☆☆☆☆",
|
||||||
|
"error": "Syntaksivirhe",
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 136309,
|
||||||
|
"totalTokens": 12036,
|
||||||
|
"avgTokPerSec": 97.02525169408467,
|
||||||
|
"promptChars": 10823,
|
||||||
|
"promptTokensEst": 2706,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "☆☆☆☆☆",
|
||||||
|
"error": "Testit kaatuivat",
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 28177,
|
||||||
|
"totalTokens": 2946,
|
||||||
|
"avgTokPerSec": 121.23541038097,
|
||||||
|
"promptChars": 11836,
|
||||||
|
"promptTokensEst": 2959,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 8,
|
||||||
|
"testsPassed": 8,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 22631,
|
||||||
|
"totalTokens": 2352,
|
||||||
|
"avgTokPerSec": 121.93930190168658,
|
||||||
|
"promptChars": 10440,
|
||||||
|
"promptTokensEst": 2610,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 12,
|
||||||
|
"testsPassed": 12,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 40394,
|
||||||
|
"totalTokens": 4225,
|
||||||
|
"avgTokPerSec": 120.84107397324551,
|
||||||
|
"promptChars": 12362,
|
||||||
|
"promptTokensEst": 3091,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 7,
|
||||||
|
"testsPassed": 7,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 46081,
|
||||||
|
"totalTokens": 2542,
|
||||||
|
"avgTokPerSec": 60.93046828700026,
|
||||||
|
"promptChars": 11412,
|
||||||
|
"promptTokensEst": 2853,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 7,
|
||||||
|
"testsPassed": 7,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 41323,
|
||||||
|
"totalTokens": 2272,
|
||||||
|
"avgTokPerSec": 60.99406174164295,
|
||||||
|
"promptChars": 10884,
|
||||||
|
"promptTokensEst": 2721,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 14,
|
||||||
|
"testsPassed": 2,
|
||||||
|
"testsFailed": 12,
|
||||||
|
"totalDurationMs": 262591,
|
||||||
|
"totalTokens": 14129,
|
||||||
|
"avgTokPerSec": 57.91340837830759,
|
||||||
|
"promptChars": 12143,
|
||||||
|
"promptTokensEst": 3036,
|
||||||
|
"score": 29,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 24007,
|
||||||
|
"totalTokens": 2137,
|
||||||
|
"avgTokPerSec": 101.05982103292858,
|
||||||
|
"promptChars": 10756,
|
||||||
|
"promptTokensEst": 2689,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 7,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 68739,
|
||||||
|
"totalTokens": 6199,
|
||||||
|
"avgTokPerSec": 98.9825675198183,
|
||||||
|
"promptChars": 10313,
|
||||||
|
"promptTokensEst": 2578,
|
||||||
|
"score": 71,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": false,
|
||||||
|
"specEntities": 0,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 0,
|
||||||
|
"totalTokens": 0,
|
||||||
|
"avgTokPerSec": 0,
|
||||||
|
"promptChars": 0,
|
||||||
|
"promptTokensEst": 0,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "",
|
||||||
|
"error": "JSON-speksi epäonnistui",
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 23472,
|
||||||
|
"totalTokens": 2427,
|
||||||
|
"avgTokPerSec": 120.85293828875076,
|
||||||
|
"promptChars": 11663,
|
||||||
|
"promptTokensEst": 2916,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 8,
|
||||||
|
"testsPassed": 8,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 25864,
|
||||||
|
"totalTokens": 2671,
|
||||||
|
"avgTokPerSec": 120.6883137195962,
|
||||||
|
"promptChars": 11148,
|
||||||
|
"promptTokensEst": 2787,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 12,
|
||||||
|
"testsPassed": 12,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 41074,
|
||||||
|
"totalTokens": 4275,
|
||||||
|
"avgTokPerSec": 120.33351485161673,
|
||||||
|
"promptChars": 12664,
|
||||||
|
"promptTokensEst": 3166,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 7,
|
||||||
|
"testsPassed": 7,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 40457,
|
||||||
|
"totalTokens": 2229,
|
||||||
|
"avgTokPerSec": 61.093615619948345,
|
||||||
|
"promptChars": 10905,
|
||||||
|
"promptTokensEst": 2726,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 1,
|
||||||
|
"testsTotal": 7,
|
||||||
|
"testsPassed": 7,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 77506,
|
||||||
|
"totalTokens": 4268,
|
||||||
|
"avgTokPerSec": 60.19655522627278,
|
||||||
|
"promptChars": 11135,
|
||||||
|
"promptTokensEst": 2784,
|
||||||
|
"score": 90,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 12,
|
||||||
|
"testsPassed": 12,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 74791,
|
||||||
|
"totalTokens": 3590,
|
||||||
|
"avgTokPerSec": 60.549298891176214,
|
||||||
|
"promptChars": 11653,
|
||||||
|
"promptTokensEst": 2913,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 26402,
|
||||||
|
"totalTokens": 2358,
|
||||||
|
"avgTokPerSec": 100.76936895480246,
|
||||||
|
"promptChars": 11243,
|
||||||
|
"promptTokensEst": 2811,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 20751,
|
||||||
|
"totalTokens": 1837,
|
||||||
|
"avgTokPerSec": 101.05480893032836,
|
||||||
|
"promptChars": 10553,
|
||||||
|
"promptTokensEst": 2638,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": false,
|
||||||
|
"specEntities": 0,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 0,
|
||||||
|
"totalTokens": 0,
|
||||||
|
"avgTokPerSec": 0,
|
||||||
|
"promptChars": 0,
|
||||||
|
"promptTokensEst": 0,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "",
|
||||||
|
"error": "JSON-speksi epäonnistui",
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 22098,
|
||||||
|
"totalTokens": 2283,
|
||||||
|
"avgTokPerSec": 121.81254413612446,
|
||||||
|
"promptChars": 11503,
|
||||||
|
"promptTokensEst": 2876,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 2,
|
||||||
|
"testsTotal": 8,
|
||||||
|
"testsPassed": 8,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 65403,
|
||||||
|
"totalTokens": 6779,
|
||||||
|
"avgTokPerSec": 118.13288294758586,
|
||||||
|
"promptChars": 10939,
|
||||||
|
"promptTokensEst": 2735,
|
||||||
|
"score": 80,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 10,
|
||||||
|
"testsPassed": 10,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 36044,
|
||||||
|
"totalTokens": 3748,
|
||||||
|
"avgTokPerSec": 120.14822967005487,
|
||||||
|
"promptChars": 12639,
|
||||||
|
"promptTokensEst": 3160,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 38501,
|
||||||
|
"totalTokens": 2113,
|
||||||
|
"avgTokPerSec": 61.01814139430428,
|
||||||
|
"promptChars": 10929,
|
||||||
|
"promptTokensEst": 2732,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 8,
|
||||||
|
"testsPassed": 1,
|
||||||
|
"testsFailed": 7,
|
||||||
|
"totalDurationMs": 147057,
|
||||||
|
"totalTokens": 7799,
|
||||||
|
"avgTokPerSec": 56.209406465865904,
|
||||||
|
"promptChars": 11207,
|
||||||
|
"promptTokensEst": 2802,
|
||||||
|
"score": 28,
|
||||||
|
"stars": "★★☆☆☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 12,
|
||||||
|
"testsPassed": 12,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 227508,
|
||||||
|
"totalTokens": 12026,
|
||||||
|
"avgTokPerSec": 58.52888492610325,
|
||||||
|
"promptChars": 11809,
|
||||||
|
"promptTokensEst": 2952,
|
||||||
|
"score": 80,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 11,
|
||||||
|
"testsPassed": 11,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 131964,
|
||||||
|
"totalTokens": 11403,
|
||||||
|
"avgTokPerSec": 97.10963264920952,
|
||||||
|
"promptChars": 11786,
|
||||||
|
"promptTokensEst": 2947,
|
||||||
|
"score": 80,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 38820,
|
||||||
|
"totalTokens": 1826,
|
||||||
|
"avgTokPerSec": 101.07773707712924,
|
||||||
|
"promptChars": 10568,
|
||||||
|
"promptTokensEst": 2642,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": false,
|
||||||
|
"specEntities": 0,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 0,
|
||||||
|
"totalTokens": 0,
|
||||||
|
"avgTokPerSec": 0,
|
||||||
|
"promptChars": 0,
|
||||||
|
"promptTokensEst": 0,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "",
|
||||||
|
"error": "JSON-speksi epäonnistui",
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 1,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 39797,
|
||||||
|
"totalTokens": 3776,
|
||||||
|
"avgTokPerSec": 120.91801837211113,
|
||||||
|
"promptChars": 11435,
|
||||||
|
"promptTokensEst": 2859,
|
||||||
|
"score": 90,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 9,
|
||||||
|
"testsPassed": 8,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 87836,
|
||||||
|
"totalTokens": 9343,
|
||||||
|
"avgTokPerSec": 119.28783662683314,
|
||||||
|
"promptChars": 10718,
|
||||||
|
"promptTokensEst": 2680,
|
||||||
|
"score": 73,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3-coder:30b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 10,
|
||||||
|
"testsPassed": 10,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 36644,
|
||||||
|
"totalTokens": 3897,
|
||||||
|
"avgTokPerSec": 122.28607796191666,
|
||||||
|
"promptChars": 12598,
|
||||||
|
"promptTokensEst": 3150,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 1,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 127532,
|
||||||
|
"totalTokens": 3919,
|
||||||
|
"avgTokPerSec": 34.13133325491828,
|
||||||
|
"promptChars": 11352,
|
||||||
|
"promptTokensEst": 2838,
|
||||||
|
"score": 90,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 8,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 2,
|
||||||
|
"totalDurationMs": 217365,
|
||||||
|
"totalTokens": 7764,
|
||||||
|
"avgTokPerSec": 38.67613170588518,
|
||||||
|
"promptChars": 10834,
|
||||||
|
"promptTokensEst": 2709,
|
||||||
|
"score": 65,
|
||||||
|
"stars": "★★★☆☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:14b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 14,
|
||||||
|
"testsPassed": 7,
|
||||||
|
"testsFailed": 7,
|
||||||
|
"totalDurationMs": 248311,
|
||||||
|
"totalTokens": 13443,
|
||||||
|
"avgTokPerSec": 58.05680015263308,
|
||||||
|
"promptChars": 12219,
|
||||||
|
"promptTokensEst": 3055,
|
||||||
|
"score": 50,
|
||||||
|
"stars": "★★★☆☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 38326,
|
||||||
|
"totalTokens": 2079,
|
||||||
|
"avgTokPerSec": 100.89778087504016,
|
||||||
|
"promptChars": 10908,
|
||||||
|
"promptTokensEst": 2727,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 60823,
|
||||||
|
"totalTokens": 1772,
|
||||||
|
"avgTokPerSec": 96.76383996716295,
|
||||||
|
"promptChars": 10378,
|
||||||
|
"promptTokensEst": 2595,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 11,
|
||||||
|
"testsPassed": 11,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 81654,
|
||||||
|
"totalTokens": 3458,
|
||||||
|
"avgTokPerSec": 95.65675360193613,
|
||||||
|
"promptChars": 11914,
|
||||||
|
"promptTokensEst": 2979,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
}
|
||||||
|
]
|
||||||
183
kipina-codebench/results/2026-04-14T10-03.html
Normal file
183
kipina-codebench/results/2026-04-14T10-03.html
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="fi">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Kipina Model Benchmark</title>
|
||||||
|
<style>
|
||||||
|
:root { --bg: #0d1117; --card: #161b22; --border: #30363d; --text: #e6edf3; --dim: #8b949e; --green: #3fb950; --yellow: #d29922; --red: #f85149; --blue: #58a6ff; }
|
||||||
|
* { box-sizing: border-box; margin: 0; padding: 0; }
|
||||||
|
body { font-family: -apple-system, 'Segoe UI', Helvetica, Arial, sans-serif; background: var(--bg); color: var(--text); padding: 2rem; max-width: 1400px; margin: 0 auto; }
|
||||||
|
h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
|
||||||
|
.meta { color: var(--dim); font-size: 0.85rem; margin-bottom: 2rem; }
|
||||||
|
.cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
|
||||||
|
.card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 1rem; }
|
||||||
|
.card .label { color: var(--dim); font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
|
||||||
|
.card .value { font-size: 1.8rem; font-weight: 600; margin-top: 0.25rem; }
|
||||||
|
.card .sub { color: var(--dim); font-size: 0.8rem; margin-top: 0.25rem; }
|
||||||
|
table { width: 100%; border-collapse: collapse; background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; margin-bottom: 2rem; }
|
||||||
|
th { background: #1c2128; text-align: left; padding: 0.6rem 0.8rem; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--dim); cursor: pointer; user-select: none; white-space: nowrap; }
|
||||||
|
th:hover { color: var(--text); }
|
||||||
|
th.sorted-asc::after { content: ' ▲'; }
|
||||||
|
th.sorted-desc::after { content: ' ▼'; }
|
||||||
|
td { padding: 0.5rem 0.8rem; border-top: 1px solid var(--border); font-size: 0.85rem; white-space: nowrap; }
|
||||||
|
tr:hover td { background: #1c2128; }
|
||||||
|
.pass { color: var(--green); }
|
||||||
|
.partial { color: var(--yellow); }
|
||||||
|
.fail { color: var(--red); }
|
||||||
|
.stars { letter-spacing: 1px; }
|
||||||
|
.bar { display: inline-block; height: 8px; border-radius: 4px; vertical-align: middle; }
|
||||||
|
.bar-bg { background: var(--border); }
|
||||||
|
.bar-fill { background: var(--green); }
|
||||||
|
.bar-partial { background: var(--yellow); }
|
||||||
|
.model-name { font-weight: 600; }
|
||||||
|
h2 { font-size: 1.1rem; margin-bottom: 1rem; color: var(--dim); }
|
||||||
|
.summary-table th:first-child, .summary-table td:first-child { min-width: 200px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>Kipina Model Benchmark</h1>
|
||||||
|
<div class="meta" id="meta"></div>
|
||||||
|
|
||||||
|
<div class="cards" id="cards"></div>
|
||||||
|
|
||||||
|
<h2>Mallikohtainen yhteenveto</h2>
|
||||||
|
<table class="summary-table" id="summary-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<h2>Kaikki tulokset</h2>
|
||||||
|
<table id="results-table"><thead></thead><tbody></tbody></table>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Benchmark result records that drive every card and table below.
// This snapshot of the report embeds an empty array, so the page renders without rows.
const RAW = [];
|
||||||
|
|
||||||
|
// Convert a numeric score into a five-character star rating.
// Thresholds: >=90 five stars, >=70 four, >=50 three, >=25 two, >0 one, otherwise none.
const starsFor = (score) => {
  if (score >= 90) return '★★★★★';
  if (score >= 70) return '★★★★☆';
  if (score >= 50) return '★★★☆☆';
  if (score >= 25) return '★★☆☆☆';
  if (score > 0) return '★☆☆☆☆';
  return '☆☆☆☆☆';
};
|
||||||
|
/**
 * Compute a 0-100 score for a single benchmark run.
 *
 * Point breakdown:
 *   - 10 p when the spec was produced (`specOk`)
 *   - 10 p when the run got as far as producing a result (no error, or tests ran anyway)
 *   - up to 60 p proportional to the share of passed tests
 *   - 20 p bonus reduced by 10 p per fix round (never below 0)
 *
 * Missing or non-numeric counters are treated as 0 and the pass ratio is
 * clamped to 1, so the result is always a finite number.
 *
 * @param {object} r - Result record with `error`, `specOk`, `testsTotal`,
 *   `testsPassed` and `fixRounds`.
 * @returns {number} Score in the range 0..100.
 */
function calcScore(r) {
  // Coerce a counter to a non-negative finite number; anything else becomes 0.
  const count = (value) => {
    const n = Number(value);
    return Number.isFinite(n) && n > 0 ? n : 0;
  };

  const testsTotal = count(r.testsTotal);
  const testsPassed = Math.min(count(r.testsPassed), testsTotal);
  const fixRounds = count(r.fixRounds);

  // A run that failed before any test executed earns nothing.
  if (r.error && testsTotal === 0) return 0;

  let score = 0;
  if (r.specOk) score += 10;
  // Past the guard above, either there is no error or tests did run,
  // so the "completed" points are always granted here.
  score += 10;
  if (testsTotal > 0) score += Math.round((testsPassed / testsTotal) * 60);
  score += Math.max(0, 20 - fixRounds * 10);
  return Math.min(100, score);
}
|
||||||
|
// Fill in derived fields that a record is missing: score, star rating and
// the prompt-token estimate (roughly 4 characters per token).
// Each record is copied rather than modified, so RAW stays untouched.
const DATA = RAW.map((r) => {
  const score = r.score == null ? calcScore(r) : r.score;
  const stars = r.stars || starsFor(score);
  const promptTokensEst = r.promptTokensEst
    || (r.promptChars ? Math.round(r.promptChars / 4) : 0);
  return { ...r, score, stars, promptTokensEst };
});
|
||||||
|
// CSS class for a result: 'pass' when every test passed without an error,
// 'partial' when at least one test passed, otherwise 'fail'.
const cls = (r) => {
  const ranTests = r.testsTotal > 0;
  if (!r.error && r.testsPassed === r.testsTotal && ranTests) {
    return 'pass';
  }
  if (ranTests && r.testsPassed > 0) {
    return 'partial';
  }
  return 'fail';
};
|
||||||
|
/**
 * Render a small horizontal progress bar followed by a "passed/total" label.
 *
 * @param {number} passed - Number of passed tests.
 * @param {number} total - Number of tests; a missing, non-finite or
 *   non-positive value renders as '-'.
 * @param {number} [w=80] - Bar width in pixels.
 * @returns {string} HTML snippet, or '-' when there is nothing to show.
 */
const pctBar = (passed, total, w = 80) => {
  if (!Number.isFinite(total) || total <= 0) return '-';
  const done = Number.isFinite(passed) ? passed : 0;
  // Clamp so an inconsistent record can never draw outside the bar background.
  const pct = Math.min(100, Math.max(0, (done / total) * 100));
  const fillClass = pct === 100 ? 'bar-fill' : 'bar-partial';
  const fillWidth = Math.round((pct / 100) * w);
  return `<span class="bar bar-bg" style="width:${w}px"><span class="bar ${fillClass}" style="width:${fillWidth}px"></span></span> ${done}/${total}`;
};
|
||||||
|
|
||||||
|
// Meta: one-line header with date, number of runs and total wall-clock time.
// Records without a duration count as 0 ms so the total never becomes NaN.
const totalTime = DATA.reduce((sum, r) => sum + (r.totalDurationMs || 0), 0);
// NOTE(review): the date is taken when the page is opened, not when the
// benchmark ran — confirm whether the run timestamp should be embedded instead.
document.getElementById('meta').textContent = `${new Date().toLocaleDateString('fi-FI')} — ${DATA.length} ajoa — ${(totalTime / 1000 / 60).toFixed(1)} min`;
|
||||||
|
|
||||||
|
// Cards: aggregate statistics shown in the summary cards at the top of the page.
const models = [...new Set(DATA.map((r) => r.model))];
const scenarios = [...new Set(DATA.map((r) => r.scenario))];
const avgScore = DATA.length
  ? Math.round(DATA.reduce((sum, r) => sum + (r.score || 0), 0) / DATA.length)
  : 0;
const totalPassed = DATA.reduce((sum, r) => sum + (r.testsPassed || 0), 0);
const totalTests = DATA.reduce((sum, r) => sum + (r.testsTotal || 0), 0);
const passRate = totalTests ? Math.round((totalPassed / totalTests) * 100) : 0;

// Rank models by the rounded mean of one metric and return the leader as
// `{ model, [outKey]: mean }`, or undefined when there are no models.
// Missing metric values count as 0 so the comparison never sees NaN.
const topModelBy = (outKey, pick) => models
  .map((model) => {
    const runs = DATA.filter((r) => r.model === model);
    const mean = Math.round(runs.reduce((sum, r) => sum + (pick(r) || 0), 0) / runs.length);
    return { model, [outKey]: mean };
  })
  .sort((a, b) => b[outKey] - a[outKey])[0];

const bestModel = topModelBy('avg', (r) => r.score);
const fastestModel = topModelBy('speed', (r) => r.avgTokPerSec);
|
||||||
|
|
||||||
|
// Render the six summary cards. Optional chaining covers the empty-data case,
// where there is no best or fastest model to show.
document.getElementById('cards').innerHTML = `
<div class="card"><div class="label">Keskiarvo</div><div class="value">${starsFor(avgScore)}</div><div class="sub">${avgScore} pistettä</div></div>
<div class="card"><div class="label">Testien läpäisy</div><div class="value">${passRate}%</div><div class="sub">${totalPassed}/${totalTests} testiä</div></div>
<div class="card"><div class="label">Paras malli</div><div class="value" style="font-size:1.2rem">${bestModel?.model || '-'}</div><div class="sub">${bestModel?.avg || 0}p</div></div>
<div class="card"><div class="label">Nopein</div><div class="value" style="font-size:1.2rem">${fastestModel?.model || '-'}</div><div class="sub">${fastestModel?.speed || 0} tok/s</div></div>
<div class="card"><div class="label">Malleja</div><div class="value">${models.length}</div><div class="sub">${scenarios.length} skenaariota</div></div>
<div class="card"><div class="label">Kokonaisaika</div><div class="value">${(totalTime / 1000 / 60).toFixed(1)}</div><div class="sub">minuuttia</div></div>
`;
|
||||||
|
|
||||||
|
// Summary table: one row per model, one column per scenario, best average first.
const sumHead = document.querySelector('#summary-table thead');
const sumBody = document.querySelector('#summary-table tbody');
sumHead.innerHTML = '<tr><th>Malli</th>' + scenarios.map((s) => `<th>${s}</th>`).join('') + '<th>Yht.</th><th>Out tok</th><th>Aika</th><th>tok/s</th><th>Pisteet</th></tr>';

const modelRows = models.map((m) => {
  // Sum one numeric field over a set of runs; missing values count as 0.
  const sumOf = (rows, key) => rows.reduce((acc, r) => acc + (r[key] || 0), 0);

  const mrs = DATA.filter((r) => r.model === m);
  const tp = sumOf(mrs, 'testsPassed');
  const tt = sumOf(mrs, 'testsTotal');
  const tok = sumOf(mrs, 'totalTokens');
  const time = sumOf(mrs, 'totalDurationMs');
  const speed = Math.round(sumOf(mrs, 'avgTokPerSec') / mrs.length);
  const avg = Math.round(sumOf(mrs, 'score') / mrs.length);

  const scenCols = scenarios.map((s) => {
    // A model can have several rounds per scenario; combine all of them so the
    // cell agrees with the totals column instead of showing only the first round.
    const runs = mrs.filter((r) => r.scenario === s);
    if (runs.length === 0) return '<td>-</td>';
    const combined = {
      error: runs.find((r) => r.error)?.error ?? null,
      testsPassed: sumOf(runs, 'testsPassed'),
      testsTotal: sumOf(runs, 'testsTotal'),
      totalDurationMs: sumOf(runs, 'totalDurationMs'),
    };
    return `<td class="${cls(combined)}">${pctBar(combined.testsPassed, combined.testsTotal, 60)} <span style="color:var(--dim)">${(combined.totalDurationMs / 1000).toFixed(0)}s</span></td>`;
  }).join('');

  return { avg, html: `<tr><td class="model-name">${m}</td>${scenCols}<td>${pctBar(tp, tt)}</td><td>${(tok / 1000).toFixed(1)}K</td><td>${(time / 1000).toFixed(0)}s</td><td>${speed}</td><td><span class="stars">${starsFor(avg)}</span> ${avg}p</td></tr>` };
}).sort((a, b) => b.avg - a.avg);
sumBody.innerHTML = modelRows.map((row) => row.html).join('');
|
||||||
|
|
||||||
|
// Results table: header cells carry their column index in data-col so the
// click handler can tell which column to sort by.
const resHead = document.querySelector('#results-table thead');
const resBody = document.querySelector('#results-table tbody');
const resCols = ['Malli', 'Skenaario', 'Speksi', 'Testit', 'Korjaus', 'Ctx', 'Out tok', 'Aika', 'tok/s', 'Pisteet'];
resHead.innerHTML = '<tr>' + resCols.map((c, i) => `<th data-col="${i}">${c}</th>`).join('') + '</tr>';

// Current sort state: column 9 is 'Pisteet' (score), highest first.
let sortCol = 9;
let sortAsc = false;
|
||||||
|
/**
 * Re-render the body of the results table using the current sort state
 * (`sortCol`, `sortAsc`) and update the sort indicator on the header cells.
 */
function renderResults() {
  // One key extractor per column, in the same order as the header cells.
  // Built once per render instead of once per comparison.
  const sortKeys = [
    (r) => r.model,
    (r) => r.scenario,
    (r) => r.specEntities,
    (r) => r.testsPassed / Math.max(r.testsTotal, 1),
    (r) => r.fixRounds,
    (r) => r.promptTokensEst,
    (r) => r.totalTokens,
    (r) => r.totalDurationMs,
    (r) => r.avgTokPerSec,
    (r) => r.score,
  ];
  const keyOf = sortKeys[sortCol];

  // Sort a copy so DATA keeps its original order.
  const sorted = [...DATA].sort((a, b) => {
    const x = keyOf(a);
    const y = keyOf(b);
    const cmp = typeof x === 'string' ? x.localeCompare(y) : x - y;
    return sortAsc ? cmp : -cmp;
  });

  resBody.innerHTML = sorted.map((r) => {
    const c = cls(r);
    // Missing numeric fields render as 0 instead of throwing on .toFixed().
    const seconds = ((r.totalDurationMs || 0) / 1000).toFixed(0);
    const tokPerSec = (r.avgTokPerSec || 0).toFixed(0);
    return `<tr>
<td class="model-name">${r.model}</td>
<td>${r.scenario}</td>
<td>${r.specOk ? `✓ ${r.specEntities}e` : '<span class="fail">✗</span>'}</td>
<td class="${c}">${pctBar(r.testsPassed, r.testsTotal)}</td>
<td>${r.fixRounds > 0 ? r.fixRounds + '×' : '-'}</td>
<td>${r.promptTokensEst > 0 ? '~' + (r.promptTokensEst / 1000).toFixed(1) + 'K' : '-'}</td>
<td>${r.totalTokens > 0 ? (r.totalTokens / 1000).toFixed(1) + 'K' : '-'}</td>
<td>${seconds}s</td>
<td>${tokPerSec}</td>
<td><span class="stars">${r.stars}</span> ${r.score}p</td>
</tr>`;
  }).join('');

  // Mark the active sort column; the CSS draws the arrow through ::after.
  document.querySelectorAll('#results-table th').forEach((th, i) => {
    th.className = i === sortCol ? (sortAsc ? 'sorted-asc' : 'sorted-desc') : '';
  });
}
|
||||||
|
// Clicking a header cell sorts by that column; clicking the active column
// again flips the direction. A newly chosen column starts descending.
document.querySelector('#results-table thead').addEventListener('click', (e) => {
  // Resolve the header cell even if the click landed on a child element.
  const th = e.target.closest('th');
  const col = Number.parseInt(th?.dataset.col, 10);
  if (Number.isNaN(col)) return;
  if (sortCol === col) {
    sortAsc = !sortAsc;
  } else {
    sortCol = col;
    sortAsc = false;
  }
  renderResults();
});

// Initial render with the default sort.
renderResults();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
1
kipina-codebench/results/2026-04-14T10-03.json
Normal file
1
kipina-codebench/results/2026-04-14T10-03.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
183
kipina-codebench/results/2026-04-14T10-31.html
Normal file
183
kipina-codebench/results/2026-04-14T10-31.html
Normal file
File diff suppressed because one or more lines are too long
317
kipina-codebench/results/2026-04-14T10-31.json
Normal file
317
kipina-codebench/results/2026-04-14T10-31.json
Normal file
@@ -0,0 +1,317 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 97527,
|
||||||
|
"totalTokens": 2228,
|
||||||
|
"avgTokPerSec": 100.69171830800946,
|
||||||
|
"promptChars": 11566,
|
||||||
|
"promptTokensEst": 2892,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 7,
|
||||||
|
"testsPassed": 7,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 39549,
|
||||||
|
"totalTokens": 1960,
|
||||||
|
"avgTokPerSec": 100.98265593129491,
|
||||||
|
"promptChars": 11073,
|
||||||
|
"promptTokensEst": 2768,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": false,
|
||||||
|
"specEntities": 0,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 0,
|
||||||
|
"totalTokens": 0,
|
||||||
|
"avgTokPerSec": 0,
|
||||||
|
"promptChars": 0,
|
||||||
|
"promptTokensEst": 0,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "",
|
||||||
|
"error": "JSON-speksi epäonnistui",
|
||||||
|
"round": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 131339,
|
||||||
|
"totalTokens": 11518,
|
||||||
|
"avgTokPerSec": 96.52358107464266,
|
||||||
|
"promptChars": 12388,
|
||||||
|
"promptTokensEst": 3097,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "☆☆☆☆☆",
|
||||||
|
"error": "Testit kaatuivat",
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 20658,
|
||||||
|
"totalTokens": 1808,
|
||||||
|
"avgTokPerSec": 101.0081173861862,
|
||||||
|
"promptChars": 11057,
|
||||||
|
"promptTokensEst": 2764,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": false,
|
||||||
|
"specEntities": 0,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 0,
|
||||||
|
"totalTokens": 0,
|
||||||
|
"avgTokPerSec": 0,
|
||||||
|
"promptChars": 0,
|
||||||
|
"promptTokensEst": 0,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "",
|
||||||
|
"error": "JSON-speksi epäonnistui",
|
||||||
|
"round": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 1,
|
||||||
|
"fixRounds": 5,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 320031,
|
||||||
|
"totalTokens": 11985,
|
||||||
|
"avgTokPerSec": 54.915025374575386,
|
||||||
|
"promptChars": 12517,
|
||||||
|
"promptTokensEst": 3129,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "☆☆☆☆☆",
|
||||||
|
"error": "Testit kaatuivat",
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 7,
|
||||||
|
"testsPassed": 7,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 28654,
|
||||||
|
"totalTokens": 1877,
|
||||||
|
"avgTokPerSec": 100.70920643946336,
|
||||||
|
"promptChars": 10747,
|
||||||
|
"promptTokensEst": 2687,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": false,
|
||||||
|
"specEntities": 0,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 0,
|
||||||
|
"testsPassed": 0,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 0,
|
||||||
|
"totalTokens": 0,
|
||||||
|
"avgTokPerSec": 0,
|
||||||
|
"promptChars": 0,
|
||||||
|
"promptTokensEst": 0,
|
||||||
|
"score": 0,
|
||||||
|
"stars": "",
|
||||||
|
"error": "JSON-speksi epäonnistui",
|
||||||
|
"round": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 1,
|
||||||
|
"testsTotal": 12,
|
||||||
|
"testsPassed": 12,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 67943,
|
||||||
|
"totalTokens": 6002,
|
||||||
|
"avgTokPerSec": 98.29436788902672,
|
||||||
|
"promptChars": 12389,
|
||||||
|
"promptTokensEst": 3097,
|
||||||
|
"score": 90,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 20203,
|
||||||
|
"totalTokens": 1774,
|
||||||
|
"avgTokPerSec": 100.9066297884274,
|
||||||
|
"promptChars": 10905,
|
||||||
|
"promptTokensEst": 2726,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 13,
|
||||||
|
"testsPassed": 12,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 148491,
|
||||||
|
"totalTokens": 12747,
|
||||||
|
"avgTokPerSec": 95.18237885727869,
|
||||||
|
"promptChars": 12476,
|
||||||
|
"promptTokensEst": 3119,
|
||||||
|
"score": 75,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "todo",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 0,
|
||||||
|
"testsTotal": 6,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 0,
|
||||||
|
"totalDurationMs": 23830,
|
||||||
|
"totalTokens": 2102,
|
||||||
|
"avgTokPerSec": 100.641489789061,
|
||||||
|
"promptChars": 11404,
|
||||||
|
"promptTokensEst": 2851,
|
||||||
|
"score": 100,
|
||||||
|
"stars": "★★★★★",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "users",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 1,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 8,
|
||||||
|
"testsPassed": 6,
|
||||||
|
"testsFailed": 2,
|
||||||
|
"totalDurationMs": 122453,
|
||||||
|
"totalTokens": 7285,
|
||||||
|
"avgTokPerSec": 94.12482830400619,
|
||||||
|
"promptChars": 11400,
|
||||||
|
"promptTokensEst": 2850,
|
||||||
|
"score": 65,
|
||||||
|
"stars": "★★★☆☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "qwen3:8b",
|
||||||
|
"scenario": "blog",
|
||||||
|
"reqOk": true,
|
||||||
|
"specOk": true,
|
||||||
|
"specEntities": 2,
|
||||||
|
"validationIssues": 0,
|
||||||
|
"fixRounds": 3,
|
||||||
|
"testsTotal": 11,
|
||||||
|
"testsPassed": 10,
|
||||||
|
"testsFailed": 1,
|
||||||
|
"totalDurationMs": 147125,
|
||||||
|
"totalTokens": 9893,
|
||||||
|
"avgTokPerSec": 97.37021605085566,
|
||||||
|
"promptChars": 12455,
|
||||||
|
"promptTokensEst": 3114,
|
||||||
|
"score": 75,
|
||||||
|
"stars": "★★★★☆",
|
||||||
|
"error": null,
|
||||||
|
"round": 5
|
||||||
|
}
|
||||||
|
]
|
||||||
Reference in New Issue
Block a user