Benchmark: pisteytys (0-100) ja tähtiluokitus tuloksissa

Pisteytys: speksi 10p + koodi 10p + testit 60p + korjaukset 20p.
Tähdet: ★★★★★ (90+), ★★★★☆ (70+), ★★★☆☆ (50+), ★★☆☆☆ (25+), ★☆☆☆☆ (1+).
Näkyy per-ajo rivillä, tulostaulussa ja yhteenvedossa.
This commit is contained in:
2026-04-14 08:10:27 +03:00
parent 4a811e4171
commit 9f2899b83d

View File

@@ -273,6 +273,7 @@ async function runPipeline(model, scenario) {
testsTotal: 0, testsPassed: 0, testsFailed: 0,
totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
promptChars: 0, promptTokensEst: 0,
score: 0, stars: '',
error: null,
};
const timings = [];
@@ -376,10 +377,35 @@ async function runPipeline(model, scenario) {
result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
result.score = scoreResult(result);
result.stars = starsForScore(result.score);
return result;
}
// === Pisteytys (0–100) ja tähtiluokitus ===
/**
 * Compute the benchmark score for a single pipeline run.
 *
 * Breakdown: spec succeeded 10p + code generated 10p + test pass ratio 60p
 * + fix-round bonus 20p (0 rounds = 20, 1 round = 10, 2 or more = 0).
 *
 * Malformed inputs are sanitised instead of poisoning the result with NaN:
 * non-finite or negative test counters count as 0, `testsPassed` is capped at
 * `testsTotal`, negative `fixRounds` counts as 0 rounds, and a missing or
 * non-numeric `fixRounds` earns no fix-round bonus.
 *
 * @param {object} r - Run result; reads `error`, `specOk`, `testsTotal`,
 *   `testsPassed` and `fixRounds`.
 * @returns {number} Finite score clamped to the range 0..100.
 */
function scoreResult(r) {
  const testsTotal = Number.isFinite(r.testsTotal) ? Math.max(0, r.testsTotal) : 0;
  const testsPassed = Number.isFinite(r.testsPassed)
    ? Math.min(Math.max(0, r.testsPassed), testsTotal)
    : 0;

  // A run that errored before any test executed earns nothing.
  if (r.error && testsTotal === 0) return 0;

  let score = 0;

  // Spec succeeded (10p).
  if (r.specOk) score += 10;

  // Code generated (10p). Past the guard above, either there was no error or
  // tests actually ran, so this component is always earned.
  score += 10;

  // Test pass ratio (60p).
  if (testsTotal > 0) score += Math.round((testsPassed / testsTotal) * 60);

  // Fix rounds (20p: 0 rounds = 20, 1 round = 10, 2+ rounds = 0).
  if (Number.isFinite(r.fixRounds)) {
    const rounds = Math.max(0, r.fixRounds);
    score += Math.max(0, 20 - rounds * 10);
  }

  return Math.min(100, score);
}
/**
 * Map a numeric score to a five-character star rating string.
 *
 * Thresholds: 90+ five stars, 70+ four, 50+ three, 25+ two, anything above
 * zero one star, otherwise five empty stars.
 *
 * @param {number} score - Score to classify.
 * @returns {string} Star rating made of filled and empty star characters.
 */
function starsForScore(score) {
  // Ordered from the highest threshold down; the first match wins.
  const tiers = [
    [90, '★★★★★'],
    [70, '★★★★☆'],
    [50, '★★★☆☆'],
    [25, '★★☆☆☆'],
  ];
  for (const [minimum, rating] of tiers) {
    if (score >= minimum) return rating;
  }
  return score > 0 ? '★☆☆☆☆' : '☆☆☆☆☆';
}
// === Main ===
async function main() {
console.log('╔══════════════════════════════════════════════╗');
@@ -424,7 +450,7 @@ async function main() {
r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `${r.testsPassed}/${r.testsTotal}` :
`${r.testsPassed}/${r.testsTotal}`;
const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
console.log(`${status} | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
console.log(`${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
}
}
@@ -442,7 +468,7 @@ async function main() {
'Ctx'.padEnd(7),
'Aika'.padEnd(8),
'tok/s'.padEnd(8),
'Tulos',
'Pisteet',
].join(' │ ');
console.log(`${header}`);
console.log('╠' + '═'.repeat(header.length + 2) + '╣');
@@ -454,8 +480,6 @@ async function main() {
const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
const speed = `${r.avgTokPerSec.toFixed(0)}`;
const verdict = r.error ? '✗ FAIL' : r.testsPassed === r.testsTotal && r.testsTotal > 0 ? '✓ PASS' : '◐ PARTIAL';
const row = [
r.model.padEnd(40),
r.scenario.padEnd(10),
@@ -465,7 +489,7 @@ async function main() {
ctx.padEnd(7),
time.padEnd(8),
speed.padEnd(8),
verdict,
`${r.stars} ${r.score}`,
].join(' │ ');
console.log(`${row}`);
}
@@ -479,7 +503,8 @@ async function main() {
const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
const failed = results.filter(r => r.error || r.testsTotal === 0);
console.log(`\n✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length}`);
const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length}`);
}
// Entry point: run the benchmark; if it rejects, log the failure and exit with status 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});