Benchmark: pisteytys (0-100) ja tähtiluokitus tuloksissa
Pisteytys: speksi 10p + koodi 10p + testit 60p + korjaukset 20p. Tähdet: ★★★★★ (90+), ★★★★☆ (70+), ★★★☆☆ (50+), ★★☆☆☆ (25+), ★☆☆☆☆ (1+). Näkyy per-ajo rivillä, tulostaulussa ja yhteenvedossa.
This commit is contained in:
@@ -273,6 +273,7 @@ async function runPipeline(model, scenario) {
|
||||
testsTotal: 0, testsPassed: 0, testsFailed: 0,
|
||||
totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
|
||||
promptChars: 0, promptTokensEst: 0,
|
||||
score: 0, stars: '',
|
||||
error: null,
|
||||
};
|
||||
const timings = [];
|
||||
@@ -376,10 +377,35 @@ async function runPipeline(model, scenario) {
|
||||
result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
|
||||
result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
|
||||
result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
|
||||
result.score = scoreResult(result);
|
||||
result.stars = starsForScore(result.score);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// === Pisteytys (0–100) ja tähtiluokitus ===
|
||||
/**
 * Computes the benchmark score (0–100) for a single pipeline run.
 *
 * Breakdown: spec 10p + code 10p + test pass ratio 60p + fix-round bonus 20p.
 *
 * @param {object} r - Run result. Reads `error`, `specOk`, `testsTotal`,
 *   `testsPassed` and `fixRounds`.
 * @returns {number} Score capped to the range 0..100; never NaN.
 */
function scoreResult(r) {
  // A run that errored before any test was executed earns nothing.
  if (r.error && r.testsTotal === 0) return 0;

  let score = 0;

  // Spec succeeded (10p)
  if (r.specOk) score += 10;

  // Code generated (10p)
  if (!r.error || r.testsTotal > 0) score += 10;

  // Test pass rate (60p). The ratio is clamped to [0, 1] and a non-numeric
  // value counts as 0, so a malformed result cannot yield NaN or exceed 60p.
  if (r.testsTotal > 0) {
    const rawRatio = r.testsPassed / r.testsTotal;
    const passRatio = Number.isFinite(rawRatio) ? Math.min(1, Math.max(0, rawRatio)) : 0;
    score += Math.round(passRatio * 60);
  }

  // Fix rounds (20p: 0 rounds = 20, 1 round = 10, 2 or more = 0).
  // Math.max(0, NaN) is NaN, so a missing/non-numeric fixRounds is treated as
  // 0 rounds, and a negative count is clamped so the bonus never exceeds 20p.
  const rawRounds = Number(r.fixRounds);
  const fixRounds = Number.isFinite(rawRounds) ? Math.max(0, rawRounds) : 0;
  score += Math.max(0, 20 - fixRounds * 10);

  return Math.min(100, score);
}
|
||||
/**
 * Maps a numeric score to a five-character star rating string.
 *
 * Thresholds: 90+ five stars, 70+ four, 50+ three, 25+ two, anything above
 * zero one star; otherwise five empty stars.
 *
 * @param {number} score - Score to classify.
 * @returns {string} Rating made of filled (★) and empty (☆) stars.
 */
function starsForScore(score) {
  // Ordered from the highest threshold down; the first match wins.
  const tiers = [
    [90, '★★★★★'],
    [70, '★★★★☆'],
    [50, '★★★☆☆'],
    [25, '★★☆☆☆'],
  ];

  for (const [minimum, rating] of tiers) {
    if (score >= minimum) {
      return rating;
    }
  }

  // Below the lowest tier: any positive score still earns one star.
  return score > 0 ? '★☆☆☆☆' : '☆☆☆☆☆';
}
|
||||
|
||||
// === Main ===
|
||||
async function main() {
|
||||
console.log('╔══════════════════════════════════════════════╗');
|
||||
@@ -424,7 +450,7 @@ async function main() {
|
||||
r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` :
|
||||
`◐ ${r.testsPassed}/${r.testsTotal}`;
|
||||
const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
|
||||
console.log(` → ${status} | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
|
||||
console.log(` → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -442,7 +468,7 @@ async function main() {
|
||||
'Ctx'.padEnd(7),
|
||||
'Aika'.padEnd(8),
|
||||
'tok/s'.padEnd(8),
|
||||
'Tulos',
|
||||
'Pisteet',
|
||||
].join(' │ ');
|
||||
console.log(`║ ${header} ║`);
|
||||
console.log('╠' + '═'.repeat(header.length + 2) + '╣');
|
||||
@@ -454,8 +480,6 @@ async function main() {
|
||||
const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
|
||||
const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
|
||||
const speed = `${r.avgTokPerSec.toFixed(0)}`;
|
||||
const verdict = r.error ? '✗ FAIL' : r.testsPassed === r.testsTotal && r.testsTotal > 0 ? '✓ PASS' : '◐ PARTIAL';
|
||||
|
||||
const row = [
|
||||
r.model.padEnd(40),
|
||||
r.scenario.padEnd(10),
|
||||
@@ -465,7 +489,7 @@ async function main() {
|
||||
ctx.padEnd(7),
|
||||
time.padEnd(8),
|
||||
speed.padEnd(8),
|
||||
verdict,
|
||||
`${r.stars} ${r.score}`,
|
||||
].join(' │ ');
|
||||
console.log(`║ ${row} ║`);
|
||||
}
|
||||
@@ -479,7 +503,8 @@ async function main() {
|
||||
const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
|
||||
const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
|
||||
const failed = results.filter(r => r.error || r.testsTotal === 0);
|
||||
console.log(`\n✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length}`);
|
||||
const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
|
||||
console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length}`);
|
||||
}
|
||||
|
||||
main().catch(e => { console.error(e); process.exit(1); });
|
||||
|
||||
Reference in New Issue
Block a user