Benchmark: pisteytys (0-100) ja tähtiluokitus tuloksissa
Pisteytys: speksi 10p + koodi 10p + testit 60p + korjaukset 20p. Tähdet: ★★★★★ (90+), ★★★★☆ (70+), ★★★☆☆ (50+), ★★☆☆☆ (25+), ★☆☆☆☆ (1+). Näkyy per-ajo rivillä, tulostaulussa ja yhteenvedossa.
This commit is contained in:
@@ -273,6 +273,7 @@ async function runPipeline(model, scenario) {
|
|||||||
testsTotal: 0, testsPassed: 0, testsFailed: 0,
|
testsTotal: 0, testsPassed: 0, testsFailed: 0,
|
||||||
totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
|
totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0,
|
||||||
promptChars: 0, promptTokensEst: 0,
|
promptChars: 0, promptTokensEst: 0,
|
||||||
|
score: 0, stars: '',
|
||||||
error: null,
|
error: null,
|
||||||
};
|
};
|
||||||
const timings = [];
|
const timings = [];
|
||||||
@@ -376,10 +377,35 @@ async function runPipeline(model, scenario) {
|
|||||||
result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
|
result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0);
|
||||||
result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
|
result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0);
|
||||||
result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
|
result.avgTokPerSec = timings.length > 0 ? timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0;
|
||||||
|
result.score = scoreResult(result);
|
||||||
|
result.stars = starsForScore(result.score);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// === Pisteytys (0–100) ja tähtiluokitus ===
|
||||||
|
/**
 * Compute the benchmark score for a single pipeline run.
 *
 * Breakdown: spec succeeded 10p + code generated 10p + test pass ratio up to
 * 60p + fix-round bonus up to 20p (0 rounds = 20, 1 round = 10, 2 or more = 0).
 *
 * @param {object} r - Run result. Reads `error`, `specOk`, `testsTotal`,
 *   `testsPassed` and `fixRounds`.
 * @returns {number} Score clamped to the range 0–100.
 */
function scoreResult(r) {
  // A run that errored before any test was counted earns nothing.
  if (r.error && r.testsTotal === 0) return 0;

  let score = 0;

  // Spec succeeded (10p)
  if (r.specOk) score += 10;

  // Code generated (10p)
  if (!r.error || r.testsTotal > 0) score += 10;

  // Test pass ratio (60p). A missing or non-numeric testsPassed would make the
  // ratio NaN and poison the whole score, so such a ratio contributes nothing.
  if (r.testsTotal > 0) {
    const passRatio = r.testsPassed / r.testsTotal;
    if (Number.isFinite(passRatio)) score += Math.round(passRatio * 60);
  }

  // Fix rounds (20p: 0 rounds = 20, 1 round = 10, 2 or more = 0).
  // A missing or non-numeric count is treated as 0 rounds instead of yielding
  // NaN, and a negative count cannot push the bonus above 20.
  const rawRounds = Number(r.fixRounds);
  const fixRounds = Number.isFinite(rawRounds) ? Math.max(0, rawRounds) : 0;
  score += Math.max(0, 20 - fixRounds * 10);

  return Math.max(0, Math.min(100, score));
}
|
||||||
|
/**
 * Map a numeric score to a five-character star rating string.
 *
 * Tiers: 90+ five stars, 70+ four, 50+ three, 25+ two, anything above 0 one
 * star, otherwise an all-empty rating.
 *
 * @param {number} score - Score to classify.
 * @returns {string} Rating made of filled and empty star characters.
 */
function starsForScore(score) {
  // Ordered from the highest threshold down; the first match wins.
  const tiers = [
    [90, '★★★★★'],
    [70, '★★★★☆'],
    [50, '★★★☆☆'],
    [25, '★★☆☆☆'],
  ];

  for (const [minimum, rating] of tiers) {
    if (score >= minimum) return rating;
  }

  return score > 0 ? '★☆☆☆☆' : '☆☆☆☆☆';
}
|
||||||
|
|
||||||
// === Main ===
|
// === Main ===
|
||||||
async function main() {
|
async function main() {
|
||||||
console.log('╔══════════════════════════════════════════════╗');
|
console.log('╔══════════════════════════════════════════════╗');
|
||||||
@@ -424,7 +450,7 @@ async function main() {
|
|||||||
r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` :
|
r.testsPassed === r.testsTotal && r.testsTotal > 0 ? `✓ ${r.testsPassed}/${r.testsTotal}` :
|
||||||
`◐ ${r.testsPassed}/${r.testsTotal}`;
|
`◐ ${r.testsPassed}/${r.testsTotal}`;
|
||||||
const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
|
const ctxInfo = r.promptTokensEst > 0 ? ` | ctx ~${(r.promptTokensEst/1000).toFixed(1)}K` : '';
|
||||||
console.log(` → ${status} | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
|
console.log(` → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -442,7 +468,7 @@ async function main() {
|
|||||||
'Ctx'.padEnd(7),
|
'Ctx'.padEnd(7),
|
||||||
'Aika'.padEnd(8),
|
'Aika'.padEnd(8),
|
||||||
'tok/s'.padEnd(8),
|
'tok/s'.padEnd(8),
|
||||||
'Tulos',
|
'Pisteet',
|
||||||
].join(' │ ');
|
].join(' │ ');
|
||||||
console.log(`║ ${header} ║`);
|
console.log(`║ ${header} ║`);
|
||||||
console.log('╠' + '═'.repeat(header.length + 2) + '╣');
|
console.log('╠' + '═'.repeat(header.length + 2) + '╣');
|
||||||
@@ -454,8 +480,6 @@ async function main() {
|
|||||||
const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
|
const ctx = r.promptTokensEst > 0 ? `~${(r.promptTokensEst/1000).toFixed(1)}K` : '-';
|
||||||
const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
|
const time = `${(r.totalDurationMs/1000).toFixed(0)}s`;
|
||||||
const speed = `${r.avgTokPerSec.toFixed(0)}`;
|
const speed = `${r.avgTokPerSec.toFixed(0)}`;
|
||||||
const verdict = r.error ? '✗ FAIL' : r.testsPassed === r.testsTotal && r.testsTotal > 0 ? '✓ PASS' : '◐ PARTIAL';
|
|
||||||
|
|
||||||
const row = [
|
const row = [
|
||||||
r.model.padEnd(40),
|
r.model.padEnd(40),
|
||||||
r.scenario.padEnd(10),
|
r.scenario.padEnd(10),
|
||||||
@@ -465,7 +489,7 @@ async function main() {
|
|||||||
ctx.padEnd(7),
|
ctx.padEnd(7),
|
||||||
time.padEnd(8),
|
time.padEnd(8),
|
||||||
speed.padEnd(8),
|
speed.padEnd(8),
|
||||||
verdict,
|
`${r.stars} ${r.score}`,
|
||||||
].join(' │ ');
|
].join(' │ ');
|
||||||
console.log(`║ ${row} ║`);
|
console.log(`║ ${row} ║`);
|
||||||
}
|
}
|
||||||
@@ -479,7 +503,8 @@ async function main() {
|
|||||||
const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
|
const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0);
|
||||||
const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
|
const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0);
|
||||||
const failed = results.filter(r => r.error || r.testsTotal === 0);
|
const failed = results.filter(r => r.error || r.testsTotal === 0);
|
||||||
console.log(`\n✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length}`);
|
const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
|
||||||
|
console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
main().catch(e => { console.error(e); process.exit(1); });
|
// Script entry point: run main(); if it rejects, log the error and exit with status 1.
main().catch(e => { console.error(e); process.exit(1); });
|
||||||
|
|||||||
Reference in New Issue
Block a user