From 9f2899b83da426cbcb53080e6f99549cbfb5d2c1 Mon Sep 17 00:00:00 2001 From: jaakko Date: Tue, 14 Apr 2026 08:10:27 +0300 Subject: [PATCH] =?UTF-8?q?Benchmark:=20pisteytys=20(0-100)=20ja=20t=C3=A4?= =?UTF-8?q?htiluokitus=20tuloksissa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pisteytys: speksi 10p + koodi 10p + testit 60p + korjaukset 20p. Tähdet: ★★★★★ (90+), ★★★★☆ (70+), ★★★☆☆ (50+), ★★☆☆☆ (25+), ★☆☆☆☆ (1+). Näkyy per-ajo rivillä, tulostaulussa ja yhteenvedossa. --- network-poc/tests/model-benchmark.mjs | 37 ++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/network-poc/tests/model-benchmark.mjs b/network-poc/tests/model-benchmark.mjs index 7d99763..3e4dbea 100644 --- a/network-poc/tests/model-benchmark.mjs +++ b/network-poc/tests/model-benchmark.mjs @@ -273,6 +273,7 @@ async function runPipeline(model, scenario) { testsTotal: 0, testsPassed: 0, testsFailed: 0, totalDurationMs: 0, totalTokens: 0, avgTokPerSec: 0, promptChars: 0, promptTokensEst: 0, + score: 0, stars: '', error: null, }; const timings = []; @@ -376,10 +377,35 @@ async function runPipeline(model, scenario) { result.totalDurationMs = timings.reduce((s, t) => s + t.durationMs, 0); result.totalTokens = timings.reduce((s, t) => s + t.tokens, 0); result.avgTokPerSec = timings.length > 0 ? 
// === Scoring (0–100) and star rating ===

/**
 * Compute a 0–100 benchmark score for one pipeline run.
 *
 * Point budget: spec 10 + code 10 + tests 60 + fix rounds 20.
 *
 * @param {object} r - Run result. Fields read here: `specOk`, `error`,
 *   `testsTotal`, `testsPassed`, `fixRounds`.
 * @returns {number} Integer score in the range 0–100 (never NaN).
 */
function scoreResult(r) {
  // Treat a missing or non-finite test count as "no tests ran".
  const testsTotal = Number.isFinite(r.testsTotal) ? Math.max(0, r.testsTotal) : 0;
  const ranTests = testsTotal > 0;

  // A run that errored before any test executed earns nothing.
  if (r.error && !ranTests) return 0;

  let score = 0;

  // Spec step succeeded (10p).
  if (r.specOk) score += 10;

  // Code was generated (10p). Past the guard above, either no error was
  // recorded or tests actually ran, so this part is always earned here.
  score += 10;

  // Test pass rate (60p). The passed count is clamped into [0, testsTotal]
  // so a bad counter can neither exceed the 60-point budget nor yield NaN.
  if (ranTests) {
    const passed = Number.isFinite(r.testsPassed)
      ? Math.min(Math.max(r.testsPassed, 0), testsTotal)
      : 0;
    score += Math.round((passed / testsTotal) * 60);
  }

  // Fix rounds (20p: 0 rounds = 20, 1 round = 10, 2 or more = 0).
  // A missing counter counts as zero rounds; an unparseable one earns no
  // bonus. The bonus is clamped to [0, 20] so a negative counter cannot
  // inflate the score.
  const fixRounds = r.fixRounds == null ? 0 : Number(r.fixRounds);
  if (Number.isFinite(fixRounds)) {
    score += Math.min(20, Math.max(0, 20 - fixRounds * 10));
  }

  return Math.min(100, score);
}

/**
 * Map a numeric score to a five-character star rating.
 *
 * Thresholds: 90+ = 5 stars, 70+ = 4, 50+ = 3, 25+ = 2, anything above 0 = 1,
 * otherwise 0 stars. NaN matches no threshold and yields 0 stars.
 *
 * @param {number} score - Score, normally 0–100.
 * @returns {string} Exactly five characters made of '★' and '☆'.
 */
function starsForScore(score) {
  // [minimum score, filled stars], checked from the highest tier down.
  const tiers = [
    [90, 5],
    [70, 4],
    [50, 3],
    [25, 2],
  ];
  const tier = tiers.find(([minScore]) => score >= minScore);

  let filled = 0;
  if (tier) {
    filled = tier[1];
  } else if (score > 0) {
    filled = 1;
  }
  return '★'.repeat(filled) + '☆'.repeat(5 - filled);
}
`~${(r.promptTokensEst/1000).toFixed(1)}K` : '-'; const time = `${(r.totalDurationMs/1000).toFixed(0)}s`; const speed = `${r.avgTokPerSec.toFixed(0)}`; - const verdict = r.error ? '✗ FAIL' : r.testsPassed === r.testsTotal && r.testsTotal > 0 ? '✓ PASS' : '◐ PARTIAL'; - const row = [ r.model.padEnd(40), r.scenario.padEnd(10), @@ -465,7 +489,7 @@ async function main() { ctx.padEnd(7), time.padEnd(8), speed.padEnd(8), - verdict, + `${r.stars} ${r.score}`, ].join(' │ '); console.log(`║ ${row} ║`); } @@ -479,7 +503,8 @@ async function main() { const passed = results.filter(r => !r.error && r.testsPassed === r.testsTotal && r.testsTotal > 0); const partial = results.filter(r => !r.error && r.testsPassed < r.testsTotal && r.testsTotal > 0); const failed = results.filter(r => r.error || r.testsTotal === 0); - console.log(`\n✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length}`); + const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0; + console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length}`); } main().catch(e => { console.error(e); process.exit(1); });