CodeBench: --rounds N toistaa testiajot 1-10 kertaa
Kierrosyhteenveto näyttää mediaanin, min/max ja pass-raten per kierros. Käyttö: node benchmark.mjs --models qwen3:14b --scenarios all --rounds 3
This commit is contained in:
@@ -35,6 +35,7 @@ const RESULTS_DIR = join(__dirname, 'results');
|
|||||||
const THINK_MODE = args.includes('--think');
|
const THINK_MODE = args.includes('--think');
|
||||||
const COMPACT_MODE = args.includes('--compact');
|
const COMPACT_MODE = args.includes('--compact');
|
||||||
const LANG = arg('lang', 'python'); // python | rust
|
const LANG = arg('lang', 'python'); // python | rust
|
||||||
|
// Number of times the whole benchmark run is repeated (--rounds N, 1-10 repetitions).
// The value is parsed as base 10 and clamped to the documented 1-10 range; a missing
// or non-numeric value falls back to 1 so the round loop never silently runs zero times.
const ROUNDS = Math.min(10, Math.max(1, Number.parseInt(arg('rounds', '1'), 10) || 1));
|
||||||
const MAX_FIX_ROUNDS = 2;
|
const MAX_FIX_ROUNDS = 2;
|
||||||
|
|
||||||
// === Promptien lataus tiedostoista ===
|
// === Promptien lataus tiedostoista ===
|
||||||
@@ -404,6 +405,7 @@ async function main() {
|
|||||||
|
|
||||||
const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS : [SCENARIOS[0]];
|
const scenarios = SCENARIO_FILTER === 'all' ? SCENARIOS : [SCENARIOS[0]];
|
||||||
console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
|
console.log(`Skenaariot (${scenarios.length}): ${scenarios.map(s => s.id).join(', ')}`);
|
||||||
|
if (ROUNDS > 1) console.log(`Toistoja: ${ROUNDS}`);
|
||||||
console.log(`Tulokset: ${OUTPUT_DIR}/`);
|
console.log(`Tulokset: ${OUTPUT_DIR}/`);
|
||||||
console.log('');
|
console.log('');
|
||||||
|
|
||||||
@@ -413,10 +415,14 @@ async function main() {
|
|||||||
|
|
||||||
const results = [];
|
const results = [];
|
||||||
|
|
||||||
|
for (let round = 1; round <= ROUNDS; round++) {
|
||||||
|
if (ROUNDS > 1) console.log(`\n╔═══ Kierros ${round}/${ROUNDS} ═══╗`);
|
||||||
for (const model of models) {
|
for (const model of models) {
|
||||||
for (const scenario of scenarios) {
|
for (const scenario of scenarios) {
|
||||||
console.log(`\n━━━ ${model} × ${scenario.id} ━━━`);
|
const roundLabel = ROUNDS > 1 ? ` [${round}/${ROUNDS}]` : '';
|
||||||
|
console.log(`\n━━━ ${model} × ${scenario.id}${roundLabel} ━━━`);
|
||||||
const r = await runPipeline(model, scenario);
|
const r = await runPipeline(model, scenario);
|
||||||
|
if (ROUNDS > 1) r.round = round;
|
||||||
results.push(r);
|
results.push(r);
|
||||||
|
|
||||||
const status = r.error ? `✗ ${r.error}` :
|
const status = r.error ? `✗ ${r.error}` :
|
||||||
@@ -426,6 +432,7 @@ async function main() {
|
|||||||
console.log(` → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
|
console.log(` → ${status} | ${r.stars} ${r.score}p | ${(r.totalDurationMs/1000).toFixed(1)}s | ${r.totalTokens} tok | ${r.avgTokPerSec.toFixed(1)} tok/s${ctxInfo}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} // rounds
|
||||||
|
|
||||||
// === Tulostaulu ===
|
// === Tulostaulu ===
|
||||||
console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
|
console.log('\n\n╔══════════════════════════════════════════════════════════════════════════════════════════════════╗');
|
||||||
@@ -540,6 +547,28 @@ async function main() {
|
|||||||
const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
|
const avgScore = results.length > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length) : 0;
|
||||||
const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0);
|
const totalTime = results.reduce((s, r) => s + r.totalDurationMs, 0);
|
||||||
console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`);
|
console.log(`\n${starsForScore(avgScore)} Keskiarvo: ${avgScore}p | ✓ PASS: ${passed.length} | ◐ PARTIAL: ${partial.length} | ✗ FAIL: ${failed.length} | Yhteensä: ${results.length} | Kokonaisaika: ${(totalTime/1000/60).toFixed(1)} min`);
|
||||||
|
|
||||||
|
// === Kierrosyhteenveto (kun rounds > 1) ===
|
||||||
|
if (ROUNDS > 1) {
|
||||||
|
console.log('\n\n╔══════════════════════════════════════════════╗');
|
||||||
|
console.log('║ KIERROSYHTEENVETO (mediaani) ║');
|
||||||
|
console.log('╚══════════════════════════════════════════════╝\n');
|
||||||
|
// Median of a list of numbers, computed on a sorted copy so the caller's array
// is left untouched. For an even number of values the two middle values are
// averaged and the result is rounded to the nearest integer.
const median = (values) => {
  const ordered = Array.from(values);
  ordered.sort((x, y) => x - y);
  const count = ordered.length;
  const upper = Math.floor(count / 2);
  if (count % 2 === 1) {
    return ordered[upper];
  }
  return Math.round((ordered[upper - 1] + ordered[upper]) / 2);
};
|
||||||
|
|
||||||
|
for (const model of modelNames) {
|
||||||
|
const mrs = results.filter(r => r.model === model);
|
||||||
|
for (const sid of scenarioIds) {
|
||||||
|
const runs = mrs.filter(r => r.scenario === sid);
|
||||||
|
if (runs.length === 0) continue;
|
||||||
|
const scores = runs.map(r => r.score);
|
||||||
|
const med = median(scores);
|
||||||
|
const min = Math.min(...scores);
|
||||||
|
const max = Math.max(...scores);
|
||||||
|
const passRates = runs.map(r => r.testsTotal > 0 ? Math.round(r.testsPassed/r.testsTotal*100) : 0);
|
||||||
|
console.log(`${model.padEnd(30)} ${sid.padEnd(8)} ${starsForScore(med)} med:${med}p min:${min} max:${max} pass:[${passRates.join(',')}]%`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
main().catch(e => { console.error(e); process.exit(1); });
|
main().catch(e => { console.error(e); process.exit(1); });
|
||||||
|
|||||||
Reference in New Issue
Block a user