diff --git a/kipina-codebench/benchmark.mjs b/kipina-codebench/benchmark.mjs
index f49da1e..ef451dc 100644
--- a/kipina-codebench/benchmark.mjs
+++ b/kipina-codebench/benchmark.mjs
@@ -46,9 +46,40 @@ function loadPrompt(name) {
 }
 const CLIENT_SYSTEM = loadPrompt('client');
 const SPEC_SYSTEM = loadPrompt('spec');
-const CODE_SYSTEM = loadPrompt(LANG === 'rust' ? 'code-rs' : 'code');
 const FIX_SYSTEM = loadPrompt('fix');
 
+// === Model-specific profiles ===
+const PROFILES = JSON.parse(readFileSync(join(__dirname, 'profiles.json'), 'utf-8'));
+
+/**
+ * Pick the code-generation system prompt for a model, driven by profiles.json.
+ *
+ * The prompt name comes from the model's own entry, else from its profile
+ * (the default profile for unlisted models), else 'code'. Prompt files are
+ * then tried most-specific first: the language-specific variant of that prompt,
+ * the prompt itself, the language-specific generic prompt, the generic prompt.
+ *
+ * @param {string} model - Model identifier, used as a key in the "models" section of profiles.json.
+ * @returns {{ system: string, promptName: string, profile: string }} The prompt text,
+ *   the name of the prompt file actually used, and the resolved profile name.
+ */
+function getCodePromptForModel(model) {
+  const modelConf = PROFILES.models?.[model];
+  const profile = modelConf?.profile || PROFILES.default_profile;
+  const promptName = modelConf?.prompt || PROFILES.profiles?.[profile]?.prompt || 'code';
+  const suffix = LANG === 'rust' ? '-rs' : '';
+  // Try the language-specific prompt first (code-small-rs), then the base one (code-small).
+  // The Set drops duplicates (e.g. promptName 'code' with an empty suffix) so no file is probed twice.
+  const candidates = [...new Set([`${promptName}${suffix}`, promptName, `code${suffix}`, 'code'])];
+  for (const name of candidates) {
+    const path = join(__dirname, 'prompts', `${name}.md`);
+    if (existsSync(path)) return { system: readFileSync(path, 'utf-8').trim(), promptName: name, profile };
+  }
+  // No candidate file was found: hand over to loadPrompt('code'), but keep reporting
+  // the profile resolved above rather than a hard-coded 'large'.
+  return { system: loadPrompt('code'), promptName: 'code', profile };
+}
+
 // === Kultaisten esimerkkien lataus (kielen mukaan) ===
 const GOLDEN_DIR = join(__dirname, 'golden-examples');
 const LANG_CONFIG = {
@@ -281,6 +312,7 @@ async function runPipeline(model, scenario) {
     error: null,
   };
   const timings = [];
+  const { system: CODE_SYSTEM, promptName, profile } = getCodePromptForModel(model);
   const dir = `${OUTPUT_DIR}/${model.replace(/[/:]/g, '_')}__${scenario.id}`;
   mkdirSync(dir, { recursive: true });
 
@@ -444,6 +476,8 @@ async function runPipeline(model, scenario) {
   result.avgTokPerSec = timings.length > 0 ?
timings.reduce((s, t) => s + t.tokPerSec, 0) / timings.length : 0; result.score = scoreResult(result); result.stars = starsForScore(result.score); + result.profile = profile; + result.promptName = promptName; return result; } diff --git a/kipina-codebench/profiles.json b/kipina-codebench/profiles.json new file mode 100644 index 0000000..6ed57dd --- /dev/null +++ b/kipina-codebench/profiles.json @@ -0,0 +1,47 @@ +{ + "models": { + "qwen3-coder:30b": { + "profile": "large", + "prompt": "code", + "golden": "todo.md", + "notes": "Pääkooderi. 97p, 188 tok/s. Noudattaa pitkiä sääntölistoja." + }, + "qwen3:8b": { + "profile": "small", + "prompt": "code-small", + "golden": "todo.md", + "notes": "Kevyt vaihtoehto. Todo/users 100p, blog heikko. Lyhyt prompti toimii paremmin." + }, + "qwen3:14b": { + "profile": "large", + "prompt": "code", + "golden": "todo.md", + "notes": "Poistettu käytöstä. Ei lisäarvoa 30b:hen verrattuna." + }, + "codestral:22b": { + "profile": "large", + "prompt": "code", + "golden": "todo.md", + "notes": "Mistral-varamalli. 88p, 44 tok/s." + }, + "qwen3:4b": { + "profile": "small", + "prompt": "code-small", + "golden": "todo.md", + "notes": "Minimaali. Vain todo toimii." + } + }, + "profiles": { + "large": { + "prompt": "code", + "golden": "todo.md", + "description": "Täysi prompti + säännöt. Malleille >=14B." + }, + "small": { + "prompt": "code-small", + "golden": "todo.md", + "description": "Tiivistetty prompti. Malleille <=8B." + } + }, + "default_profile": "large" +} diff --git a/kipina-codebench/prompts/code-small.md b/kipina-codebench/prompts/code-small.md new file mode 100644 index 0000000..9a17ecc --- /dev/null +++ b/kipina-codebench/prompts/code-small.md @@ -0,0 +1,22 @@ +Generate a FastAPI project with SQLAlchemy and SQLite. Follow the REFERENCE IMPLEMENTATION exactly. 
+ +Generate these 4 files with === markers: + +=== models.py === +=== schemas.py === +=== main.py === +=== test_main.py === + +Key patterns (copy from reference): +- class Base(DeclarativeBase): pass +- Mapped[str] = mapped_column(String(255)) +- Mapped[str | None] = mapped_column(Text, default=None) +- model_config = ConfigDict(from_attributes=True) +- model_dump() not dict() +- POST 201, GET list, GET by id 404, PUT, DELETE 204 + +CRITICAL: +- Use ONLY fields from the JSON spec — no created_at or extra fields +- Generate EXACTLY 6 tests per entity: create, list, get_by_id, not_found, update, delete +- No search, filter, or other extra tests +- No markdown fences in output diff --git a/kipina-codebench/results/2026-04-14T09-43.html b/kipina-codebench/results/2026-04-14T09-43.html new file mode 100644 index 0000000..e77d625 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T09-43.html @@ -0,0 +1,183 @@ + + + + + +Kipina Model Benchmark + + + + +

Kipina Model Benchmark

+
+ +
+ +

Mallikohtainen yhteenveto

+
+ +

Kaikki tulokset

+
+ + + + diff --git a/kipina-codebench/results/2026-04-14T09-43.json b/kipina-codebench/results/2026-04-14T09-43.json new file mode 100644 index 0000000..c54701c --- /dev/null +++ b/kipina-codebench/results/2026-04-14T09-43.json @@ -0,0 +1,22 @@ +[ + { + "model": "qwen3-coder:30b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 21688, + "totalTokens": 2243, + "avgTokPerSec": 121.7719614197307, + "promptChars": 11588, + "promptTokensEst": 2897, + "score": 100, + "stars": "★★★★★", + "error": null + } +] \ No newline at end of file diff --git a/kipina-codebench/results/2026-04-14T09-44.html b/kipina-codebench/results/2026-04-14T09-44.html new file mode 100644 index 0000000..b8ea684 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T09-44.html @@ -0,0 +1,183 @@ + + + + + +Kipina Model Benchmark + + + + +

Kipina Model Benchmark

+
+ +
+ +

Mallikohtainen yhteenveto

+
+ +

Kaikki tulokset

+
+ + + + diff --git a/kipina-codebench/results/2026-04-14T09-44.json b/kipina-codebench/results/2026-04-14T09-44.json new file mode 100644 index 0000000..b011487 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T09-44.json @@ -0,0 +1,62 @@ +[ + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 23521, + "totalTokens": 2090, + "avgTokPerSec": 100.94324085271073, + "promptChars": 10962, + "promptTokensEst": 2741, + "score": 100, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 1, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 33680, + "totalTokens": 3003, + "avgTokPerSec": 100.52754588753601, + "promptChars": 10171, + "promptTokensEst": 2543, + "score": 90, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui" + } +] \ No newline at end of file diff --git a/kipina-codebench/results/2026-04-14T09-47.html b/kipina-codebench/results/2026-04-14T09-47.html new file mode 100644 index 0000000..448c02e --- /dev/null +++ b/kipina-codebench/results/2026-04-14T09-47.html @@ -0,0 +1,183 @@ + + + + + +Kipina Model Benchmark + + + + +

Kipina Model Benchmark

+
+ +
+ +

Mallikohtainen yhteenveto

+
+ +

Kaikki tulokset

+
+ + + + diff --git a/kipina-codebench/results/2026-04-14T09-47.json b/kipina-codebench/results/2026-04-14T09-47.json new file mode 100644 index 0000000..0275dd6 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T09-47.json @@ -0,0 +1,62 @@ +[ + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 8, + "testsPassed": 6, + "testsFailed": 2, + "totalDurationMs": 97470, + "totalTokens": 8786, + "avgTokPerSec": 97.96636139685832, + "promptChars": 11290, + "promptTokensEst": 2823, + "score": 65, + "stars": "★★★☆☆", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 18951, + "totalTokens": 1666, + "avgTokPerSec": 101.807593927545, + "promptChars": 10293, + "promptTokensEst": 2573, + "score": 100, + "stars": "★★★★★", + "error": null + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 126005, + "totalTokens": 11056, + "avgTokPerSec": 96.6373549161171, + "promptChars": 11878, + "promptTokensEst": 2970, + "score": 20, + "stars": "★☆☆☆☆", + "error": "Syntaksivirhe" + } +] \ No newline at end of file diff --git a/kipina-codebench/results/2026-04-14T09-52.html b/kipina-codebench/results/2026-04-14T09-52.html new file mode 100644 index 0000000..562f3ec --- /dev/null +++ b/kipina-codebench/results/2026-04-14T09-52.html @@ -0,0 +1,183 @@ + + + + + +Kipina Model Benchmark + + + + +

Kipina Model Benchmark

+
+ +
+ +

Mallikohtainen yhteenveto

+
+ +

Kaikki tulokset

+
+ + + + diff --git a/kipina-codebench/results/2026-04-14T09-52.json b/kipina-codebench/results/2026-04-14T09-52.json new file mode 100644 index 0000000..4e366ef --- /dev/null +++ b/kipina-codebench/results/2026-04-14T09-52.json @@ -0,0 +1,947 @@ +[ + { + "model": "qwen3-coder:30b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 25444, + "totalTokens": 2661, + "avgTokPerSec": 122.06801173056196, + "promptChars": 11849, + "promptTokensEst": 2962, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 1 + }, + { + "model": "qwen3-coder:30b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 24447, + "totalTokens": 2537, + "avgTokPerSec": 121.11837170891442, + "promptChars": 11045, + "promptTokensEst": 2761, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 1 + }, + { + "model": "qwen3-coder:30b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 11, + "testsPassed": 11, + "testsFailed": 0, + "totalDurationMs": 38071, + "totalTokens": 3965, + "avgTokPerSec": 120.37309655579647, + "promptChars": 12702, + "promptTokensEst": 3176, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 1 + }, + { + "model": "qwen3:14b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 38459, + "totalTokens": 2106, + "avgTokPerSec": 60.889088461567745, + "promptChars": 10951, + "promptTokensEst": 2738, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 1 + }, + { + "model": "qwen3:14b", + "scenario": "users", + 
"reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 35959, + "totalTokens": 1966, + "avgTokPerSec": 60.9684885562545, + "promptChars": 10698, + "promptTokensEst": 2675, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 1 + }, + { + "model": "qwen3:14b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 13, + "testsPassed": 2, + "testsFailed": 11, + "totalDurationMs": 269370, + "totalTokens": 14361, + "avgTokPerSec": 57.79069860126629, + "promptChars": 11838, + "promptTokensEst": 2960, + "score": 29, + "stars": "★★☆☆☆", + "error": null, + "round": 1 + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 23199, + "totalTokens": 2054, + "avgTokPerSec": 101.09280595816365, + "promptChars": 10854, + "promptTokensEst": 2714, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 1 + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 1, + "testsPassed": 0, + "testsFailed": 1, + "totalDurationMs": 72665, + "totalTokens": 6586, + "avgTokPerSec": 99.40636298490288, + "promptChars": 10157, + "promptTokensEst": 2539, + "score": 20, + "stars": "★☆☆☆☆", + "error": "Syntaksivirhe", + "round": 1 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 136309, + "totalTokens": 12036, + "avgTokPerSec": 97.02525169408467, + "promptChars": 10823, + "promptTokensEst": 2706, + "score": 0, + 
"stars": "☆☆☆☆☆", + "error": "Testit kaatuivat", + "round": 1 + }, + { + "model": "qwen3-coder:30b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 28177, + "totalTokens": 2946, + "avgTokPerSec": 121.23541038097, + "promptChars": 11836, + "promptTokensEst": 2959, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 2 + }, + { + "model": "qwen3-coder:30b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 8, + "testsPassed": 8, + "testsFailed": 0, + "totalDurationMs": 22631, + "totalTokens": 2352, + "avgTokPerSec": 121.93930190168658, + "promptChars": 10440, + "promptTokensEst": 2610, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 2 + }, + { + "model": "qwen3-coder:30b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 12, + "testsPassed": 12, + "testsFailed": 0, + "totalDurationMs": 40394, + "totalTokens": 4225, + "avgTokPerSec": 120.84107397324551, + "promptChars": 12362, + "promptTokensEst": 3091, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 2 + }, + { + "model": "qwen3:14b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 7, + "testsPassed": 7, + "testsFailed": 0, + "totalDurationMs": 46081, + "totalTokens": 2542, + "avgTokPerSec": 60.93046828700026, + "promptChars": 11412, + "promptTokensEst": 2853, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 2 + }, + { + "model": "qwen3:14b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 7, + "testsPassed": 7, + "testsFailed": 0, + "totalDurationMs": 41323, + 
"totalTokens": 2272, + "avgTokPerSec": 60.99406174164295, + "promptChars": 10884, + "promptTokensEst": 2721, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 2 + }, + { + "model": "qwen3:14b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 14, + "testsPassed": 2, + "testsFailed": 12, + "totalDurationMs": 262591, + "totalTokens": 14129, + "avgTokPerSec": 57.91340837830759, + "promptChars": 12143, + "promptTokensEst": 3036, + "score": 29, + "stars": "★★☆☆☆", + "error": null, + "round": 2 + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 24007, + "totalTokens": 2137, + "avgTokPerSec": 101.05982103292858, + "promptChars": 10756, + "promptTokensEst": 2689, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 2 + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 7, + "testsPassed": 6, + "testsFailed": 1, + "totalDurationMs": 68739, + "totalTokens": 6199, + "avgTokPerSec": 98.9825675198183, + "promptChars": 10313, + "promptTokensEst": 2578, + "score": 71, + "stars": "★★★★☆", + "error": null, + "round": 2 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui", + "round": 2 + }, + { + "model": "qwen3-coder:30b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + 
"testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 23472, + "totalTokens": 2427, + "avgTokPerSec": 120.85293828875076, + "promptChars": 11663, + "promptTokensEst": 2916, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 3 + }, + { + "model": "qwen3-coder:30b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 8, + "testsPassed": 8, + "testsFailed": 0, + "totalDurationMs": 25864, + "totalTokens": 2671, + "avgTokPerSec": 120.6883137195962, + "promptChars": 11148, + "promptTokensEst": 2787, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 3 + }, + { + "model": "qwen3-coder:30b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 12, + "testsPassed": 12, + "testsFailed": 0, + "totalDurationMs": 41074, + "totalTokens": 4275, + "avgTokPerSec": 120.33351485161673, + "promptChars": 12664, + "promptTokensEst": 3166, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 3 + }, + { + "model": "qwen3:14b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 7, + "testsPassed": 7, + "testsFailed": 0, + "totalDurationMs": 40457, + "totalTokens": 2229, + "avgTokPerSec": 61.093615619948345, + "promptChars": 10905, + "promptTokensEst": 2726, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 3 + }, + { + "model": "qwen3:14b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 1, + "testsTotal": 7, + "testsPassed": 7, + "testsFailed": 0, + "totalDurationMs": 77506, + "totalTokens": 4268, + "avgTokPerSec": 60.19655522627278, + "promptChars": 11135, + "promptTokensEst": 2784, + "score": 90, + "stars": "★★★★★", + "error": null, + "round": 3 + }, + { + "model": "qwen3:14b", + "scenario": 
"blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 12, + "testsPassed": 12, + "testsFailed": 0, + "totalDurationMs": 74791, + "totalTokens": 3590, + "avgTokPerSec": 60.549298891176214, + "promptChars": 11653, + "promptTokensEst": 2913, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 3 + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 26402, + "totalTokens": 2358, + "avgTokPerSec": 100.76936895480246, + "promptChars": 11243, + "promptTokensEst": 2811, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 3 + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 20751, + "totalTokens": 1837, + "avgTokPerSec": 101.05480893032836, + "promptChars": 10553, + "promptTokensEst": 2638, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 3 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui", + "round": 3 + }, + { + "model": "qwen3-coder:30b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 22098, + "totalTokens": 2283, + "avgTokPerSec": 121.81254413612446, + "promptChars": 11503, + "promptTokensEst": 2876, + "score": 100, + "stars": 
"★★★★★", + "error": null, + "round": 4 + }, + { + "model": "qwen3-coder:30b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 2, + "testsTotal": 8, + "testsPassed": 8, + "testsFailed": 0, + "totalDurationMs": 65403, + "totalTokens": 6779, + "avgTokPerSec": 118.13288294758586, + "promptChars": 10939, + "promptTokensEst": 2735, + "score": 80, + "stars": "★★★★☆", + "error": null, + "round": 4 + }, + { + "model": "qwen3-coder:30b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 10, + "testsPassed": 10, + "testsFailed": 0, + "totalDurationMs": 36044, + "totalTokens": 3748, + "avgTokPerSec": 120.14822967005487, + "promptChars": 12639, + "promptTokensEst": 3160, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 4 + }, + { + "model": "qwen3:14b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 38501, + "totalTokens": 2113, + "avgTokPerSec": 61.01814139430428, + "promptChars": 10929, + "promptTokensEst": 2732, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 4 + }, + { + "model": "qwen3:14b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 8, + "testsPassed": 1, + "testsFailed": 7, + "totalDurationMs": 147057, + "totalTokens": 7799, + "avgTokPerSec": 56.209406465865904, + "promptChars": 11207, + "promptTokensEst": 2802, + "score": 28, + "stars": "★★☆☆☆", + "error": null, + "round": 4 + }, + { + "model": "qwen3:14b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 12, + "testsPassed": 12, + "testsFailed": 0, + "totalDurationMs": 227508, + "totalTokens": 12026, + 
"avgTokPerSec": 58.52888492610325, + "promptChars": 11809, + "promptTokensEst": 2952, + "score": 80, + "stars": "★★★★☆", + "error": null, + "round": 4 + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 11, + "testsPassed": 11, + "testsFailed": 0, + "totalDurationMs": 131964, + "totalTokens": 11403, + "avgTokPerSec": 97.10963264920952, + "promptChars": 11786, + "promptTokensEst": 2947, + "score": 80, + "stars": "★★★★☆", + "error": null, + "round": 4 + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 38820, + "totalTokens": 1826, + "avgTokPerSec": 101.07773707712924, + "promptChars": 10568, + "promptTokensEst": 2642, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 4 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui", + "round": 4 + }, + { + "model": "qwen3-coder:30b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 1, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 39797, + "totalTokens": 3776, + "avgTokPerSec": 120.91801837211113, + "promptChars": 11435, + "promptTokensEst": 2859, + "score": 90, + "stars": "★★★★★", + "error": null, + "round": 5 + }, + { + "model": "qwen3-coder:30b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 9, + 
"testsPassed": 8, + "testsFailed": 1, + "totalDurationMs": 87836, + "totalTokens": 9343, + "avgTokPerSec": 119.28783662683314, + "promptChars": 10718, + "promptTokensEst": 2680, + "score": 73, + "stars": "★★★★☆", + "error": null, + "round": 5 + }, + { + "model": "qwen3-coder:30b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 10, + "testsPassed": 10, + "testsFailed": 0, + "totalDurationMs": 36644, + "totalTokens": 3897, + "avgTokPerSec": 122.28607796191666, + "promptChars": 12598, + "promptTokensEst": 3150, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 5 + }, + { + "model": "qwen3:14b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 1, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 127532, + "totalTokens": 3919, + "avgTokPerSec": 34.13133325491828, + "promptChars": 11352, + "promptTokensEst": 2838, + "score": 90, + "stars": "★★★★★", + "error": null, + "round": 5 + }, + { + "model": "qwen3:14b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 8, + "testsPassed": 6, + "testsFailed": 2, + "totalDurationMs": 217365, + "totalTokens": 7764, + "avgTokPerSec": 38.67613170588518, + "promptChars": 10834, + "promptTokensEst": 2709, + "score": 65, + "stars": "★★★☆☆", + "error": null, + "round": 5 + }, + { + "model": "qwen3:14b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 14, + "testsPassed": 7, + "testsFailed": 7, + "totalDurationMs": 248311, + "totalTokens": 13443, + "avgTokPerSec": 58.05680015263308, + "promptChars": 12219, + "promptTokensEst": 3055, + "score": 50, + "stars": "★★★☆☆", + "error": null, + "round": 5 + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + 
"specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 38326, + "totalTokens": 2079, + "avgTokPerSec": 100.89778087504016, + "promptChars": 10908, + "promptTokensEst": 2727, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 5 + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 60823, + "totalTokens": 1772, + "avgTokPerSec": 96.76383996716295, + "promptChars": 10378, + "promptTokensEst": 2595, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 5 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 11, + "testsPassed": 11, + "testsFailed": 0, + "totalDurationMs": 81654, + "totalTokens": 3458, + "avgTokPerSec": 95.65675360193613, + "promptChars": 11914, + "promptTokensEst": 2979, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 5 + } +] \ No newline at end of file diff --git a/kipina-codebench/results/2026-04-14T10-03.html b/kipina-codebench/results/2026-04-14T10-03.html new file mode 100644 index 0000000..b7b16cf --- /dev/null +++ b/kipina-codebench/results/2026-04-14T10-03.html @@ -0,0 +1,183 @@ + + + + + +Kipina Model Benchmark + + + + +

Kipina Model Benchmark

+
+ +
+ +

Mallikohtainen yhteenveto

+
+ +

Kaikki tulokset

+
+ + + + diff --git a/kipina-codebench/results/2026-04-14T10-03.json b/kipina-codebench/results/2026-04-14T10-03.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T10-03.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/kipina-codebench/results/2026-04-14T10-31.html b/kipina-codebench/results/2026-04-14T10-31.html new file mode 100644 index 0000000..815a791 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T10-31.html @@ -0,0 +1,183 @@ + + + + + +Kipina Model Benchmark + + + + +

Kipina Model Benchmark

+
+ +
+ +

Mallikohtainen yhteenveto

+
+ +

Kaikki tulokset

+
+ + + + diff --git a/kipina-codebench/results/2026-04-14T10-31.json b/kipina-codebench/results/2026-04-14T10-31.json new file mode 100644 index 0000000..3a904cf --- /dev/null +++ b/kipina-codebench/results/2026-04-14T10-31.json @@ -0,0 +1,317 @@ +[ + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 97527, + "totalTokens": 2228, + "avgTokPerSec": 100.69171830800946, + "promptChars": 11566, + "promptTokensEst": 2892, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 1 + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 7, + "testsPassed": 7, + "testsFailed": 0, + "totalDurationMs": 39549, + "totalTokens": 1960, + "avgTokPerSec": 100.98265593129491, + "promptChars": 11073, + "promptTokensEst": 2768, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 1 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui", + "round": 1 + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 131339, + "totalTokens": 11518, + "avgTokPerSec": 96.52358107464266, + "promptChars": 12388, + "promptTokensEst": 3097, + "score": 0, + "stars": "☆☆☆☆☆", + "error": "Testit kaatuivat", + "round": 2 + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": 
true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 20658, + "totalTokens": 1808, + "avgTokPerSec": 101.0081173861862, + "promptChars": 11057, + "promptTokensEst": 2764, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 2 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui", + "round": 2 + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 1, + "fixRounds": 5, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 320031, + "totalTokens": 11985, + "avgTokPerSec": 54.915025374575386, + "promptChars": 12517, + "promptTokensEst": 3129, + "score": 0, + "stars": "☆☆☆☆☆", + "error": "Testit kaatuivat", + "round": 3 + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 7, + "testsPassed": 7, + "testsFailed": 0, + "totalDurationMs": 28654, + "totalTokens": 1877, + "avgTokPerSec": 100.70920643946336, + "promptChars": 10747, + "promptTokensEst": 2687, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 3 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui", + "round": 3 + }, + { + 
"model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 1, + "testsTotal": 12, + "testsPassed": 12, + "testsFailed": 0, + "totalDurationMs": 67943, + "totalTokens": 6002, + "avgTokPerSec": 98.29436788902672, + "promptChars": 12389, + "promptTokensEst": 3097, + "score": 90, + "stars": "★★★★★", + "error": null, + "round": 4 + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 20203, + "totalTokens": 1774, + "avgTokPerSec": 100.9066297884274, + "promptChars": 10905, + "promptTokensEst": 2726, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 4 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 13, + "testsPassed": 12, + "testsFailed": 1, + "totalDurationMs": 148491, + "totalTokens": 12747, + "avgTokPerSec": 95.18237885727869, + "promptChars": 12476, + "promptTokensEst": 3119, + "score": 75, + "stars": "★★★★☆", + "error": null, + "round": 4 + }, + { + "model": "qwen3:8b", + "scenario": "todo", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 6, + "testsPassed": 6, + "testsFailed": 0, + "totalDurationMs": 23830, + "totalTokens": 2102, + "avgTokPerSec": 100.641489789061, + "promptChars": 11404, + "promptTokensEst": 2851, + "score": 100, + "stars": "★★★★★", + "error": null, + "round": 5 + }, + { + "model": "qwen3:8b", + "scenario": "users", + "reqOk": true, + "specOk": true, + "specEntities": 1, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 8, + "testsPassed": 6, + "testsFailed": 2, + "totalDurationMs": 122453, + "totalTokens": 7285, + "avgTokPerSec": 94.12482830400619, + "promptChars": 11400, + 
"promptTokensEst": 2850, + "score": 65, + "stars": "★★★☆☆", + "error": null, + "round": 5 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 11, + "testsPassed": 10, + "testsFailed": 1, + "totalDurationMs": 147125, + "totalTokens": 9893, + "avgTokPerSec": 97.37021605085566, + "promptChars": 12455, + "promptTokensEst": 3114, + "score": 75, + "stars": "★★★★☆", + "error": null, + "round": 5 + } +] \ No newline at end of file