CodeBench: mallikohtainen golden example (profiles.json → golden kenttä)

qwen3-coder:30b → todo.md (annotaatiot); qwen3:8b → todo-readme.md (GitHub README -muoto, tutuin koulutusdata). Golden example ladataan dynaamisesti mallikohtaisesti pipelinen sisällä.
2026-04-14 14:04:28 +03:00
parent 0c3303a640
commit a25c52cff4
5 changed files with 483 additions and 8 deletions
--- a/kipina-codebench/benchmark.mjs
+++ b/kipina-codebench/benchmark.mjs
@@ -50,6 +50,12 @@ const FIX_SYSTEM = loadPrompt('fix');
 // === Mallikohtaiset profiilit ===
 const PROFILES = JSON.parse(readFileSync(join(__dirname, 'profiles.json'), 'utf-8'));
 /**
  * Resolve the golden-example file name configured for a model.
  *
  * @param {string} model - Key looked up in PROFILES.models.
  * @returns {string} The entry's `golden` value, or 'todo.md' when the model
  *   has no entry or the entry has no truthy `golden` field.
  */
 function getGoldenForModel(model) {
    // NOTE(review): the fallback here is always 'todo.md'. loadGoldenExample
    // only applies its LANG === 'rust' default ('todo-rs.md') when no model is
    // passed — confirm that Rust runs are meant to receive this file.
    return PROFILES.models[model]?.golden || 'todo.md';
 }
 function getCodePromptForModel(model) {
    const modelConf = PROFILES.models[model];
    const profile = modelConf?.profile || PROFILES.default_profile;
@@ -82,16 +88,16 @@ const LANG_CONFIG = {
 };
 const LCONF = LANG_CONFIG[LANG] || LANG_CONFIG.python;
-function loadGoldenExample() {
+function loadGoldenExample(model) {
    // --compact: käytä tiivistettyä templaattia
    if (COMPACT_MODE) {
        const compactFile = LANG === 'rust' ? 'golden-compact-rs.md' : 'golden-compact-py.md';
        const compactPath = join(__dirname, 'prompts', compactFile);
        if (existsSync(compactPath)) return '\n' + readFileSync(compactPath, 'utf-8').trim() + '\n';
    }
-    // Markdown golden example (koodi + selitykset)
+    // Mallikohtainen golden example profiilista
-    const mdName = LANG === 'rust' ? 'todo-rs.md' : 'todo.md';
+    const goldenFile = model ? getGoldenForModel(model) : (LANG === 'rust' ? 'todo-rs.md' : 'todo.md');
-    const mdPath = join(GOLDEN_DIR, mdName);
+    const mdPath = join(GOLDEN_DIR, goldenFile);
    if (existsSync(mdPath)) return '\n' + readFileSync(mdPath, 'utf-8').trim() + '\n';
    // Fallback: erilliset tiedostot
    const todoDir = join(GOLDEN_DIR, LCONF.goldenDir);
@@ -103,7 +109,6 @@ function loadGoldenExample() {
    }
    return example;
 }
 const GOLDEN_EXAMPLE = loadGoldenExample();
 // === Ajattelutagien siivous (gemma4, qwen3/3.5 ym.) ===
 function stripThinking(text) {
@@ -322,7 +327,8 @@ async function runPipeline(model, scenario) {
        // 3. LLM-koodigenerointi
        console.log(`    [3/5] Koodigenerointi (LLM)...`);
        const fileCount = LCONF.required.length;
-        const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`;
+        const goldenExample = loadGoldenExample(model);
        const codePrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`;
        result.promptChars = CODE_SYSTEM.length + codePrompt.length;
        result.promptTokensEst = Math.round(result.promptChars / 4);
        const codeTokens = LANG === 'rust' ? 12288 : 8192;
--- a/kipina-codebench/golden-examples/todo-readme.md
+++ b/kipina-codebench/golden-examples/todo-readme.md
@@ -0,0 +1,217 @@
 # Todo App — FastAPI + SQLAlchemy + SQLite
 A simple todo CRUD API. Uses only the fields defined in the spec — no extra fields.
 ## Project Structure
 ```
 models.py       # SQLAlchemy 2.0 models
 schemas.py      # Pydantic v2 schemas
 main.py         # FastAPI CRUD endpoints
 test_main.py    # Pytest with TestClient
 ```
 ## models.py
 ```python
 """Tietokantamallit — SQLAlchemy 2.0, Mapped-tyypitys, SQLite."""
 from datetime import date
 from sqlalchemy import String, Text, Date, create_engine
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, sessionmaker
 DATABASE_URL = "sqlite:///./app.db"
 engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 class Base(DeclarativeBase):
    pass
 class Todo(Base):
    """Tehtävä — otsikko, kuvaus, deadline, prioriteetti ja status."""
    __tablename__ = "todos"
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    title: Mapped[str] = mapped_column(String(255))
    description: Mapped[str | None] = mapped_column(Text, default=None)
    due_date: Mapped[date | None] = mapped_column(Date, default=None)
    priority: Mapped[int] = mapped_column(default=1)
    status: Mapped[str] = mapped_column(String(20), default="pending")
 Base.metadata.create_all(bind=engine)
 ```
 ## schemas.py
 ```python
 """Pydantic v2 -skeemat — Create sisääntulolle, Response vastaukselle."""
 from datetime import date
 from pydantic import BaseModel, ConfigDict
 class TodoCreate(BaseModel):
    """Uuden tehtävän luonti. Pakolliset: title."""
    title: str
    description: str | None = None
    due_date: date | None = None
    priority: int = 1
    status: str = "pending"
 class TodoResponse(TodoCreate):
    """Palautettava tehtävä — sisältää id:n."""
    id: int
    model_config = ConfigDict(from_attributes=True)
 ```
 ## main.py
 ```python
 """FastAPI CRUD — yksi endpoint-setti per entiteetti."""
 from fastapi import FastAPI, Depends, HTTPException
 from sqlalchemy.orm import Session
 from models import SessionLocal, Todo
 from schemas import TodoCreate, TodoResponse
 app = FastAPI()
 def get_db():
    """Tietokantasessio per pyyntö."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
@app.post("/todos/", response_model=TodoResponse, status_code=201)
 def create_todo(item: TodoCreate, db: Session = Depends(get_db)):
    db_item = Todo(**item.model_dump())
    db.add(db_item)
    db.commit()
    db.refresh(db_item)
    return db_item
@app.get("/todos/", response_model=list[TodoResponse])
 def list_todos(db: Session = Depends(get_db)):
    return db.query(Todo).all()
@app.get("/todos/{item_id}", response_model=TodoResponse)
 def get_todo(item_id: int, db: Session = Depends(get_db)):
    item = db.query(Todo).filter(Todo.id == item_id).first()
    if not item:
        raise HTTPException(status_code=404, detail="Todo not found")
    return item
@app.put("/todos/{item_id}", response_model=TodoResponse)
 def update_todo(item_id: int, item: TodoCreate, db: Session = Depends(get_db)):
    db_item = db.query(Todo).filter(Todo.id == item_id).first()
    if not db_item:
        raise HTTPException(status_code=404, detail="Todo not found")
    for key, value in item.model_dump().items():
        setattr(db_item, key, value)
    db.commit()
    db.refresh(db_item)
    return db_item
@app.delete("/todos/{item_id}", status_code=204)
 def delete_todo(item_id: int, db: Session = Depends(get_db)):
    db_item = db.query(Todo).filter(Todo.id == item_id).first()
    if not db_item:
        raise HTTPException(status_code=404, detail="Todo not found")
    db.delete(db_item)
    db.commit()
 ```
 ## test_main.py
 Exactly 6 tests per entity. Database is shared — use `>= 1` not `== 1` in list tests.
 For child entities with foreign keys: create parent FIRST, then child with parent's id.
 ```python
 """Pytest — TestClient, erillinen test.db, uniikki data per testi."""
 from fastapi.testclient import TestClient
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
 from main import app, get_db
 from models import Base
 test_engine = create_engine(
    "sqlite:///./test.db", connect_args={"check_same_thread": False}
 )
 TestSession = sessionmaker(autocommit=False, autoflush=False, bind=test_engine)
 Base.metadata.create_all(bind=test_engine)
 def override_get_db():
    db = TestSession()
    try:
        yield db
    finally:
        db.close()
 app.dependency_overrides[get_db] = override_get_db
 client = TestClient(app)
 def test_create_todo():
    response = client.post("/todos/", json={"title": "Osta maitoa", "priority": 2})
    assert response.status_code == 201
    assert response.json()["title"] == "Osta maitoa"
    assert "id" in response.json()
 def test_list_todos():
    client.post("/todos/", json={"title": "Listattava tehtävä"})
    response = client.get("/todos/")
    assert response.status_code == 200
    assert len(response.json()) >= 1
 def test_get_todo_by_id():
    created = client.post("/todos/", json={"title": "Haettava tehtävä"}).json()
    response = client.get(f"/todos/{created['id']}")
    assert response.status_code == 200
    assert response.json()["id"] == created["id"]
 def test_get_todo_not_found():
    response = client.get("/todos/99999")
    assert response.status_code == 404
 def test_update_todo():
    created = client.post("/todos/", json={"title": "Vanha otsikko"}).json()
    response = client.put(
        f"/todos/{created['id']}", json={"title": "Uusi otsikko"}
    )
    assert response.status_code == 200
    assert response.json()["title"] == "Uusi otsikko"
 def test_delete_todo():
    created = client.post("/todos/", json={"title": "Poistettava"}).json()
    response = client.delete(f"/todos/{created['id']}")
    assert response.status_code == 204
    response = client.get(f"/todos/{created['id']}")
    assert response.status_code == 404
 ```
--- a/kipina-codebench/profiles.json
+++ b/kipina-codebench/profiles.json
@@ -12,9 +12,9 @@
      "profile": "small",
      "role": "primary",
      "prompt": "code-small",
-      "golden": "todo.md",
+      "golden": "todo-readme.md",
      "vram": "8GB",
-      "notes": "Kevyt pääkooderi. Todo/users 100p, blog heikko. Lyhyt prompti toimii paremmin."
+      "notes": "Kevyt pääkooderi. Todo/users 100p, blog heikko. README-muoto golden examplelle."
    },
    "codestral:22b": {
      "profile": "large",
--- a/kipina-codebench/results/2026-04-14T10-59.html
+++ b/kipina-codebench/results/2026-04-14T10-59.html
@@ -0,0 +1,183 @@
 <!DOCTYPE html>
 <html lang="fi">
 <head>
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
 <title>Kipina Model Benchmark</title>
 <style>
  :root { --bg: #0d1117; --card: #161b22; --border: #30363d; --text: #e6edf3; --dim: #8b949e; --green: #3fb950; --yellow: #d29922; --red: #f85149; --blue: #58a6ff; }
  * { box-sizing: border-box; margin: 0; padding: 0; }
  body { font-family: -apple-system, 'Segoe UI', Helvetica, Arial, sans-serif; background: var(--bg); color: var(--text); padding: 2rem; max-width: 1400px; margin: 0 auto; }
  h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
  .meta { color: var(--dim); font-size: 0.85rem; margin-bottom: 2rem; }
  .cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
  .card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 1rem; }
  .card .label { color: var(--dim); font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
  .card .value { font-size: 1.8rem; font-weight: 600; margin-top: 0.25rem; }
  .card .sub { color: var(--dim); font-size: 0.8rem; margin-top: 0.25rem; }
  table { width: 100%; border-collapse: collapse; background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; margin-bottom: 2rem; }
  th { background: #1c2128; text-align: left; padding: 0.6rem 0.8rem; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--dim); cursor: pointer; user-select: none; white-space: nowrap; }
  th:hover { color: var(--text); }
  th.sorted-asc::after { content: ' ▲'; }
  th.sorted-desc::after { content: ' ▼'; }
  td { padding: 0.5rem 0.8rem; border-top: 1px solid var(--border); font-size: 0.85rem; white-space: nowrap; }
  tr:hover td { background: #1c2128; }
  .pass { color: var(--green); }
  .partial { color: var(--yellow); }
  .fail { color: var(--red); }
  .stars { letter-spacing: 1px; }
  .bar { display: inline-block; height: 8px; border-radius: 4px; vertical-align: middle; }
  .bar-bg { background: var(--border); }
  .bar-fill { background: var(--green); }
  .bar-partial { background: var(--yellow); }
  .model-name { font-weight: 600; }
  h2 { font-size: 1.1rem; margin-bottom: 1rem; color: var(--dim); }
  .summary-table th:first-child, .summary-table td:first-child { min-width: 200px; }
 </style>
 </head>
 <body>
 <h1>Kipina Model Benchmark</h1>
 <div class="meta" id="meta"></div>
 <div class="cards" id="cards"></div>
 <h2>Mallikohtainen yhteenveto</h2>
 <table class="summary-table" id="summary-table"><thead></thead><tbody></tbody></table>
 <h2>Kaikki tulokset</h2>
 <table id="results-table"><thead></thead><tbody></tbody></table>
 <script>
 const RAW = [{"model":"qwen3:8b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":1,"testsTotal":11,"testsPassed":11,"testsFailed":0,"totalDurationMs":64124,"totalTokens":5689,"avgTokPerSec":98.61378134916481,"promptChars":12098,"promptTokensEst":3025,"score":90,"stars":"★★★★★","error":null,"profile":"small","promptName":"code-small","round":1},{"model":"qwen3:8b","scenario":"blog","reqOk":true,"specOk":true,"specEntities":2,"validationIssues":0,"fixRounds":3,"testsTotal":0,"testsPassed":0,"testsFailed":0,"totalDurationMs":126014,"totalTokens":11162,"avgTokPerSec":97.09858655726343,"promptChars":12101,"promptTokensEst":3025,"score":0,"stars":"☆☆☆☆☆","error":"Testit kaatuivat","profile":"small","promptName":"code-small","round":2},{"model":"qwen3:8b","scenario":"blog","reqOk":true,"specOk":false,"specEntities":0,"validationIssues":0,"fixRounds":0,"testsTotal":0,"testsPassed":0,"testsFailed":0,"totalDurationMs":0,"totalTokens":0,"avgTokPerSec":0,"promptChars":0,"promptTokensEst":0,"score":0,"stars":"","error":"JSON-speksi epäonnistui","round":3}];
 // Map a numeric score to a five-character star rating (thresholds 90/70/50/25, then > 0).
 const starsFor = s => {
  if (s >= 90) return '★★★★★';
  if (s >= 70) return '★★★★☆';
  if (s >= 50) return '★★★☆☆';
  if (s >= 25) return '★★☆☆☆';
  return s > 0 ? '★☆☆☆☆' : '☆☆☆☆☆';
 };
 /**
  * Compute a 0-100 score for one benchmark run.
  *
  * Components: 10 for a valid spec, 10 when the run has no error or executed
  * at least one test, up to 60 proportional to the test pass ratio, and
  * 20 minus 10 per fix round (never below 0).
  * A run that has an error and zero tests scores 0 outright.
  */
 function calcScore(r) {
  const ranTests = r.testsTotal > 0;
  if (r.error && r.testsTotal === 0) return 0;
  const specPts = r.specOk ? 10 : 0;
  const runPts = (!r.error || ranTests) ? 10 : 0;
  const testPts = ranTests ? Math.round((r.testsPassed / r.testsTotal) * 60) : 0;
  const fixPts = Math.max(0, 20 - (r.fixRounds || 0) * 10);
  return Math.min(100, specPts + runPts + testPts + fixPts);
 }
 // Fill in derived fields that are missing from a record.
 // The RAW objects are mutated in place and returned, so DATA holds the same
 // object references as RAW.
 const DATA = RAW.map(r => {
  // Only null/undefined triggers a recompute; a stored score of 0 is kept.
  if (r.score == null) r.score = calcScore(r);
  // An empty stars string (falsy) is replaced as well.
  if (!r.stars) r.stars = starsFor(r.score);
  // Estimate prompt tokens as characters / 4 when no estimate is stored (or it is 0).
  if (!r.promptTokensEst) r.promptTokensEst = r.promptChars ? Math.round(r.promptChars / 4) : 0;
  return r;
 });
 // Classify a run for colouring: 'pass' needs no error and every test passed
 // (with at least one test); 'partial' needs at least one passed test;
 // everything else is 'fail'.
 const cls = r => {
  const ranTests = r.testsTotal > 0;
  if (!r.error && r.testsPassed === r.testsTotal && ranTests) return 'pass';
  if (ranTests && r.testsPassed > 0) return 'partial';
  return 'fail';
 };
 // Build the HTML for an inline progress bar followed by "passed/total".
 // Returns '-' when total is 0. The fill is green only at exactly 100 %.
 // w is the bar width in pixels.
 const pctBar = (passed, total, w=80) => {
  if (total === 0) return '-';
  const pct = passed/total*100;
  const fillClass = pct === 100 ? 'bar-fill' : 'bar-partial';
  const fillWidth = Math.round(pct/100*w);
  const fill = `<span class="bar ${fillClass}" style="width:${fillWidth}px"></span>`;
  return `<span class="bar bar-bg" style="width:${w}px">${fill}</span> ${passed}/${total}`;
 };
 // Meta line: date, number of runs and total wall time in minutes.
 // totalTime is the sum of totalDurationMs over every run (milliseconds).
 const totalTime = DATA.reduce((s,r) => s + r.totalDurationMs, 0);
 // NOTE(review): the date comes from new Date() at render time, i.e. the day
 // the report is opened, not the day the benchmark ran — confirm this is intended.
 document.getElementById('meta').textContent = `${new Date().toLocaleDateString('fi-FI')} — ${DATA.length} ajoa — ${(totalTime/1000/60).toFixed(1)} min`;
 // Cards: aggregate figures shown in the summary cards.
 // Distinct model and scenario names, in first-seen order.
 const models = [...new Set(DATA.map(r => r.model))];
 const scenarios = [...new Set(DATA.map(r => r.scenario))];
 // Rounded mean score over all runs (0 when there are no runs).
 const avgScore = DATA.length ? Math.round(DATA.reduce((s,r) => s + r.score, 0) / DATA.length) : 0;
 const totalPassed = DATA.reduce((s,r) => s + r.testsPassed, 0);
 const totalTests = DATA.reduce((s,r) => s + r.testsTotal, 0);
 // Overall pass rate in percent; guarded against division by zero.
 const passRate = totalTests ? Math.round(totalPassed/totalTests*100) : 0;
 // Model with the highest rounded mean score; undefined when models is empty.
 const bestModel = models.map(m => {
  const mrs = DATA.filter(r => r.model === m);
  return { model: m, avg: Math.round(mrs.reduce((s,r) => s + r.score, 0) / mrs.length) };
 }).sort((a,b) => b.avg - a.avg)[0];
 // Model with the highest rounded mean avgTokPerSec; undefined when models is empty.
 // Every run is included in the mean, so runs recorded with avgTokPerSec 0 lower it.
 const fastestModel = models.map(m => {
  const mrs = DATA.filter(r => r.model === m);
  return { model: m, speed: Math.round(mrs.reduce((s,r) => s + r.avgTokPerSec, 0) / mrs.length) };
 }).sort((a,b) => b.speed - a.speed)[0];
 // Render the six summary cards. bestModel / fastestModel may be undefined when
 // there are no runs, hence the optional chaining with '-' / 0 fallbacks.
 document.getElementById('cards').innerHTML = `
  <div class="card"><div class="label">Keskiarvo</div><div class="value">${starsFor(avgScore)}</div><div class="sub">${avgScore} pistettä</div></div>
  <div class="card"><div class="label">Testien läpäisy</div><div class="value">${passRate}%</div><div class="sub">${totalPassed}/${totalTests} testiä</div></div>
  <div class="card"><div class="label">Paras malli</div><div class="value" style="font-size:1.2rem">${bestModel?.model || '-'}</div><div class="sub">${bestModel?.avg || 0}p</div></div>
  <div class="card"><div class="label">Nopein</div><div class="value" style="font-size:1.2rem">${fastestModel?.model || '-'}</div><div class="sub">${fastestModel?.speed || 0} tok/s</div></div>
  <div class="card"><div class="label">Malleja</div><div class="value">${models.length}</div><div class="sub">${scenarios.length} skenaariota</div></div>
  <div class="card"><div class="label">Kokonaisaika</div><div class="value">${(totalTime/1000/60).toFixed(1)}</div><div class="sub">minuuttia</div></div>
 `;
 // Summary table: one row per model, one column per scenario plus totals.
 const sumHead = document.querySelector('#summary-table thead');
 const sumBody = document.querySelector('#summary-table tbody');
 sumHead.innerHTML = '<tr><th>Malli</th>' + scenarios.map(s => `<th>${s}</th>`).join('') + '<th>Yht.</th><th>Out tok</th><th>Aika</th><th>tok/s</th><th>Pisteet</th></tr>';
 const modelRows = models.map(m => {
  // All runs of this model; the totals below are summed over every one of them.
  const mrs = DATA.filter(r => r.model === m);
  const tp = mrs.reduce((s,r) => s + r.testsPassed, 0);
  const tt = mrs.reduce((s,r) => s + r.testsTotal, 0);
  const tok = mrs.reduce((s,r) => s + r.totalTokens, 0);
  const time = mrs.reduce((s,r) => s + r.totalDurationMs, 0);
  const speed = Math.round(mrs.reduce((s,r) => s + r.avgTokPerSec, 0) / mrs.length);
  const avg = Math.round(mrs.reduce((s,r) => s + r.score, 0) / mrs.length);
  const scenCols = scenarios.map(s => {
    // find() returns the first matching run only, so when a scenario has
    // several rounds the per-scenario cell shows just the first one.
    const r = mrs.find(r => r.scenario === s);
    if (!r) return '<td>-</td>';
    return `<td class="${cls(r)}">${pctBar(r.testsPassed, r.testsTotal, 60)} <span style="color:var(--dim)">${(r.totalDurationMs/1000).toFixed(0)}s</span></td>`;
  }).join('');
  // avg is kept next to the markup so the rows can be ordered by it below.
  return { avg, html: `<tr><td class="model-name">${m}</td>${scenCols}<td>${pctBar(tp, tt)}</td><td>${(tok/1000).toFixed(1)}K</td><td>${(time/1000).toFixed(0)}s</td><td>${speed}</td><td><span class="stars">${starsFor(avg)}</span> ${avg}p</td></tr>` };
 }).sort((a,b) => b.avg - a.avg); // best mean score first
 sumBody.innerHTML = modelRows.map(r => r.html).join('');
 // Results table: one row per run, sortable by clicking a header cell.
 const resHead = document.querySelector('#results-table thead');
 const resBody = document.querySelector('#results-table tbody');
 // Column titles; the array index is written to each header's data-col attribute.
 const resCols = ['Malli','Skenaario','Speksi','Testit','Korjaus','Ctx','Out tok','Aika','tok/s','Pisteet'];
 resHead.innerHTML = '<tr>' + resCols.map((c,i) => `<th data-col="${i}">${c}</th>`).join('') + '</tr>';
 // Initial sort: column 9 ('Pisteet'), descending.
 let sortCol = 9, sortAsc = false;
 // Re-render the body of the results table using the current sortCol / sortAsc
 // state, then update the sort-indicator classes on the header cells.
 function renderResults() {
  // Sort a copy so DATA keeps its original order.
  const sorted = [...DATA].sort((a,b) => {
    // One [a, b] value pair per column, in the same order as resCols;
    // sortCol selects the pair to compare.
    const vals = [
      [a.model, b.model],
      [a.scenario, b.scenario],
      [a.specEntities, b.specEntities],
      [a.testsPassed/Math.max(a.testsTotal,1), b.testsPassed/Math.max(b.testsTotal,1)],
      [a.fixRounds, b.fixRounds],
      [a.promptTokensEst, b.promptTokensEst],
      [a.totalTokens, b.totalTokens],
      [a.totalDurationMs, b.totalDurationMs],
      [a.avgTokPerSec, b.avgTokPerSec],
      [a.score, b.score],
    ][sortCol];
    // Strings compare by locale, everything else numerically.
    const cmp = typeof vals[0] === 'string' ? vals[0].localeCompare(vals[1]) : vals[0] - vals[1];
    return sortAsc ? cmp : -cmp;
  });
  resBody.innerHTML = sorted.map(r => {
    const c = cls(r);
    return `<tr>
      <td class="model-name">${r.model}</td>
      <td>${r.scenario}</td>
      <td>${r.specOk ? `✓ ${r.specEntities}e` : '<span class="fail">✗</span>'}</td>
      <td class="${c}">${pctBar(r.testsPassed, r.testsTotal)}</td>
      <td>${r.fixRounds > 0 ? r.fixRounds + '×' : '-'}</td>
      <td>${r.promptTokensEst > 0 ? '~'+(r.promptTokensEst/1000).toFixed(1)+'K' : '-'}</td>
      <td>${r.totalTokens > 0 ? (r.totalTokens/1000).toFixed(1)+'K' : '-'}</td>
      <td>${(r.totalDurationMs/1000).toFixed(0)}s</td>
      <td>${r.avgTokPerSec.toFixed(0)}</td>
      <td><span class="stars">${r.stars}</span> ${r.score}p</td>
    </tr>`;
  }).join('');
  // Mark only the active column's header with the direction class; clear the rest.
  document.querySelectorAll('#results-table th').forEach((th,i) => {
    th.className = i === sortCol ? (sortAsc ? 'sorted-asc' : 'sorted-desc') : '';
  });
 }
 // One delegated click listener on the header row handles every column.
 document.querySelector('#results-table thead').addEventListener('click', e => {
  const col = parseInt(e.target.dataset.col);
  // Ignore clicks on elements that carry no data-col attribute.
  if (isNaN(col)) return;
  // Same column: flip direction. Different column: switch to it, descending first.
  if (sortCol === col) sortAsc = !sortAsc;
  else { sortCol = col; sortAsc = false; }
  renderResults();
 });
 // Initial render with the default sort.
 renderResults();
 </script>
 </body>
 </html>
--- a/kipina-codebench/results/2026-04-14T10-59.json
+++ b/kipina-codebench/results/2026-04-14T10-59.json
@@ -0,0 +1,69 @@
 [
  {
    "model": "qwen3:8b",
    "scenario": "blog",
    "reqOk": true,
    "specOk": true,
    "specEntities": 2,
    "validationIssues": 0,
    "fixRounds": 1,
    "testsTotal": 11,
    "testsPassed": 11,
    "testsFailed": 0,
    "totalDurationMs": 64124,
    "totalTokens": 5689,
    "avgTokPerSec": 98.61378134916481,
    "promptChars": 12098,
    "promptTokensEst": 3025,
    "score": 90,
    "stars": "★★★★★",
    "error": null,
    "profile": "small",
    "promptName": "code-small",
    "round": 1
  },
  {
    "model": "qwen3:8b",
    "scenario": "blog",
    "reqOk": true,
    "specOk": true,
    "specEntities": 2,
    "validationIssues": 0,
    "fixRounds": 3,
    "testsTotal": 0,
    "testsPassed": 0,
    "testsFailed": 0,
    "totalDurationMs": 126014,
    "totalTokens": 11162,
    "avgTokPerSec": 97.09858655726343,
    "promptChars": 12101,
    "promptTokensEst": 3025,
    "score": 0,
    "stars": "☆☆☆☆☆",
    "error": "Testit kaatuivat",
    "profile": "small",
    "promptName": "code-small",
    "round": 2
  },
  {
    "model": "qwen3:8b",
    "scenario": "blog",
    "reqOk": true,
    "specOk": false,
    "specEntities": 0,
    "validationIssues": 0,
    "fixRounds": 0,
    "testsTotal": 0,
    "testsPassed": 0,
    "testsFailed": 0,
    "totalDurationMs": 0,
    "totalTokens": 0,
    "avgTokPerSec": 0,
    "promptChars": 0,
    "promptTokensEst": 0,
    "score": 0,
    "stars": "",
    "error": "JSON-speksi epäonnistui",
    "round": 3
  }
 ]