diff --git a/kipina-codebench/benchmark.mjs b/kipina-codebench/benchmark.mjs
index 30132c1..e7b8975 100644
--- a/kipina-codebench/benchmark.mjs
+++ b/kipina-codebench/benchmark.mjs
@@ -50,6 +50,24 @@ const FIX_SYSTEM = loadPrompt('fix');
 // === Mallikohtaiset profiilit ===
 const PROFILES = JSON.parse(readFileSync(join(__dirname, 'profiles.json'), 'utf-8'));
 
+/**
+ * Resolve the golden-example markdown file name for a model.
+ *
+ * Python runs use the `golden` entry of the model's profile and fall back to
+ * 'todo.md' when the model is unknown or has no entry. Rust runs always get
+ * 'todo-rs.md', as they did before per-model examples existed.
+ *
+ * @param {string} [model] - Key into PROFILES.models.
+ * @returns {string} File name to resolve against GOLDEN_DIR.
+ */
+function getGoldenForModel(model) {
+  // NOTE(review): the profile entries visible here ('todo.md', 'todo-readme.md')
+  // are Python examples, so they must not leak into Rust prompts. Confirm that no
+  // profile is meant to carry a Rust example before widening this.
+  if (LANG === 'rust') return 'todo-rs.md';
+  return PROFILES.models[model]?.golden || 'todo.md';
+}
+
 function getCodePromptForModel(model) {
   const modelConf = PROFILES.models[model];
   const profile = modelConf?.profile || PROFILES.default_profile;
@@ -82,16 +100,16 @@ const LANG_CONFIG = {
 };
 const LCONF = LANG_CONFIG[LANG] || LANG_CONFIG.python;
 
-function loadGoldenExample() {
+function loadGoldenExample(model) {
   // --compact: käytä tiivistettyä templaattia
   if (COMPACT_MODE) {
     const compactFile = LANG === 'rust' ? 'golden-compact-rs.md' : 'golden-compact-py.md';
     const compactPath = join(__dirname, 'prompts', compactFile);
     if (existsSync(compactPath)) return '\n' + readFileSync(compactPath, 'utf-8').trim() + '\n';
   }
-  // Markdown golden example (koodi + selitykset)
-  const mdName = LANG === 'rust' ? 'todo-rs.md' : 'todo.md';
-  const mdPath = join(GOLDEN_DIR, mdName);
+  // Per-model golden example; the helper also covers a missing model and Rust runs
+  const goldenFile = getGoldenForModel(model);
+  const mdPath = join(GOLDEN_DIR, goldenFile);
   if (existsSync(mdPath)) return '\n' + readFileSync(mdPath, 'utf-8').trim() + '\n';
   // Fallback: erilliset tiedostot
   const todoDir = join(GOLDEN_DIR, LCONF.goldenDir);
@@ -103,7 +121,6 @@ function loadGoldenExample() {
   }
   return example;
 }
-const GOLDEN_EXAMPLE = loadGoldenExample();
 
 // === Ajattelutagien siivous (gemma4, qwen3/3.5 ym.) ===
 function stripThinking(text) {
@@ -322,7 +339,8 @@ async function runPipeline(model, scenario) {
   // 3. LLM-koodigenerointi
   console.log(` [3/5] Koodigenerointi (LLM)...`);
   const fileCount = LCONF.required.length;
-  const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`;
+  const goldenExample = loadGoldenExample(model);
+  const codePrompt = `${goldenExample}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all ${fileCount} files. Follow the reference implementation patterns exactly.`;
   result.promptChars = CODE_SYSTEM.length + codePrompt.length;
   result.promptTokensEst = Math.round(result.promptChars / 4);
   const codeTokens = LANG === 'rust' ? 12288 : 8192;
diff --git a/kipina-codebench/golden-examples/todo-readme.md b/kipina-codebench/golden-examples/todo-readme.md
new file mode 100644
index 0000000..5bf785e
--- /dev/null
+++ b/kipina-codebench/golden-examples/todo-readme.md
@@ -0,0 +1,217 @@
+# Todo App — FastAPI + SQLAlchemy + SQLite
+
+A simple todo CRUD API. Uses only the fields defined in the spec — no extra fields.
+ +## Project Structure + +``` +models.py # SQLAlchemy 2.0 models +schemas.py # Pydantic v2 schemas +main.py # FastAPI CRUD endpoints +test_main.py # Pytest with TestClient +``` + +## models.py + +```python +"""Tietokantamallit — SQLAlchemy 2.0, Mapped-tyypitys, SQLite.""" + +from datetime import date + +from sqlalchemy import String, Text, Date, create_engine +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, sessionmaker + +DATABASE_URL = "sqlite:///./app.db" +engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False}) +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + + +class Base(DeclarativeBase): + pass + + +class Todo(Base): + """Tehtävä — otsikko, kuvaus, deadline, prioriteetti ja status.""" + + __tablename__ = "todos" + + id: Mapped[int] = mapped_column(primary_key=True, index=True) + title: Mapped[str] = mapped_column(String(255)) + description: Mapped[str | None] = mapped_column(Text, default=None) + due_date: Mapped[date | None] = mapped_column(Date, default=None) + priority: Mapped[int] = mapped_column(default=1) + status: Mapped[str] = mapped_column(String(20), default="pending") + + +Base.metadata.create_all(bind=engine) +``` + +## schemas.py + +```python +"""Pydantic v2 -skeemat — Create sisääntulolle, Response vastaukselle.""" + +from datetime import date + +from pydantic import BaseModel, ConfigDict + + +class TodoCreate(BaseModel): + """Uuden tehtävän luonti. 
Pakolliset: title.""" + + title: str + description: str | None = None + due_date: date | None = None + priority: int = 1 + status: str = "pending" + + +class TodoResponse(TodoCreate): + """Palautettava tehtävä — sisältää id:n.""" + + id: int + model_config = ConfigDict(from_attributes=True) +``` + +## main.py + +```python +"""FastAPI CRUD — yksi endpoint-setti per entiteetti.""" + +from fastapi import FastAPI, Depends, HTTPException +from sqlalchemy.orm import Session + +from models import SessionLocal, Todo +from schemas import TodoCreate, TodoResponse + +app = FastAPI() + + +def get_db(): + """Tietokantasessio per pyyntö.""" + db = SessionLocal() + try: + yield db + finally: + db.close() + + +@app.post("/todos/", response_model=TodoResponse, status_code=201) +def create_todo(item: TodoCreate, db: Session = Depends(get_db)): + db_item = Todo(**item.model_dump()) + db.add(db_item) + db.commit() + db.refresh(db_item) + return db_item + + +@app.get("/todos/", response_model=list[TodoResponse]) +def list_todos(db: Session = Depends(get_db)): + return db.query(Todo).all() + + +@app.get("/todos/{item_id}", response_model=TodoResponse) +def get_todo(item_id: int, db: Session = Depends(get_db)): + item = db.query(Todo).filter(Todo.id == item_id).first() + if not item: + raise HTTPException(status_code=404, detail="Todo not found") + return item + + +@app.put("/todos/{item_id}", response_model=TodoResponse) +def update_todo(item_id: int, item: TodoCreate, db: Session = Depends(get_db)): + db_item = db.query(Todo).filter(Todo.id == item_id).first() + if not db_item: + raise HTTPException(status_code=404, detail="Todo not found") + for key, value in item.model_dump().items(): + setattr(db_item, key, value) + db.commit() + db.refresh(db_item) + return db_item + + +@app.delete("/todos/{item_id}", status_code=204) +def delete_todo(item_id: int, db: Session = Depends(get_db)): + db_item = db.query(Todo).filter(Todo.id == item_id).first() + if not db_item: + raise 
HTTPException(status_code=404, detail="Todo not found") + db.delete(db_item) + db.commit() +``` + +## test_main.py + +Exactly 6 tests per entity. Database is shared — use `>= 1` not `== 1` in list tests. +For child entities with foreign keys: create parent FIRST, then child with parent's id. + +```python +"""Pytest — TestClient, erillinen test.db, uniikki data per testi.""" + +from fastapi.testclient import TestClient +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +from main import app, get_db +from models import Base + +test_engine = create_engine( + "sqlite:///./test.db", connect_args={"check_same_thread": False} +) +TestSession = sessionmaker(autocommit=False, autoflush=False, bind=test_engine) +Base.metadata.create_all(bind=test_engine) + + +def override_get_db(): + db = TestSession() + try: + yield db + finally: + db.close() + + +app.dependency_overrides[get_db] = override_get_db +client = TestClient(app) + + +def test_create_todo(): + response = client.post("/todos/", json={"title": "Osta maitoa", "priority": 2}) + assert response.status_code == 201 + assert response.json()["title"] == "Osta maitoa" + assert "id" in response.json() + + +def test_list_todos(): + client.post("/todos/", json={"title": "Listattava tehtävä"}) + response = client.get("/todos/") + assert response.status_code == 200 + assert len(response.json()) >= 1 + + +def test_get_todo_by_id(): + created = client.post("/todos/", json={"title": "Haettava tehtävä"}).json() + response = client.get(f"/todos/{created['id']}") + assert response.status_code == 200 + assert response.json()["id"] == created["id"] + + +def test_get_todo_not_found(): + response = client.get("/todos/99999") + assert response.status_code == 404 + + +def test_update_todo(): + created = client.post("/todos/", json={"title": "Vanha otsikko"}).json() + response = client.put( + f"/todos/{created['id']}", json={"title": "Uusi otsikko"} + ) + assert response.status_code == 200 + assert 
response.json()["title"] == "Uusi otsikko" + + +def test_delete_todo(): + created = client.post("/todos/", json={"title": "Poistettava"}).json() + response = client.delete(f"/todos/{created['id']}") + assert response.status_code == 204 + response = client.get(f"/todos/{created['id']}") + assert response.status_code == 404 +``` diff --git a/kipina-codebench/profiles.json b/kipina-codebench/profiles.json index d129276..167a429 100644 --- a/kipina-codebench/profiles.json +++ b/kipina-codebench/profiles.json @@ -12,9 +12,9 @@ "profile": "small", "role": "primary", "prompt": "code-small", - "golden": "todo.md", + "golden": "todo-readme.md", "vram": "8GB", - "notes": "Kevyt pääkooderi. Todo/users 100p, blog heikko. Lyhyt prompti toimii paremmin." + "notes": "Kevyt pääkooderi. Todo/users 100p, blog heikko. README-muoto golden examplelle." }, "codestral:22b": { "profile": "large", diff --git a/kipina-codebench/results/2026-04-14T10-59.html b/kipina-codebench/results/2026-04-14T10-59.html new file mode 100644 index 0000000..4af497c --- /dev/null +++ b/kipina-codebench/results/2026-04-14T10-59.html @@ -0,0 +1,183 @@ + + + + + +Kipina Model Benchmark + + + + +

Kipina Model Benchmark

+
+ +
+ +

Mallikohtainen yhteenveto

+
+ +

Kaikki tulokset

+
+ + + + diff --git a/kipina-codebench/results/2026-04-14T10-59.json b/kipina-codebench/results/2026-04-14T10-59.json new file mode 100644 index 0000000..492c110 --- /dev/null +++ b/kipina-codebench/results/2026-04-14T10-59.json @@ -0,0 +1,69 @@ +[ + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 1, + "testsTotal": 11, + "testsPassed": 11, + "testsFailed": 0, + "totalDurationMs": 64124, + "totalTokens": 5689, + "avgTokPerSec": 98.61378134916481, + "promptChars": 12098, + "promptTokensEst": 3025, + "score": 90, + "stars": "★★★★★", + "error": null, + "profile": "small", + "promptName": "code-small", + "round": 1 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": true, + "specEntities": 2, + "validationIssues": 0, + "fixRounds": 3, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 126014, + "totalTokens": 11162, + "avgTokPerSec": 97.09858655726343, + "promptChars": 12101, + "promptTokensEst": 3025, + "score": 0, + "stars": "☆☆☆☆☆", + "error": "Testit kaatuivat", + "profile": "small", + "promptName": "code-small", + "round": 2 + }, + { + "model": "qwen3:8b", + "scenario": "blog", + "reqOk": true, + "specOk": false, + "specEntities": 0, + "validationIssues": 0, + "fixRounds": 0, + "testsTotal": 0, + "testsPassed": 0, + "testsFailed": 0, + "totalDurationMs": 0, + "totalTokens": 0, + "avgTokPerSec": 0, + "promptChars": 0, + "promptTokensEst": 0, + "score": 0, + "stars": "", + "error": "JSON-speksi epäonnistui", + "round": 3 + } +] \ No newline at end of file