Benchmark: kultainen esimerkki + zensical-dokumentointiohjeet

- golden-examples/todo/: 6/6 PASS referenssitoteutus
  - SQLAlchemy 2.0 (DeclarativeBase, Mapped, mapped_column)
  - Pydantic v2 (ConfigDict)
  - PEP 621 pyproject.toml, Python >=3.14
  - Uniikki testidata per testi
- CODE_SYSTEM päivitetty: few-shot kultaisesta esimerkistä
- DOCUMENTATION.md: zensical-dokumentointiohjeet
This commit is contained in:
2026-04-14 07:28:47 +03:00
parent 8f154a578c
commit d6a544909c
7 changed files with 311 additions and 17 deletions

View File

@@ -11,7 +11,11 @@
*/
import { execSync } from 'child_process';
import { writeFileSync, mkdirSync, rmSync, existsSync } from 'fs';
import { writeFileSync, readFileSync, mkdirSync, rmSync, existsSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
// ES modules do not provide __dirname; derive this script's directory from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// === CLI arguments ===
// Everything after the first two process.argv entries.
const args = process.argv.slice(2);
@@ -141,15 +145,29 @@ Blog → Author: name,email,bio(Text|None) / Post: title, content(Text), author_
// System prompt text instructing the model to return only the corrected Python file.
const FIX_SYSTEM = 'You are a Python code fixer. Return ONLY the corrected Python file. No markdown fences, no explanations — just valid Python code.';
// === Golden example ===
// Directory of the reference "todo" project, resolved relative to this script's directory.
const GOLDEN_DIR = join(__dirname, 'golden-examples', 'todo');
// File names read from GOLDEN_DIR, in the order they are appended to the example text.
const GOLDEN_FILES = ['models.py', 'schemas.py', 'main.py', 'test_main.py', 'pyproject.toml'];
/**
 * Builds the reference-implementation text block from the golden example files.
 *
 * Each listed file that exists in the directory is read as UTF-8, trimmed, and
 * emitted under a `=== <name> ===` marker; the sections follow a single heading.
 *
 * @param {string} [dir=GOLDEN_DIR] - Directory containing the reference project.
 * @param {string[]} [files=GOLDEN_FILES] - File names to include, in output order.
 * @returns {string} The assembled text, or '' when the directory is missing or
 *   none of the listed files exist in it.
 */
function loadGoldenExample(dir = GOLDEN_DIR, files = GOLDEN_FILES) {
  if (!existsSync(dir)) return '';

  const sections = [];
  for (const name of files) {
    const filePath = join(dir, name);
    // Missing files are skipped so a partial golden example still contributes.
    if (!existsSync(filePath)) continue;
    sections.push(`=== ${name} ===\n${readFileSync(filePath, 'utf-8').trim()}\n\n`);
  }

  // A heading with nothing under it would point the model at a reference that is not there.
  if (sections.length === 0) return '';

  const header = '\nREFERENCE IMPLEMENTATION (todo project — follow this exact structure, style, and conventions):\n\n';
  return header + sections.join('');
}
// Reference example text, read from disk once at module load; '' when the golden example directory is missing.
const GOLDEN_EXAMPLE = loadGoldenExample();
const CODE_SYSTEM = `You are a Python backend developer. Generate a complete FastAPI project with SQLAlchemy and SQLite.
Given the project requirements and JSON specification, generate these 5 files:
Given the project requirements, JSON specification, and a REFERENCE IMPLEMENTATION, generate these 5 files:
1. models.py - SQLAlchemy models with database setup (create_engine, declarative_base, sessionmaker, Base.metadata.create_all)
2. schemas.py - Pydantic schemas (Create + Response for each entity, use ConfigDict(from_attributes=True))
3. main.py - FastAPI application with full CRUD endpoints for each entity
4. test_main.py - Pytest tests using TestClient with separate test database and dependency override
5. pyproject.toml - Project configuration with dependencies
1. models.py SQLAlchemy 2.0: DeclarativeBase, Mapped, mapped_column (NOT legacy declarative_base)
2. schemas.py Pydantic v2: ConfigDict(from_attributes=True) (NOT class Config)
3. main.py FastAPI CRUD endpoints for each entity
4. test_main.py Pytest with TestClient, separate test.db, unique test data per test
5. pyproject.toml PEP 621 [project] format (NOT [tool.poetry])
OUTPUT FORMAT — use these exact markers to separate files:
@@ -168,18 +186,17 @@ OUTPUT FORMAT — use these exact markers to separate files:
=== pyproject.toml ===
<toml content>
DOCUMENTATION — every file must have a one-line module docstring. Classes get a one-line docstring. Keep it zensical: say what it IS, not what it does. No filler.
RULES:
- SQLite: create_engine("sqlite:///./app.db", connect_args={"check_same_thread": False})
- Each model: auto-increment "id" Column(Integer, primary_key=True, index=True)
- Schemas: BaseModel with ConfigDict(from_attributes=True) for Response variants
- Endpoints per entity: POST (create, 201), GET (list), GET by id (404 if missing), PUT (update), DELETE (204)
- Tests: separate test.db, override get_db dependency, use TestClient
- pyproject.toml: fastapi, uvicorn[standard], sqlalchemy, pytest, httpx
- Status fields: String(20) with default, NEVER Enum
- Follow the REFERENCE IMPLEMENTATION patterns exactly
- SQLAlchemy 2.0: DeclarativeBase + Mapped + mapped_column (not Column())
- Python type unions: str | None (not Optional[str])
- pyproject.toml: PEP 621 [project] format, requires-python = ">=3.14"
- Tests: unique descriptive data per test, NOT generic "test_title" strings
- Absolute imports only (from models import ..., from schemas import ...)
- Python booleans: True/False/None (not true/false/null/none)
- NO markdown fences inside file content — just raw code
- Every _id foreign key field MUST have ForeignKey("table.id") constraint`;
- Only test endpoints that exist in main.py — no extra tests`;
// === Tiedostoparseri LLM-vastauksesta ===
function parseGeneratedFiles(text) {
@@ -285,7 +302,7 @@ async function runPipeline(model, scenario) {
// 3. LLM-koodigenerointi
console.log(` [3/5] Koodigenerointi (LLM)...`);
const codePrompt = `PROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all 5 files.`;
const codePrompt = `${GOLDEN_EXAMPLE}\n---\n\nPROJECT REQUIREMENTS:\n${req.text}\n\nJSON SPECIFICATION:\n${JSON.stringify(spec, null, 2)}\n\nGenerate the complete project with all 5 files. Follow the reference implementation patterns exactly.`;
const codeResp = await ollamaChat(model, codePrompt, CODE_SYSTEM, 8192);
timings.push(codeResp);
writeFileSync(`${dir}/_code_raw.txt`, codeResp.text);