From e360896436014818efa6c49cecf948b83d2ca086 Mon Sep 17 00:00:00 2001 From: jaakko Date: Tue, 14 Apr 2026 12:46:06 +0300 Subject: [PATCH] =?UTF-8?q?CodeBench=20Taso=204:=20itsekorjaava=20looppi?= =?UTF-8?q?=20=E2=80=94=20sy=C3=B6t=C3=A4=20pytest-virhe=20mallille?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jos testit epäonnistuvat, LLM saa virheilmoituksen + koodin ja korjaa. Max 3 korjauskierrosta. Testattu: qwen3:8b users 0/6 → korjaus → 6/6. --- kipina-codebench/benchmark.mjs | 73 ++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/kipina-codebench/benchmark.mjs b/kipina-codebench/benchmark.mjs index 2bc71e0..50c24e4 100644 --- a/kipina-codebench/benchmark.mjs +++ b/kipina-codebench/benchmark.mjs @@ -346,30 +346,61 @@ async function runPipeline(model, scenario) { } result.fixRounds = fixRound; - // Kirjoita LLM:n generoimat tiedostot (luo src/ ja tests/ alihakemistot tarvittaessa) - for (const [fn, content] of Object.entries(files)) { - const filePath = join(dir, fn); - mkdirSync(dirname(filePath), { recursive: true }); - writeFileSync(filePath, content); - } - - // 5. Testit Docker-kontissa - const testLabel = LANG === 'rust' ? 'Cargo test (Docker)' : 'Pytest (Docker)'; - console.log(` [5/5] ${testLabel}...`); + // 5. Testit Docker-kontissa + itsekorjaava looppi (Taso 4) + const testLabel = LANG === 'rust' ? 'Cargo test' : 'Pytest'; const dockerTimeout = LANG === 'rust' ? 300000 : 120000; - try { - const testOut = execSync( - `docker run --rm -v "${dir}:/src:ro" ${LCONF.dockerImage} 2>&1`, - { timeout: dockerTimeout, encoding: 'utf-8' } - ); - writeFileSync(`${dir}/_testout.txt`, testOut); + const MAX_TEST_FIX = 3; + + for (let testRound = 0; testRound <= MAX_TEST_FIX; testRound++) { + // Kirjoita tiedostot levylle + for (const [fn, content] of Object.entries(files)) { + const filePath = join(dir, fn); + mkdirSync(dirname(filePath), { recursive: true }); + writeFileSync(filePath, content); + } + + const roundLabel = testRound > 0 ? ` (korjaus ${testRound}/${MAX_TEST_FIX})` : ''; + console.log(` [5/5] ${testLabel}${roundLabel}...`); + + let testOut = ''; + try { + testOut = execSync( + `docker run --rm -v "${dir}:/src:ro" ${LCONF.dockerImage} 2>&1`, + { timeout: dockerTimeout, encoding: 'utf-8' } + ); + } catch (e) { + testOut = e.stdout || e.stderr || e.message || ''; + } + writeFileSync(`${dir}/_testout_${testRound}.txt`, testOut); Object.assign(result, parseTestOutput(testOut)); - } catch (e) { - const output = e.stdout || e.stderr || e.message || ''; - writeFileSync(`${dir}/_testout.txt`, output); - Object.assign(result, parseTestOutput(output)); - if (result.testsTotal === 0) result.error = 'Testit kaatuivat'; + + // Kaikki testit läpi → valmis + if (result.testsTotal > 0 && result.testsPassed === result.testsTotal) break; + + // Viimeinen kierros tai ei enää korjausmahdollisuutta + if (testRound >= MAX_TEST_FIX) { + if (result.testsTotal === 0) result.error = 'Testit kaatuivat'; + break; + } + + // Itsekorjaus: syötä virhe + koodi mallille + const errorLines = testOut.split('\n').filter(l => /^E |FAILED|ERROR|error\[E/.test(l)).slice(0, 20).join('\n'); + if (!errorLines) break; // Ei parsittavia virheitä + + console.log(` [5/5] Itsekorjaus: ${result.testsFailed || 'virhe'}...`); + const allCode = Object.entries(files).map(([fn, c]) => `=== ${fn} ===\n${c}`).join('\n\n'); + const fixPrompt = `The following test errors occurred. Fix the code so ALL tests pass. Return ALL files with === markers.\n\nERRORS:\n${errorLines}\n\nCURRENT CODE:\n${allCode}`; + const fixResp = await ollamaChat(model, fixPrompt, CODE_SYSTEM, LANG === 'rust' ? 12288 : 8192); + timings.push(fixResp); + + const fixedFiles = parseGeneratedFiles(fixResp.text); + // Päivitä vain tiedostot jotka malli palautti + for (const [fn, content] of Object.entries(fixedFiles)) { + if (LCONF.required.includes(fn)) files[fn] = content; + } + result.fixRounds++; } + writeFileSync(`${dir}/_testout.txt`, ''); // Symlink viimeisimpään } catch (e) { result.error = e.message; }