GGUF Q4_K_M -tuki 3B-mallille: kvantisoitu versio (~1.9 GB) mahtuu selaimeen

Safetensors-muotoinen 3B-malli (~6.2 GB) aiheutti WASM:ssa capacity overflow -virheen.
Nyt käytetään candle quantized_qwen2 -moduulia GGUF-tiedoston lataamiseen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-06 13:54:23 +03:00
parent 99b011e399
commit d18314bfc8
2 changed files with 48 additions and 34 deletions

View File

@@ -1,6 +1,8 @@
use candle_core::{Device, Tensor, DType};
use candle_core::quantized::gguf_file;
use candle_nn::VarBuilder;
use candle_transformers::models::qwen2::{Config as QwenConfig, ModelForCausalLM as QwenModel};
use candle_transformers::models::quantized_qwen2::ModelWeights as QwenQuantizedModel;
use wasm_bindgen::JsCast;
use std::cell::RefCell;
use std::rc::Rc;
@@ -16,13 +18,36 @@ macro_rules! console_log {
const MODEL_05B_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct/resolve/main/model.safetensors";
const TOKENIZER_05B_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct/resolve/main/tokenizer.json";
// 3B — better quality, needs more memory (~6 GB download, ~12 GB RAM)
const MODEL_3B_PART1_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct/resolve/main/model-00001-of-00002.safetensors";
const MODEL_3B_PART2_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct/resolve/main/model-00002-of-00002.safetensors";
// 3B GGUF Q4_K_M — quantized, fits in the browser (~1.9 GB)
const MODEL_3B_GGUF_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct-GGUF/resolve/main/qwen2.5-coder-3b-instruct-q4_k_m.gguf";
const TOKENIZER_3B_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct/resolve/main/tokenizer.json";
/// The inference backend held by a cached coder model.
///
/// Wrapping both model types in one enum lets a single field store either of
/// them; the `impl CoderModel` block dispatches calls to the wrapped value.
enum CoderModel {
/// `candle_transformers::models::qwen2::ModelForCausalLM` (imported as `QwenModel`).
Full(QwenModel),
/// `candle_transformers::models::quantized_qwen2::ModelWeights` (imported as `QwenQuantizedModel`).
Quantized(QwenQuantizedModel),
}
impl CoderModel {
    /// Runs one forward pass on whichever backend this value wraps.
    ///
    /// `x` and `pos` are passed to the wrapped model's own `forward` unchanged
    /// and its result is returned as-is.
    ///
    /// NOTE(review): the two backends may not return logits of the same rank —
    /// confirm against the caller before relying on a fixed output shape.
    fn forward(&mut self, x: &Tensor, pos: usize) -> candle_core::Result<Tensor> {
        match self {
            Self::Full(inner) => inner.forward(x, pos),
            Self::Quantized(inner) => inner.forward(x, pos),
        }
    }

    /// Clears the key/value cache on backends that expose an explicit reset.
    fn clear_kv_cache(&mut self) {
        match self {
            Self::Full(inner) => inner.clear_kv_cache(),
            // Deliberately a no-op: per the original author's note, the
            // quantized model resets its KV cache by itself whenever `forward`
            // is called with pos = 0 (see the `if index_pos == 0` check in
            // candle's quantized_qwen2.rs).
            Self::Quantized(_) => {}
        }
    }
}
/// A loaded model stored together with the tokenizer that belongs to it.
struct CachedModel {
// NOTE(review): `model` is listed twice. This looks like the old (`QwenModel`)
// and new (`CoderModel`) versions of the same line from a diff rendered
// without +/- markers; only one of them can exist in the real struct — confirm.
model: QwenModel,
model: CoderModel,
tokenizer: tokenizers::Tokenizer,
// Presumably records whether the cached model is the 3B variant (compare the
// `use_3b` argument of `get_or_build_model`) — confirm against the cache lookup.
is_3b: bool,
}
@@ -189,43 +214,32 @@ async fn get_or_build_model(use_3b: bool, ws: &Rc<RefCell<WebSocket>>) -> Result
.map_err(|e| format!("Tokenizer: {}", e))?;
// Painot
let tensors = if use_3b {
let part1 = ensure_cached("coder3b-model-part1.safetensors", MODEL_3B_PART1_URL, ws).await?;
let part2 = ensure_cached("coder3b-model-part2.safetensors", MODEL_3B_PART2_URL, ws).await?;
console_log!("[Coder] Rakennetaan 3B-mallia...");
let mut all_tensors = candle_core::safetensors::load_buffer(&part1[..], &device)
.map_err(|e| format!("Part1: {}", e))?;
let tensors2 = candle_core::safetensors::load_buffer(&part2[..], &device)
.map_err(|e| format!("Part2: {}", e))?;
all_tensors.extend(tensors2);
all_tensors
let model = if use_3b {
// GGUF Q4_K_M — kvantisoidtu 3B-malli (~1.9 GB)
let gguf_bytes = ensure_cached("coder3b-q4km.gguf", MODEL_3B_GGUF_URL, ws).await?;
console_log!("[Coder] Rakennetaan kvantisoidun 3B-mallia (Q4_K_M)...");
let mut cursor = std::io::Cursor::new(&gguf_bytes[..]);
let content = gguf_file::Content::read(&mut cursor)
.map_err(|e| format!("GGUF parse: {}", e))?;
let qmodel = QwenQuantizedModel::from_gguf(content, &mut cursor, &device)
.map_err(|e| format!("GGUF model: {}", e))?;
CoderModel::Quantized(qmodel)
} else {
let model_bytes = ensure_cached("coder05b-model.safetensors", MODEL_05B_URL, ws).await?;
console_log!("[Coder] Rakennetaan 0.5B-mallia...");
candle_core::safetensors::load_buffer(&model_bytes[..], &device)
.map_err(|e| format!("Safetensors: {}", e))?
};
let vb = VarBuilder::from_tensors(tensors, dtype, &device);
let config = if use_3b {
QwenConfig {
vocab_size: 151936, hidden_size: 2048, intermediate_size: 11008,
num_hidden_layers: 36, num_attention_heads: 16, num_key_value_heads: 2,
max_position_embeddings: 32768, sliding_window: 32768, max_window_layers: 36,
tie_word_embeddings: true, rope_theta: 1000000.0, rms_norm_eps: 1e-6,
use_sliding_window: false, hidden_act: candle_nn::Activation::Silu,
}
} else {
QwenConfig {
let tensors = candle_core::safetensors::load_buffer(&model_bytes[..], &device)
.map_err(|e| format!("Safetensors: {}", e))?;
let config = QwenConfig {
vocab_size: 151936, hidden_size: 896, intermediate_size: 4864,
num_hidden_layers: 24, num_attention_heads: 14, num_key_value_heads: 2,
max_position_embeddings: 32768, sliding_window: 32768, max_window_layers: 21,
tie_word_embeddings: true, rope_theta: 1000000.0, rms_norm_eps: 1e-6,
use_sliding_window: false, hidden_act: candle_nn::Activation::Silu,
}
};
let model = QwenModel::new(&config, vb).map_err(|e| format!("Malli: {}", e))?;
let vb = VarBuilder::from_tensors(tensors, dtype, &device);
let qwen = QwenModel::new(&config, vb).map_err(|e| format!("Malli: {}", e))?;
CoderModel::Full(qwen)
};
console_log!("[Coder] Malli ladattu ja välimuistitettu");
MODEL_CACHE.with(|c| {

View File

@@ -2229,7 +2229,7 @@ Write the corrected code.`;
// Mallikatalogista valinta numerolla tai nimellä
const loadModels = [
{ id: '1', key: '05b', name: 'Qwen2.5-Coder:0.5B', size: '~990 MB', coderSize: '05b' },
{ id: '2', key: '3b', name: 'Qwen2.5-Coder:3B', size: '~6.2 GB', coderSize: '3b' },
{ id: '2', key: '3b', name: 'Qwen2.5-Coder:3B Q4', size: '~1.9 GB', coderSize: '3b' },
];
if (!arg) {
// Näytetään lista
@@ -2268,7 +2268,7 @@ Write the corrected code.`;
if (sub === 'models') {
termLog(' Käytettävissä olevat mallit:', '#c9d1d9');
termLog(' <span style="color:#58a6ff">1</span> qwen-coder Qwen2.5-Coder:0.5B <span style="color:#8b949e">~990 MB | koodin generointi</span>');
termLog(' <span style="color:#58a6ff">2</span> qwen-coder-3b Qwen2.5-Coder:3B <span style="color:#8b949e">~6.2 GB | parempi koodinlaatu</span>');
termLog(' <span style="color:#58a6ff">2</span> qwen-coder-3b Qwen2.5-Coder:3B Q4 <span style="color:#8b949e">~1.9 GB | kvantisoidtu, parempi laatu</span>');
termLog(' <span style="color:#58a6ff">3</span> smollm-135m SmolLM 135M <span style="color:#8b949e">~270 MB | kevyt, nopea</span>');
termLog(' <span style="color:#58a6ff">4</span> qwen-05b Qwen2.5:0.5B <span style="color:#8b949e">~990 MB | yleismalli</span>');
termLog(' <span style="color:#58a6ff">5</span> phi3-mini Phi-3 Mini <span style="color:#8b949e">~2.2 GB | Microsoftin malli</span>');