Note: this patch was reconstructed from a whitespace-collapsed copy. Indentation of
context lines is approximate; use `git apply --ignore-whitespace` if it does not apply.

diff --git a/network-poc/node/src/qwen_coder.rs b/network-poc/node/src/qwen_coder.rs
index 8891c5d..d0fb00d 100644
--- a/network-poc/node/src/qwen_coder.rs
+++ b/network-poc/node/src/qwen_coder.rs
@@ -1,6 +1,8 @@
 use candle_core::{Device, Tensor, DType};
+use candle_core::quantized::gguf_file;
 use candle_nn::VarBuilder;
 use candle_transformers::models::qwen2::{Config as QwenConfig, ModelForCausalLM as QwenModel};
+use candle_transformers::models::quantized_qwen2::ModelWeights as QwenQuantizedModel;
 use wasm_bindgen::JsCast;
 use std::cell::RefCell;
 use std::rc::Rc;
@@ -16,13 +18,45 @@ macro_rules! console_log {
 const MODEL_05B_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct/resolve/main/model.safetensors";
 const TOKENIZER_05B_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct/resolve/main/tokenizer.json";
 
-// 3B — parempi laatu, vaatii enemmän muistia (~6 GB lataus, ~12 GB RAM)
-const MODEL_3B_PART1_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct/resolve/main/model-00001-of-00002.safetensors";
-const MODEL_3B_PART2_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct/resolve/main/model-00002-of-00002.safetensors";
+// 3B GGUF Q4_K_M — quantized, fits in the browser (~1.9 GB)
+const MODEL_3B_GGUF_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct-GGUF/resolve/main/qwen2.5-coder-3b-instruct-q4_k_m.gguf";
 const TOKENIZER_3B_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct/resolve/main/tokenizer.json";
 
+/// Loaded coder model: either safetensors-backed weights or quantized GGUF weights.
+enum CoderModel {
+    /// Model built with `QwenModel::new` from safetensors tensors.
+    Full(QwenModel),
+    /// Model built with `QwenQuantizedModel::from_gguf` from a GGUF file.
+    Quantized(QwenQuantizedModel),
+}
+
+impl CoderModel {
+    /// Forwards `x` and `pos` unchanged to the wrapped model's `forward`.
+    ///
+    /// NOTE(review): the two backends may return logits of different rank;
+    /// confirm that callers handle both before relying on the shape.
+    fn forward(&mut self, x: &Tensor, pos: usize) -> candle_core::Result<Tensor> {
+        match self {
+            CoderModel::Full(m) => m.forward(x, pos),
+            CoderModel::Quantized(m) => m.forward(x, pos),
+        }
+    }
+
+    /// Clears the KV cache of the full model; does nothing for the quantized model.
+    fn clear_kv_cache(&mut self) {
+        match self {
+            CoderModel::Full(m) => m.clear_kv_cache(),
+            CoderModel::Quantized(_) => {
+                // The quantized model resets its KV cache automatically when
+                // forward is called with pos = 0
+                // (see quantized_qwen2.rs line 118: `if index_pos == 0`).
+            }
+        }
+    }
+}
+
 struct CachedModel {
-    model: QwenModel,
+    model: CoderModel,
     tokenizer: tokenizers::Tokenizer,
     is_3b: bool,
 }
@@ -189,43 +223,33 @@ async fn get_or_build_model(use_3b: bool, ws: &Rc>) -> Result
         .map_err(|e| format!("Tokenizer: {}", e))?;
 
     // Painot
-    let tensors = if use_3b {
-        let part1 = ensure_cached("coder3b-model-part1.safetensors", MODEL_3B_PART1_URL, ws).await?;
-        let part2 = ensure_cached("coder3b-model-part2.safetensors", MODEL_3B_PART2_URL, ws).await?;
-        console_log!("[Coder] Rakennetaan 3B-mallia...");
-        let mut all_tensors = candle_core::safetensors::load_buffer(&part1[..], &device)
-            .map_err(|e| format!("Part1: {}", e))?;
-        let tensors2 = candle_core::safetensors::load_buffer(&part2[..], &device)
-            .map_err(|e| format!("Part2: {}", e))?;
-        all_tensors.extend(tensors2);
-        all_tensors
+    let model = if use_3b {
+        // GGUF Q4_K_M — quantized 3B model (~1.9 GB)
+        let gguf_bytes = ensure_cached("coder3b-q4km.gguf", MODEL_3B_GGUF_URL, ws).await?;
+        console_log!("[Coder] Rakennetaan kvantisoidun 3B-mallia (Q4_K_M)...");
+        // The same cursor is used for the header parse and then handed to from_gguf.
+        let mut cursor = std::io::Cursor::new(&gguf_bytes[..]);
+        let content = gguf_file::Content::read(&mut cursor)
+            .map_err(|e| format!("GGUF parse: {}", e))?;
+        let qmodel = QwenQuantizedModel::from_gguf(content, &mut cursor, &device)
+            .map_err(|e| format!("GGUF model: {}", e))?;
+        CoderModel::Quantized(qmodel)
     } else {
         let model_bytes = ensure_cached("coder05b-model.safetensors", MODEL_05B_URL, ws).await?;
         console_log!("[Coder] Rakennetaan 0.5B-mallia...");
-        candle_core::safetensors::load_buffer(&model_bytes[..], &device)
-            .map_err(|e| format!("Safetensors: {}", e))?
-    };
-
-    let vb = VarBuilder::from_tensors(tensors, dtype, &device);
-    let config = if use_3b {
-        QwenConfig {
-            vocab_size: 151936, hidden_size: 2048, intermediate_size: 11008,
-            num_hidden_layers: 36, num_attention_heads: 16, num_key_value_heads: 2,
-            max_position_embeddings: 32768, sliding_window: 32768, max_window_layers: 36,
-            tie_word_embeddings: true, rope_theta: 1000000.0, rms_norm_eps: 1e-6,
-            use_sliding_window: false, hidden_act: candle_nn::Activation::Silu,
-        }
-    } else {
-        QwenConfig {
+        let tensors = candle_core::safetensors::load_buffer(&model_bytes[..], &device)
+            .map_err(|e| format!("Safetensors: {}", e))?;
+        let config = QwenConfig {
             vocab_size: 151936, hidden_size: 896, intermediate_size: 4864,
             num_hidden_layers: 24, num_attention_heads: 14, num_key_value_heads: 2,
             max_position_embeddings: 32768, sliding_window: 32768, max_window_layers: 21,
             tie_word_embeddings: true, rope_theta: 1000000.0, rms_norm_eps: 1e-6,
             use_sliding_window: false, hidden_act: candle_nn::Activation::Silu,
-        }
+        };
+        let vb = VarBuilder::from_tensors(tensors, dtype, &device);
+        let qwen = QwenModel::new(&config, vb).map_err(|e| format!("Malli: {}", e))?;
+        CoderModel::Full(qwen)
     };
-
-    let model = QwenModel::new(&config, vb).map_err(|e| format!("Malli: {}", e))?;
     console_log!("[Coder] Malli ladattu ja välimuistitettu");
 
     MODEL_CACHE.with(|c| {
diff --git a/network-poc/static/index.html b/network-poc/static/index.html
index 7330a23..8b291d6 100644
--- a/network-poc/static/index.html
+++ b/network-poc/static/index.html
@@ -2229,7 +2229,7 @@ Write the corrected code.`;
       // Mallikatalogista valinta numerolla tai nimellä
       const loadModels = [
         { id: '1', key: '05b', name: 'Qwen2.5-Coder:0.5B', size: '~990 MB', coderSize: '05b' },
-        { id: '2', key: '3b', name: 'Qwen2.5-Coder:3B', size: '~6.2 GB', coderSize: '3b' },
+        { id: '2', key: '3b', name: 'Qwen2.5-Coder:3B Q4', size: '~1.9 GB', coderSize: '3b' },
       ];
       if (!arg) {
         // Näytetään lista
@@ -2268,7 +2268,7 @@ Write the corrected code.`;
       if (sub === 'models') {
         termLog(' Käytettävissä olevat mallit:', '#c9d1d9');
         termLog(' 1 qwen-coder Qwen2.5-Coder:0.5B ~990 MB | koodin generointi');
-        termLog(' 2 qwen-coder-3b Qwen2.5-Coder:3B ~6.2 GB | parempi koodinlaatu');
+        termLog(' 2 qwen-coder-3b Qwen2.5-Coder:3B Q4 ~1.9 GB | kvantisoidtu, parempi laatu');
         termLog(' 3 smollm-135m SmolLM 135M ~270 MB | kevyt, nopea');
         termLog(' 4 qwen-05b Qwen2.5:0.5B ~990 MB | yleismalli');
         termLog(' 5 phi3-mini Phi-3 Mini ~2.2 GB | Microsoftin malli');