GGUF Q4_K_M -tuki 3B-mallille: kvantisoitu versio (~1.9 GB) mahtuu selaimeen
Safetensors-muotoinen 3B (~6.2 GB) aiheutti WASM capacity overflow. Nyt käytetään candle quantized_qwen2 -moduulia GGUF-tiedoston lataamiseen. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
use candle_core::{Device, Tensor, DType};
|
||||
use candle_core::quantized::gguf_file;
|
||||
use candle_nn::VarBuilder;
|
||||
use candle_transformers::models::qwen2::{Config as QwenConfig, ModelForCausalLM as QwenModel};
|
||||
use candle_transformers::models::quantized_qwen2::ModelWeights as QwenQuantizedModel;
|
||||
use wasm_bindgen::JsCast;
|
||||
use std::cell::RefCell;
|
||||
use std::rc::Rc;
|
||||
@@ -16,13 +18,36 @@ macro_rules! console_log {
|
||||
// 0.5B — safetensors weights and tokenizer, both served from the Hugging Face hub.
const MODEL_05B_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct/resolve/main/model.safetensors";
|
||||
const TOKENIZER_05B_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct/resolve/main/tokenizer.json";
|
||||
|
||||
// 3B — better quality, needs more memory (~6 GB download, ~12 GB RAM)
|
||||
const MODEL_3B_PART1_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct/resolve/main/model-00001-of-00002.safetensors";
|
||||
const MODEL_3B_PART2_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct/resolve/main/model-00002-of-00002.safetensors";
|
||||
// 3B GGUF Q4_K_M — quantized, fits in the browser (~1.9 GB)
|
||||
const MODEL_3B_GGUF_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct-GGUF/resolve/main/qwen2.5-coder-3b-instruct-q4_k_m.gguf";
|
||||
const TOKENIZER_3B_URL: &str = "https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct/resolve/main/tokenizer.json";
|
||||
|
||||
/// Either of the two Qwen2 model implementations the coder can run on,
/// so callers can hold one value regardless of which was loaded.
enum CoderModel {
|
||||
/// Non-quantized `ModelForCausalLM` from `candle_transformers::models::qwen2`.
Full(QwenModel),
|
||||
/// `ModelWeights` from `candle_transformers::models::quantized_qwen2` (loaded from GGUF).
Quantized(QwenQuantizedModel),
|
||||
}
|
||||
|
||||
// Dispatch helpers that forward each call to whichever variant is held.
impl CoderModel {
|
||||
/// Runs a forward pass on the wrapped model, passing `x` and `pos` through unchanged
/// and returning the wrapped model's result as-is.
fn forward(&mut self, x: &Tensor, pos: usize) -> candle_core::Result<Tensor> {
|
||||
match self {
|
||||
CoderModel::Full(m) => m.forward(x, pos),
|
||||
CoderModel::Quantized(m) => m.forward(x, pos),
|
||||
}
|
||||
}
|
||||
|
||||
/// Clears the KV cache of the `Full` variant; intentionally does nothing for `Quantized`.
fn clear_kv_cache(&mut self) {
|
||||
match self {
|
||||
CoderModel::Full(m) => m.clear_kv_cache(),
|
||||
CoderModel::Quantized(_) => {
|
||||
// The quantized model resets its KV cache automatically when forward is called with pos=0
|
||||
// (see quantized_qwen2.rs line 118: if index_pos == 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A built model kept together with its tokenizer and a flag recording the model size.
// NOTE(review): two `model` fields appear below because this text is a rendered diff;
// presumably the `QwenModel` line is the removed one and `CoderModel` its replacement — confirm.
struct CachedModel {
|
||||
model: QwenModel,
|
||||
model: CoderModel,
|
||||
tokenizer: tokenizers::Tokenizer,
|
||||
// presumably true when the cached model is the 3B variant — verify against get_or_build_model
is_3b: bool,
|
||||
}
|
||||
@@ -189,43 +214,32 @@ async fn get_or_build_model(use_3b: bool, ws: &Rc<RefCell<WebSocket>>) -> Result
|
||||
.map_err(|e| format!("Tokenizer: {}", e))?;
|
||||
|
||||
// Painot
|
||||
let tensors = if use_3b {
|
||||
let part1 = ensure_cached("coder3b-model-part1.safetensors", MODEL_3B_PART1_URL, ws).await?;
|
||||
let part2 = ensure_cached("coder3b-model-part2.safetensors", MODEL_3B_PART2_URL, ws).await?;
|
||||
console_log!("[Coder] Rakennetaan 3B-mallia...");
|
||||
let mut all_tensors = candle_core::safetensors::load_buffer(&part1[..], &device)
|
||||
.map_err(|e| format!("Part1: {}", e))?;
|
||||
let tensors2 = candle_core::safetensors::load_buffer(&part2[..], &device)
|
||||
.map_err(|e| format!("Part2: {}", e))?;
|
||||
all_tensors.extend(tensors2);
|
||||
all_tensors
|
||||
let model = if use_3b {
|
||||
// GGUF Q4_K_M — kvantisoidtu 3B-malli (~1.9 GB)
|
||||
let gguf_bytes = ensure_cached("coder3b-q4km.gguf", MODEL_3B_GGUF_URL, ws).await?;
|
||||
console_log!("[Coder] Rakennetaan kvantisoidun 3B-mallia (Q4_K_M)...");
|
||||
let mut cursor = std::io::Cursor::new(&gguf_bytes[..]);
|
||||
let content = gguf_file::Content::read(&mut cursor)
|
||||
.map_err(|e| format!("GGUF parse: {}", e))?;
|
||||
let qmodel = QwenQuantizedModel::from_gguf(content, &mut cursor, &device)
|
||||
.map_err(|e| format!("GGUF model: {}", e))?;
|
||||
CoderModel::Quantized(qmodel)
|
||||
} else {
|
||||
let model_bytes = ensure_cached("coder05b-model.safetensors", MODEL_05B_URL, ws).await?;
|
||||
console_log!("[Coder] Rakennetaan 0.5B-mallia...");
|
||||
candle_core::safetensors::load_buffer(&model_bytes[..], &device)
|
||||
.map_err(|e| format!("Safetensors: {}", e))?
|
||||
};
|
||||
|
||||
let vb = VarBuilder::from_tensors(tensors, dtype, &device);
|
||||
let config = if use_3b {
|
||||
QwenConfig {
|
||||
vocab_size: 151936, hidden_size: 2048, intermediate_size: 11008,
|
||||
num_hidden_layers: 36, num_attention_heads: 16, num_key_value_heads: 2,
|
||||
max_position_embeddings: 32768, sliding_window: 32768, max_window_layers: 36,
|
||||
tie_word_embeddings: true, rope_theta: 1000000.0, rms_norm_eps: 1e-6,
|
||||
use_sliding_window: false, hidden_act: candle_nn::Activation::Silu,
|
||||
}
|
||||
} else {
|
||||
QwenConfig {
|
||||
let tensors = candle_core::safetensors::load_buffer(&model_bytes[..], &device)
|
||||
.map_err(|e| format!("Safetensors: {}", e))?;
|
||||
let config = QwenConfig {
|
||||
vocab_size: 151936, hidden_size: 896, intermediate_size: 4864,
|
||||
num_hidden_layers: 24, num_attention_heads: 14, num_key_value_heads: 2,
|
||||
max_position_embeddings: 32768, sliding_window: 32768, max_window_layers: 21,
|
||||
tie_word_embeddings: true, rope_theta: 1000000.0, rms_norm_eps: 1e-6,
|
||||
use_sliding_window: false, hidden_act: candle_nn::Activation::Silu,
|
||||
}
|
||||
};
|
||||
let vb = VarBuilder::from_tensors(tensors, dtype, &device);
|
||||
let qwen = QwenModel::new(&config, vb).map_err(|e| format!("Malli: {}", e))?;
|
||||
CoderModel::Full(qwen)
|
||||
};
|
||||
|
||||
let model = QwenModel::new(&config, vb).map_err(|e| format!("Malli: {}", e))?;
|
||||
console_log!("[Coder] Malli ladattu ja välimuistitettu");
|
||||
|
||||
MODEL_CACHE.with(|c| {
|
||||
|
||||
@@ -2229,7 +2229,7 @@ Write the corrected code.`;
|
||||
// Mallikatalogista valinta numerolla tai nimellä
|
||||
const loadModels = [
|
||||
{ id: '1', key: '05b', name: 'Qwen2.5-Coder:0.5B', size: '~990 MB', coderSize: '05b' },
|
||||
{ id: '2', key: '3b', name: 'Qwen2.5-Coder:3B', size: '~6.2 GB', coderSize: '3b' },
|
||||
{ id: '2', key: '3b', name: 'Qwen2.5-Coder:3B Q4', size: '~1.9 GB', coderSize: '3b' },
|
||||
];
|
||||
if (!arg) {
|
||||
// Näytetään lista
|
||||
@@ -2268,7 +2268,7 @@ Write the corrected code.`;
|
||||
if (sub === 'models') {
|
||||
termLog(' Käytettävissä olevat mallit:', '#c9d1d9');
|
||||
termLog(' <span style="color:#58a6ff">1</span> qwen-coder Qwen2.5-Coder:0.5B <span style="color:#8b949e">~990 MB | koodin generointi</span>');
|
||||
termLog(' <span style="color:#58a6ff">2</span> qwen-coder-3b Qwen2.5-Coder:3B <span style="color:#8b949e">~6.2 GB | parempi koodinlaatu</span>');
|
||||
termLog(' <span style="color:#58a6ff">2</span> qwen-coder-3b Qwen2.5-Coder:3B Q4 <span style="color:#8b949e">~1.9 GB | kvantisoidtu, parempi laatu</span>');
|
||||
termLog(' <span style="color:#58a6ff">3</span> smollm-135m SmolLM 135M <span style="color:#8b949e">~270 MB | kevyt, nopea</span>');
|
||||
termLog(' <span style="color:#58a6ff">4</span> qwen-05b Qwen2.5:0.5B <span style="color:#8b949e">~990 MB | yleismalli</span>');
|
||||
termLog(' <span style="color:#58a6ff">5</span> phi3-mini Phi-3 Mini <span style="color:#8b949e">~2.2 GB | Microsoftin malli</span>');
|
||||
|
||||
Reference in New Issue
Block a user