diff --git a/network-poc/Dockerfile.native-node b/network-poc/Dockerfile.native-node index 2e53736..6d4d2eb 100644 --- a/network-poc/Dockerfile.native-node +++ b/network-poc/Dockerfile.native-node @@ -1,13 +1,9 @@ -FROM nvidia/cuda:12.6.3-devel-ubuntu24.04 AS builder +FROM rust:slim AS builder RUN apt-get update && apt-get install -y \ - curl pkg-config libssl-dev g++ libvulkan-dev \ + pkg-config libssl-dev g++ libvulkan-dev \ && rm -rf /var/lib/apt/lists/* -# Rust -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -ENV PATH="/root/.cargo/bin:${PATH}" - WORKDIR /app COPY Cargo.toml Cargo.lock ./ COPY hub/Cargo.toml hub/Cargo.toml @@ -24,15 +20,15 @@ RUN mkdir -p hub/src node/src native-node/src cli/src \ && cargo build --release -p native-node 2>/dev/null || true COPY native-node/src native-node/src -# RTX 4090 = sm_89, RTX 3090 = sm_86, RTX 2080 = sm_75 -ENV CUDA_COMPUTE_CAP=89 RUN cargo build --release -p native-node -FROM nvidia/cuda:12.6.3-runtime-ubuntu24.04 +FROM debian:bookworm-slim RUN apt-get update && apt-get install -y ca-certificates libvulkan1 && rm -rf /var/lib/apt/lists/* COPY --from=builder /app/target/release/native-node /usr/local/bin/native-node ENV HUB_URL=ws://agentic-poc:3000/ws +ENV OLLAMA_URL=http://ollama:11434 +ENV OLLAMA_MODEL=qwen2.5-coder:7b ENV ALLOCATED_GB=4 CMD ["native-node"] diff --git a/network-poc/docker-compose.yml b/network-poc/docker-compose.yml index a7529d9..5931b46 100644 --- a/network-poc/docker-compose.yml +++ b/network-poc/docker-compose.yml @@ -11,19 +11,14 @@ services: # Käännetään aina käynnistyksen yhteydessä varmuuden vuoksi Wasm uusimmista koodeista, ja päälle pyöräytetään Hub! command: bash -c "cd node && wasm-pack build --release --target web --out-dir ../static/pkg && cd ../hub && cargo run" - # Valinnainen natiivi-solmu — kerää oikeat laitteistotiedot (nvidia-smi-taso) - native-node: - build: - context: . - dockerfile: Dockerfile.native-node - container_name: kipina_native_node - runtime: nvidia - environment: - - HUB_URL=ws://agentic-poc:3000/ws - - ALLOCATED_GB=4 - - NVIDIA_VISIBLE_DEVICES=all - depends_on: - - agentic-poc + # Ollama — LLM-inferenssi GPU:lla (NVIDIA/AMD/Apple) + ollama: + image: ollama/ollama:latest + container_name: kipina_ollama + ports: + - "11434:11434" + volumes: + - ollama-models:/root/.ollama deploy: resources: reservations: @@ -33,3 +28,23 @@ services: capabilities: [gpu] profiles: - native + + # Natiivisolmu — yhdistää hubiin ja käyttää Ollamaa inferenssiin + native-node: + build: + context: . + dockerfile: Dockerfile.native-node + container_name: kipina_native_node + environment: + - HUB_URL=ws://agentic-poc:3000/ws + - OLLAMA_URL=http://ollama:11434 + - OLLAMA_MODEL=qwen2.5-coder:7b + - ALLOCATED_GB=4 + depends_on: + - agentic-poc + - ollama + profiles: + - native + +volumes: + ollama-models: diff --git a/network-poc/native-node/Cargo.toml b/network-poc/native-node/Cargo.toml index 90896b0..c44b879 100644 --- a/network-poc/native-node/Cargo.toml +++ b/network-poc/native-node/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "native-node" -version = "0.1.0" +version = "0.2.0" edition = "2024" [dependencies] @@ -12,10 +12,6 @@ serde_json = "1.0" sysinfo = "0.30" nvml-wrapper = "0.10" wgpu = "24" -candle-core = { version = "0.8", features = ["cuda"] } -candle-nn = "0.8" -candle-transformers = "0.8" -hf-hub = "0.4" -tokenizers = "0.19" +reqwest = { version = "0.12", features = ["json"] } tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/network-poc/native-node/src/inference.rs b/network-poc/native-node/src/inference.rs index 888debd..9018a26 100644 --- a/network-poc/native-node/src/inference.rs +++ b/network-poc/native-node/src/inference.rs @@ -1,263 +1,107 @@ -use candle_core::{Device, Tensor, DType}; -use candle_nn::VarBuilder; -use candle_transformers::models::qwen2::{Config as QwenConfig, ModelForCausalLM as QwenModel}; -use hf_hub::{api::sync::Api, Repo, RepoType}; use std::time::Instant; -/// Top-k sampling with temperature and repetition penalty -fn sample_top_k(logits: &Tensor, k: usize, temperature: f64, generated_tokens: &[u32], repetition_penalty: f64, rng_state: &mut u64) -> Result { - let mut logits_vec: Vec = logits.to_vec1::().map_err(|e| format!("to_vec1: {}", e))?; - if logits_vec.is_empty() { return Err("Tyhjä logits".to_string()); } - - // Repetition penalty: rankaisee jo generoituja tokeneita - for &token_id in generated_tokens { - if (token_id as usize) < logits_vec.len() { - let logit = &mut logits_vec[token_id as usize]; - if *logit > 0.0 { - *logit /= repetition_penalty as f32; - } else { - *logit *= repetition_penalty as f32; - } - } - } - - // Temperature scaling - if temperature > 0.0 && temperature != 1.0 { - for logit in logits_vec.iter_mut() { - *logit /= temperature as f32; - } - } - - // Top-k: etsitään k suurinta - let mut indexed: Vec<(usize, f32)> = logits_vec.iter().enumerate().map(|(i, &v)| (i, v)).collect(); - indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); - indexed.truncate(k); - - if k == 1 || temperature == 0.0 { - return Ok(indexed[0].0 as u32); - } - - // Softmax top-k:lle - let max_logit = indexed[0].1; - let exps: Vec = indexed.iter().map(|x| (x.1 - max_logit).exp()).collect(); - let sum: f32 = exps.iter().sum(); - let probs: Vec = exps.iter().map(|e| e / sum).collect(); - - // XorShift64 RNG - *rng_state ^= *rng_state << 13; - *rng_state ^= *rng_state >> 7; - *rng_state ^= *rng_state << 17; - let rand_val = (*rng_state % 10000) as f32 / 10000.0; - - let mut cumulative = 0.0; - for (i, p) in probs.iter().enumerate() { - cumulative += p; - if rand_val < cumulative { - return Ok(indexed[i].0 as u32); - } - } - - Ok(indexed[0].0 as u32) -} - pub struct LlmEngine { - tokenizer: tokenizers::Tokenizer, - model: QwenModel, - device: Device, - eos_token: u32, + ollama_url: String, + model: String, + client: reqwest::Client, } impl LlmEngine { pub fn load() -> Result { - // Candle 0.8: RMS-norm ei tue CUDA:a → käytetään CPU:ta - // Natiivi CPU on silti ~10-20× nopeampi kuin WASM (multi-threaded, ei browser overhead) - let device = Device::Cpu; - let device_name = "CPU (native)"; - tracing::info!("LLM device: {}", device_name); + let ollama_url = std::env::var("OLLAMA_URL").unwrap_or_else(|_| "http://localhost:11434".to_string()); + let model = std::env::var("OLLAMA_MODEL").unwrap_or_else(|_| "qwen2.5-coder:7b".to_string()); - let dtype = DType::F32; + tracing::info!("Ollama backend: {} | malli: {}", ollama_url, model); - tracing::info!("Ladataan Qwen2.5-Coder-0.5B-Instruct..."); - let api = Api::new().map_err(|e| format!("HF API: {}", e))?; - let repo = api.repo(Repo::with_revision( - "Qwen/Qwen2.5-Coder-0.5B-Instruct".to_string(), - RepoType::Model, - "main".to_string(), - )); + let client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(600)) + .build() + .map_err(|e| format!("HTTP client: {}", e))?; - let tokenizer_path = repo.get("tokenizer.json").map_err(|e| format!("Tokenizer lataus: {}", e))?; - let model_path = repo.get("model.safetensors").map_err(|e| format!("Malli lataus: {}", e))?; - - tracing::info!("Ladataan tokenizer: {:?}", tokenizer_path); - let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path) - .map_err(|e| format!("Tokenizer: {}", e))?; - - let config = QwenConfig { - vocab_size: 151936, - hidden_size: 896, - intermediate_size: 4864, - num_hidden_layers: 24, - num_attention_heads: 14, - num_key_value_heads: 2, - max_position_embeddings: 32768, - sliding_window: 32768, - max_window_layers: 21, - tie_word_embeddings: true, - rope_theta: 1000000.0, - rms_norm_eps: 1e-6, - use_sliding_window: false, - hidden_act: candle_nn::Activation::Silu, - }; - - let start = Instant::now(); - let vb = unsafe { - VarBuilder::from_mmaped_safetensors(&[model_path.clone()], dtype, &device) - .map_err(|e| format!("VarBuilder: {}", e))? - }; - let model = QwenModel::new(&config, vb).map_err(|e| format!("Malli: {}", e))?; - tracing::info!("Malli ladattu ({:.1}s) — {}", start.elapsed().as_secs_f64(), device_name); - - Ok(LlmEngine { - tokenizer, - model, - device, - eos_token: 151645, - }) + Ok(LlmEngine { ollama_url, model, client }) } - pub fn generate(&mut self, prompt: &str, max_tokens: usize) -> Result { - // Prefill: aloitetaan vastaus ```-koodiblokkilla → malli jatkaa suoraan koodilla - let formatted = format!("<|im_start|>system\nYou are a coding assistant. Respond with ONLY code. No explanations, no markdown, no comments unless asked.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n```\n", prompt); + pub fn model_name(&self) -> &str { + &self.model + } - let encoding = self.tokenizer.encode(formatted.as_str(), true) - .map_err(|e| format!("Encode: {}", e))?; - let input_ids: Vec = encoding.get_ids().to_vec(); - let input_len = input_ids.len(); + /// Varmistaa että malli on ladattu Ollamaan (ollama pull) + pub async fn ensure_model(&self) -> Result<(), String> { + tracing::info!("Tarkistetaan malli {}...", self.model); + let resp = self.client.post(format!("{}/api/pull", self.ollama_url)) + .json(&serde_json::json!({ "name": self.model, "stream": false })) + .send() + .await + .map_err(|e| format!("Ollama pull: {}", e))?; - // Nollataan KV-cache edellisestä promptista - self.model.clear_kv_cache(); + if resp.status().is_success() { + tracing::info!("Malli {} valmis", self.model); + Ok(()) + } else { + Err(format!("Ollama pull epäonnistui: {}", resp.status())) + } + } - // Sampling-parametrit - let temperature = 0.7; - let top_k = 40; - let repetition_penalty = 1.15; - let mut rng_state: u64 = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_nanos() as u64; + pub async fn generate(&self, prompt: &str, max_tokens: usize) -> Result { + let system = "You are a coding assistant. Respond with ONLY code. No explanations, no markdown, no comments unless asked."; let start = Instant::now(); - - // Prefill - let input = Tensor::new(input_ids.as_slice(), &self.device) - .and_then(|t| t.unsqueeze(0)) - .map_err(|e| format!("Tensor: {}", e))?; - - let logits = self.model.forward(&input, 0) - .map_err(|e| format!("Forward prefill: {}", e))?; - - let logits = logits.squeeze(0).map_err(|e| format!("Squeeze: {}", e))?; - let logits = if logits.dims().len() == 2 { - let seq_len = logits.dim(0).map_err(|e| format!("Dim: {}", e))?; - if seq_len == 0 { return Err("Tyhjä tensori".to_string()); } - logits.get(seq_len - 1).map_err(|e| format!("Get: {}", e))? - } else { - logits - }; - - let mut generated_text = String::new(); - let mut tokens_generated: usize = 0; - let mut all_tokens: Vec = Vec::new(); - - let mut next_token = sample_top_k(&logits, top_k, temperature, &all_tokens, repetition_penalty, &mut rng_state)?; - - if next_token != self.eos_token { - if let Ok(text) = self.tokenizer.decode(&[next_token], true) { - generated_text.push_str(&text); - } - all_tokens.push(next_token); - tokens_generated += 1; - } - - // Autoregressive - let mut pos = input_len; - for _ in 1..max_tokens { - if next_token == self.eos_token { break; } - - let input = Tensor::new(&[next_token], &self.device) - .and_then(|t| t.unsqueeze(0)) - .map_err(|e| format!("Tensor: {}", e))?; - - let logits = self.model.forward(&input, pos) - .map_err(|e| format!("Forward pos {}: {}", pos, e))?; - - let logits = logits.squeeze(0).map_err(|e| format!("Squeeze: {}", e))?; - let logits = if logits.dims().len() == 2 { - let seq_len = logits.dim(0).map_err(|e| format!("Dim: {}", e))?; - if seq_len == 0 { break; } - logits.get(seq_len - 1).map_err(|e| format!("Get: {}", e))? - } else { - logits - }; - next_token = sample_top_k(&logits, top_k, temperature, &all_tokens, repetition_penalty, &mut rng_state)?; - pos += 1; - - if next_token == self.eos_token { break; } - - if let Ok(text) = self.tokenizer.decode(&[next_token], true) { - generated_text.push_str(&text); - - // Stop-sekvenssit: katkaistaan kun malli alkaa selittää - let lower = generated_text.to_lowercase(); - if lower.contains("\n###") || lower.contains("\nexplanation") || lower.contains("\nnote:") || lower.contains("\noutput:") || lower.contains("\n```\n\n") || lower.contains("\n// example") || lower.contains("\n# example") { - for stop in &["\n###", "\nExplanation", "\nNote:", "\nOutput:", "\n```\n\n", "\n// Example", "\n// example", "\n# Example", "\n# example"] { - if let Some(pos) = generated_text.find(stop) { - generated_text.truncate(pos); - } - } - break; + let resp = self.client.post(format!("{}/api/generate", self.ollama_url)) + .json(&serde_json::json!({ + "model": self.model, + "prompt": prompt, + "system": system, + "stream": false, + "options": { + "num_predict": max_tokens, + "temperature": 0.7, + "top_k": 40, + "repeat_penalty": 1.15, + "stop": ["<|im_end|>", "\n###", "\nExplanation", "\nNote:"] } - } - all_tokens.push(next_token); - tokens_generated += 1; + })) + .send() + .await + .map_err(|e| format!("Ollama generate: {}", e))?; + + if !resp.status().is_success() { + return Err(format!("Ollama HTTP {}", resp.status())); } - let gen_time = start.elapsed(); - let tokens_per_sec = if gen_time.as_secs_f64() > 0.0 { - tokens_generated as f64 / gen_time.as_secs_f64() + let body: serde_json::Value = resp.json().await + .map_err(|e| format!("Ollama JSON: {}", e))?; + + let text = body["response"].as_str().unwrap_or("").to_string(); + let total_duration_ns = body["total_duration"].as_u64().unwrap_or(0); + let eval_count = body["eval_count"].as_u64().unwrap_or(0) as usize; + let eval_duration_ns = body["eval_duration"].as_u64().unwrap_or(1); + + let duration_ms = start.elapsed().as_millis() as f64; + let tokens_per_sec = if eval_duration_ns > 0 { + eval_count as f64 / (eval_duration_ns as f64 / 1_000_000_000.0) } else { 0.0 }; Ok(GenerateResult { - text: strip_markdown_wrapper(&generated_text), - tokens_generated, - duration_ms: gen_time.as_millis() as f64, + text: strip_code_fences(&text), + tokens_generated: eval_count, + duration_ms, tokens_per_sec, }) } } -const LANG_TAGS: &[&str] = &[ - "python", "py", "rust", "rs", "javascript", "js", "typescript", "ts", - "java", "kotlin", "scala", "go", "ruby", "rb", "php", "swift", - "c", "cpp", "c++", "c#", "csharp", "r", "sql", "bash", "sh", "zsh", - "html", "css", "json", "yaml", "yml", "toml", "xml", "markdown", "md", - "lua", "perl", "dart", "elixir", "haskell", "hs", "ocaml", "zig", - "plaintext", "text", "txt", -]; - -/// Siivoa mallin tuottama vastaus (prefill-yhteensopiva). -fn strip_markdown_wrapper(text: &str) -> String { +/// Siivoa mahdolliset markdown-koodiblokki-merkit +fn strip_code_fences(text: &str) -> String { let mut result = text.trim().to_string(); - // 1. Kielitunniste — VAIN tunnettu kieli - if let Some(nl) = result.find('\n') { - let first = result[..nl].trim().to_lowercase(); - if LANG_TAGS.contains(&first.as_str()) { + // Poista aloittava ```lang + if result.starts_with("```") { + if let Some(nl) = result.find('\n') { result = result[nl + 1..].to_string(); } } - // 2. Sulkeva ``` — VAIN omalla rivillään lopussa + // Poista sulkeva ``` let trimmed = result.trim_end(); if trimmed.ends_with("```") { let before = &trimmed[..trimmed.len() - 3]; @@ -266,29 +110,7 @@ fn strip_markdown_wrapper(text: &str) -> String { } } - // 3. Johdantolauseet - let lower = result.trim().to_lowercase(); - for prefix in &["sure!", "here is", "here's", "certainly!", "below is"] { - if lower.starts_with(prefix) { - if let Some(nl) = result.find('\n') { result = result[nl + 1..].to_string(); } - break; - } - } - - // 4. Selityskommentit alusta - let mut lines: Vec<&str> = result.trim().lines().collect(); - while !lines.is_empty() { - let first = lines[0].trim(); - let is_preamble = first.starts_with("# ") && !first.starts_with("#!") - && (first.to_lowercase().contains("this is") - || first.to_lowercase().contains("simple") - || first.to_lowercase().contains("program that") - || first.to_lowercase().contains("here is") - || first.to_lowercase().contains("the following") - || first.to_lowercase().contains("below")); - if is_preamble { lines.remove(0); } else { break; } - } - lines.join("\n").trim().to_string() + result } pub struct GenerateResult { diff --git a/network-poc/native-node/src/main.rs b/network-poc/native-node/src/main.rs index dffbc1f..6ec4de9 100644 --- a/network-poc/native-node/src/main.rs +++ b/network-poc/native-node/src/main.rs @@ -285,15 +285,19 @@ async fn main() { } } - // Ladataan LLM-malli - tracing::info!("Ladataan LLM-mallia..."); - let mut llm = match inference::LlmEngine::load() { + // Ollama-backend + tracing::info!("Alustetaan Ollama-yhteyttä..."); + let llm = match inference::LlmEngine::load() { Ok(engine) => { - tracing::info!("LLM valmis inferenssiin!"); + // Varmistetaan malli (ollama pull) — odotetaan kunnes valmis + match engine.ensure_model().await { + Ok(()) => tracing::info!("Ollama valmis inferenssiin!"), + Err(e) => tracing::warn!("Mallin lataus: {} — yritetään silti", e), + } Some(engine) } Err(e) => { - tracing::warn!("LLM-lataus epäonnistui: {} — toimitaan ilman inferenssiä", e); + tracing::warn!("Ollama-alustus epäonnistui: {} — toimitaan ilman inferenssiä", e); None } }; @@ -324,12 +328,13 @@ async fn main() { if !prompt.is_empty() && msg_model.starts_with("qwen-coder") { - if let Some(ref mut engine) = llm { + if let Some(ref engine) = llm { busy = true; let max_tokens = task.get("max_tokens").and_then(|v| v.as_u64()).unwrap_or(512) as usize; - tracing::info!("Generoidaan (task_id: {}, max_tokens: {}): \"{}\"", task_id, max_tokens, prompt); + tracing::info!("Generoidaan (task_id: {}, max_tokens: {}): \"{}\"", task_id, max_tokens, &prompt[..prompt.len().min(100)]); - match engine.generate(prompt, max_tokens) { + let model_name = engine.model_name(); + match engine.generate(prompt, max_tokens).await { Ok(result) => { tracing::info!( "Tulos: {} tokenia | {:.0}ms | {:.1} tok/s | \"{}\"", @@ -342,7 +347,7 @@ async fn main() { let done = json!({ "type": "llm_done", "prompt": prompt, - "model": "Qwen2.5-Coder-0.5B (native/GPU)", + "model": format!("{} (Ollama)", model_name), "response": result.text, "tokens_generated": result.tokens_generated, "duration_ms": result.duration_ms, diff --git a/network-poc/static/index.html b/network-poc/static/index.html index d370b00..ee83c0d 100644 --- a/network-poc/static/index.html +++ b/network-poc/static/index.html @@ -2282,10 +2282,12 @@ Files: ${Object.keys(generatedFiles).join(', ')}`; if (sub === 'models') { termLog(' Selain (kpn load):', '#c9d1d9'); - termLog(' qwen-coder Qwen2.5-Coder:0.5B ~990 MB | WASM ~0.4 tok/s'); - termLog(' Natiivi (Docker GPU/CPU):', '#c9d1d9'); - termLog(' qwen-coder Qwen2.5-Coder:0.5B ~990 MB | ~8 tok/s'); - termLog(' Käyttö: kpn run coder "<prompti>"', '#8b949e'); + termLog(' qwen-coder:0.5b ~990 MB | WASM ~0.4 tok/s'); + termLog(' Natiivi (Ollama + GPU):', '#c9d1d9'); + termLog(' qwen2.5-coder:7b ~4.7 GB | NVIDIA ~80 tok/s | AMD ~40 tok/s | Apple ~30 tok/s'); + termLog(' qwen2.5-coder:3b ~1.9 GB | NVIDIA ~120 tok/s'); + termLog(' qwen2.5-coder:1.5b ~1 GB | NVIDIA ~150 tok/s'); + termLog(' Vaihda malli: OLLAMA_MODEL=qwen2.5-coder:7b', '#8b949e'); termLog(' Hub reitittää automaattisesti nopeimmalle solmulle', '#8b949e'); return; }