Ollama-integraatio: GPU-inferenssi NVIDIA/AMD/Apple, ei Candle-rajoitteita

- docker-compose: Ollama-container GPU:lla + persistent volume malleille
- native-node: Candle poistettu, kutsuu Ollaman HTTP API:a (async)
- Dockerfile: yksinkertaistettu, ei CUDA SDK:ta (Ollama hoitaa GPU:n)
- Tukee kaikkia malleja: qwen2.5-coder:1.5b/3b/7b/14b/32b
- OLLAMA_MODEL ympäristömuuttujalla vaihdetaan malli
- kpn models näyttää Ollama-mallit nopeustiedoilla

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-07 06:22:11 +03:00
parent d8443792a3
commit 3eb0c4d939
6 changed files with 126 additions and 290 deletions

View File

@@ -1,13 +1,9 @@
FROM nvidia/cuda:12.6.3-devel-ubuntu24.04 AS builder
FROM rust:slim AS builder
RUN apt-get update && apt-get install -y \
curl pkg-config libssl-dev g++ libvulkan-dev \
pkg-config libssl-dev g++ libvulkan-dev \
&& rm -rf /var/lib/apt/lists/*
# Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /app
COPY Cargo.toml Cargo.lock ./
COPY hub/Cargo.toml hub/Cargo.toml
@@ -24,15 +20,15 @@ RUN mkdir -p hub/src node/src native-node/src cli/src \
&& cargo build --release -p native-node 2>/dev/null || true
COPY native-node/src native-node/src
# RTX 4090 = sm_89, RTX 3090 = sm_86, RTX 2080 = sm_75
ENV CUDA_COMPUTE_CAP=89
RUN cargo build --release -p native-node
FROM nvidia/cuda:12.6.3-runtime-ubuntu24.04
FROM debian:bookworm-slim
RUN apt-get update && apt-get install -y ca-certificates libvulkan1 && rm -rf /var/lib/apt/lists/*
COPY --from=builder /app/target/release/native-node /usr/local/bin/native-node
ENV HUB_URL=ws://agentic-poc:3000/ws
ENV OLLAMA_URL=http://ollama:11434
ENV OLLAMA_MODEL=qwen2.5-coder:7b
ENV ALLOCATED_GB=4
CMD ["native-node"]

View File

@@ -11,19 +11,14 @@ services:
# Käännetään aina käynnistyksen yhteydessä varmuuden vuoksi Wasm uusimmista koodeista, ja päälle pyöräytetään Hub!
command: bash -c "cd node && wasm-pack build --release --target web --out-dir ../static/pkg && cd ../hub && cargo run"
# Valinnainen natiivi-solmu — kerää oikeat laitteistotiedot (nvidia-smi-taso)
native-node:
build:
context: .
dockerfile: Dockerfile.native-node
container_name: kipina_native_node
runtime: nvidia
environment:
- HUB_URL=ws://agentic-poc:3000/ws
- ALLOCATED_GB=4
- NVIDIA_VISIBLE_DEVICES=all
depends_on:
- agentic-poc
# Ollama — LLM-inferenssi GPU:lla (NVIDIA/AMD/Apple)
ollama:
image: ollama/ollama:latest
container_name: kipina_ollama
ports:
- "11434:11434"
volumes:
- ollama-models:/root/.ollama
deploy:
resources:
reservations:
@@ -33,3 +28,23 @@ services:
capabilities: [gpu]
profiles:
- native
# Natiivisolmu — yhdistää hubiin ja käyttää Ollamaa inferenssiin
native-node:
build:
context: .
dockerfile: Dockerfile.native-node
container_name: kipina_native_node
environment:
- HUB_URL=ws://agentic-poc:3000/ws
- OLLAMA_URL=http://ollama:11434
- OLLAMA_MODEL=qwen2.5-coder:7b
- ALLOCATED_GB=4
depends_on:
- agentic-poc
- ollama
profiles:
- native
volumes:
ollama-models:

View File

@@ -1,6 +1,6 @@
[package]
name = "native-node"
version = "0.1.0"
version = "0.2.0"
edition = "2024"
[dependencies]
@@ -12,10 +12,6 @@ serde_json = "1.0"
sysinfo = "0.30"
nvml-wrapper = "0.10"
wgpu = "24"
candle-core = { version = "0.8", features = ["cuda"] }
candle-nn = "0.8"
candle-transformers = "0.8"
hf-hub = "0.4"
tokenizers = "0.19"
reqwest = { version = "0.12", features = ["json"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }

View File

@@ -1,263 +1,107 @@
use candle_core::{Device, Tensor, DType};
use candle_nn::VarBuilder;
use candle_transformers::models::qwen2::{Config as QwenConfig, ModelForCausalLM as QwenModel};
use hf_hub::{api::sync::Api, Repo, RepoType};
use std::time::Instant;
/// Top-k sampling with temperature and repetition penalty
fn sample_top_k(logits: &Tensor, k: usize, temperature: f64, generated_tokens: &[u32], repetition_penalty: f64, rng_state: &mut u64) -> Result<u32, String> {
let mut logits_vec: Vec<f32> = logits.to_vec1::<f32>().map_err(|e| format!("to_vec1: {}", e))?;
if logits_vec.is_empty() { return Err("Tyhjä logits".to_string()); }
// Repetition penalty: rankaisee jo generoituja tokeneita
for &token_id in generated_tokens {
if (token_id as usize) < logits_vec.len() {
let logit = &mut logits_vec[token_id as usize];
if *logit > 0.0 {
*logit /= repetition_penalty as f32;
} else {
*logit *= repetition_penalty as f32;
}
}
}
// Temperature scaling
if temperature > 0.0 && temperature != 1.0 {
for logit in logits_vec.iter_mut() {
*logit /= temperature as f32;
}
}
// Top-k: etsitään k suurinta
let mut indexed: Vec<(usize, f32)> = logits_vec.iter().enumerate().map(|(i, &v)| (i, v)).collect();
indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
indexed.truncate(k);
if k == 1 || temperature == 0.0 {
return Ok(indexed[0].0 as u32);
}
// Softmax top-k:lle
let max_logit = indexed[0].1;
let exps: Vec<f32> = indexed.iter().map(|x| (x.1 - max_logit).exp()).collect();
let sum: f32 = exps.iter().sum();
let probs: Vec<f32> = exps.iter().map(|e| e / sum).collect();
// XorShift64 RNG
*rng_state ^= *rng_state << 13;
*rng_state ^= *rng_state >> 7;
*rng_state ^= *rng_state << 17;
let rand_val = (*rng_state % 10000) as f32 / 10000.0;
let mut cumulative = 0.0;
for (i, p) in probs.iter().enumerate() {
cumulative += p;
if rand_val < cumulative {
return Ok(indexed[i].0 as u32);
}
}
Ok(indexed[0].0 as u32)
}
pub struct LlmEngine {
tokenizer: tokenizers::Tokenizer,
model: QwenModel,
device: Device,
eos_token: u32,
ollama_url: String,
model: String,
client: reqwest::Client,
}
impl LlmEngine {
pub fn load() -> Result<Self, String> {
// Candle 0.8: RMS-norm ei tue CUDA:a → käytetään CPU:ta
// Natiivi CPU on silti ~10-20× nopeampi kuin WASM (multi-threaded, ei browser overhead)
let device = Device::Cpu;
let device_name = "CPU (native)";
tracing::info!("LLM device: {}", device_name);
let ollama_url = std::env::var("OLLAMA_URL").unwrap_or_else(|_| "http://localhost:11434".to_string());
let model = std::env::var("OLLAMA_MODEL").unwrap_or_else(|_| "qwen2.5-coder:7b".to_string());
let dtype = DType::F32;
tracing::info!("Ollama backend: {} | malli: {}", ollama_url, model);
tracing::info!("Ladataan Qwen2.5-Coder-0.5B-Instruct...");
let api = Api::new().map_err(|e| format!("HF API: {}", e))?;
let repo = api.repo(Repo::with_revision(
"Qwen/Qwen2.5-Coder-0.5B-Instruct".to_string(),
RepoType::Model,
"main".to_string(),
));
let client = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(600))
.build()
.map_err(|e| format!("HTTP client: {}", e))?;
let tokenizer_path = repo.get("tokenizer.json").map_err(|e| format!("Tokenizer lataus: {}", e))?;
let model_path = repo.get("model.safetensors").map_err(|e| format!("Malli lataus: {}", e))?;
tracing::info!("Ladataan tokenizer: {:?}", tokenizer_path);
let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
.map_err(|e| format!("Tokenizer: {}", e))?;
let config = QwenConfig {
vocab_size: 151936,
hidden_size: 896,
intermediate_size: 4864,
num_hidden_layers: 24,
num_attention_heads: 14,
num_key_value_heads: 2,
max_position_embeddings: 32768,
sliding_window: 32768,
max_window_layers: 21,
tie_word_embeddings: true,
rope_theta: 1000000.0,
rms_norm_eps: 1e-6,
use_sliding_window: false,
hidden_act: candle_nn::Activation::Silu,
};
let start = Instant::now();
let vb = unsafe {
VarBuilder::from_mmaped_safetensors(&[model_path.clone()], dtype, &device)
.map_err(|e| format!("VarBuilder: {}", e))?
};
let model = QwenModel::new(&config, vb).map_err(|e| format!("Malli: {}", e))?;
tracing::info!("Malli ladattu ({:.1}s) — {}", start.elapsed().as_secs_f64(), device_name);
Ok(LlmEngine {
tokenizer,
model,
device,
eos_token: 151645,
})
Ok(LlmEngine { ollama_url, model, client })
}
pub fn generate(&mut self, prompt: &str, max_tokens: usize) -> Result<GenerateResult, String> {
// Prefill: aloitetaan vastaus ```-koodiblokkilla → malli jatkaa suoraan koodilla
let formatted = format!("<|im_start|>system\nYou are a coding assistant. Respond with ONLY code. No explanations, no markdown, no comments unless asked.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n```\n", prompt);
pub fn model_name(&self) -> &str {
&self.model
}
let encoding = self.tokenizer.encode(formatted.as_str(), true)
.map_err(|e| format!("Encode: {}", e))?;
let input_ids: Vec<u32> = encoding.get_ids().to_vec();
let input_len = input_ids.len();
/// Varmistaa että malli on ladattu Ollamaan (ollama pull)
pub async fn ensure_model(&self) -> Result<(), String> {
tracing::info!("Tarkistetaan malli {}...", self.model);
let resp = self.client.post(format!("{}/api/pull", self.ollama_url))
.json(&serde_json::json!({ "name": self.model, "stream": false }))
.send()
.await
.map_err(|e| format!("Ollama pull: {}", e))?;
// Nollataan KV-cache edellisestä promptista
self.model.clear_kv_cache();
if resp.status().is_success() {
tracing::info!("Malli {} valmis", self.model);
Ok(())
} else {
Err(format!("Ollama pull epäonnistui: {}", resp.status()))
}
}
// Sampling-parametrit
let temperature = 0.7;
let top_k = 40;
let repetition_penalty = 1.15;
let mut rng_state: u64 = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_nanos() as u64;
pub async fn generate(&self, prompt: &str, max_tokens: usize) -> Result<GenerateResult, String> {
let system = "You are a coding assistant. Respond with ONLY code. No explanations, no markdown, no comments unless asked.";
let start = Instant::now();
// Prefill
let input = Tensor::new(input_ids.as_slice(), &self.device)
.and_then(|t| t.unsqueeze(0))
.map_err(|e| format!("Tensor: {}", e))?;
let logits = self.model.forward(&input, 0)
.map_err(|e| format!("Forward prefill: {}", e))?;
let logits = logits.squeeze(0).map_err(|e| format!("Squeeze: {}", e))?;
let logits = if logits.dims().len() == 2 {
let seq_len = logits.dim(0).map_err(|e| format!("Dim: {}", e))?;
if seq_len == 0 { return Err("Tyhjä tensori".to_string()); }
logits.get(seq_len - 1).map_err(|e| format!("Get: {}", e))?
} else {
logits
};
let mut generated_text = String::new();
let mut tokens_generated: usize = 0;
let mut all_tokens: Vec<u32> = Vec::new();
let mut next_token = sample_top_k(&logits, top_k, temperature, &all_tokens, repetition_penalty, &mut rng_state)?;
if next_token != self.eos_token {
if let Ok(text) = self.tokenizer.decode(&[next_token], true) {
generated_text.push_str(&text);
}
all_tokens.push(next_token);
tokens_generated += 1;
}
// Autoregressive
let mut pos = input_len;
for _ in 1..max_tokens {
if next_token == self.eos_token { break; }
let input = Tensor::new(&[next_token], &self.device)
.and_then(|t| t.unsqueeze(0))
.map_err(|e| format!("Tensor: {}", e))?;
let logits = self.model.forward(&input, pos)
.map_err(|e| format!("Forward pos {}: {}", pos, e))?;
let logits = logits.squeeze(0).map_err(|e| format!("Squeeze: {}", e))?;
let logits = if logits.dims().len() == 2 {
let seq_len = logits.dim(0).map_err(|e| format!("Dim: {}", e))?;
if seq_len == 0 { break; }
logits.get(seq_len - 1).map_err(|e| format!("Get: {}", e))?
} else {
logits
};
next_token = sample_top_k(&logits, top_k, temperature, &all_tokens, repetition_penalty, &mut rng_state)?;
pos += 1;
if next_token == self.eos_token { break; }
if let Ok(text) = self.tokenizer.decode(&[next_token], true) {
generated_text.push_str(&text);
// Stop-sekvenssit: katkaistaan kun malli alkaa selittää
let lower = generated_text.to_lowercase();
if lower.contains("\n###") || lower.contains("\nexplanation") || lower.contains("\nnote:") || lower.contains("\noutput:") || lower.contains("\n```\n\n") || lower.contains("\n// example") || lower.contains("\n# example") {
for stop in &["\n###", "\nExplanation", "\nNote:", "\nOutput:", "\n```\n\n", "\n// Example", "\n// example", "\n# Example", "\n# example"] {
if let Some(pos) = generated_text.find(stop) {
generated_text.truncate(pos);
}
}
break;
let resp = self.client.post(format!("{}/api/generate", self.ollama_url))
.json(&serde_json::json!({
"model": self.model,
"prompt": prompt,
"system": system,
"stream": false,
"options": {
"num_predict": max_tokens,
"temperature": 0.7,
"top_k": 40,
"repeat_penalty": 1.15,
"stop": ["<|im_end|>", "\n###", "\nExplanation", "\nNote:"]
}
}
all_tokens.push(next_token);
tokens_generated += 1;
}))
.send()
.await
.map_err(|e| format!("Ollama generate: {}", e))?;
if !resp.status().is_success() {
return Err(format!("Ollama HTTP {}", resp.status()));
}
let gen_time = start.elapsed();
let tokens_per_sec = if gen_time.as_secs_f64() > 0.0 {
tokens_generated as f64 / gen_time.as_secs_f64()
let body: serde_json::Value = resp.json().await
.map_err(|e| format!("Ollama JSON: {}", e))?;
let text = body["response"].as_str().unwrap_or("").to_string();
let total_duration_ns = body["total_duration"].as_u64().unwrap_or(0);
let eval_count = body["eval_count"].as_u64().unwrap_or(0) as usize;
let eval_duration_ns = body["eval_duration"].as_u64().unwrap_or(1);
let duration_ms = start.elapsed().as_millis() as f64;
let tokens_per_sec = if eval_duration_ns > 0 {
eval_count as f64 / (eval_duration_ns as f64 / 1_000_000_000.0)
} else { 0.0 };
Ok(GenerateResult {
text: strip_markdown_wrapper(&generated_text),
tokens_generated,
duration_ms: gen_time.as_millis() as f64,
text: strip_code_fences(&text),
tokens_generated: eval_count,
duration_ms,
tokens_per_sec,
})
}
}
const LANG_TAGS: &[&str] = &[
"python", "py", "rust", "rs", "javascript", "js", "typescript", "ts",
"java", "kotlin", "scala", "go", "ruby", "rb", "php", "swift",
"c", "cpp", "c++", "c#", "csharp", "r", "sql", "bash", "sh", "zsh",
"html", "css", "json", "yaml", "yml", "toml", "xml", "markdown", "md",
"lua", "perl", "dart", "elixir", "haskell", "hs", "ocaml", "zig",
"plaintext", "text", "txt",
];
/// Siivoa mallin tuottama vastaus (prefill-yhteensopiva).
fn strip_markdown_wrapper(text: &str) -> String {
/// Siivoa mahdolliset markdown-koodiblokki-merkit
fn strip_code_fences(text: &str) -> String {
let mut result = text.trim().to_string();
// 1. Kielitunniste — VAIN tunnettu kieli
if let Some(nl) = result.find('\n') {
let first = result[..nl].trim().to_lowercase();
if LANG_TAGS.contains(&first.as_str()) {
// Poista aloittava ```lang
if result.starts_with("```") {
if let Some(nl) = result.find('\n') {
result = result[nl + 1..].to_string();
}
}
// 2. Sulkeva ``` — VAIN omalla rivillään lopussa
// Poista sulkeva ```
let trimmed = result.trim_end();
if trimmed.ends_with("```") {
let before = &trimmed[..trimmed.len() - 3];
@@ -266,29 +110,7 @@ fn strip_markdown_wrapper(text: &str) -> String {
}
}
// 3. Johdantolauseet
let lower = result.trim().to_lowercase();
for prefix in &["sure!", "here is", "here's", "certainly!", "below is"] {
if lower.starts_with(prefix) {
if let Some(nl) = result.find('\n') { result = result[nl + 1..].to_string(); }
break;
}
}
// 4. Selityskommentit alusta
let mut lines: Vec<&str> = result.trim().lines().collect();
while !lines.is_empty() {
let first = lines[0].trim();
let is_preamble = first.starts_with("# ") && !first.starts_with("#!")
&& (first.to_lowercase().contains("this is")
|| first.to_lowercase().contains("simple")
|| first.to_lowercase().contains("program that")
|| first.to_lowercase().contains("here is")
|| first.to_lowercase().contains("the following")
|| first.to_lowercase().contains("below"));
if is_preamble { lines.remove(0); } else { break; }
}
lines.join("\n").trim().to_string()
result
}
pub struct GenerateResult {

View File

@@ -285,15 +285,19 @@ async fn main() {
}
}
// Ladataan LLM-malli
tracing::info!("Ladataan LLM-mallia...");
let mut llm = match inference::LlmEngine::load() {
// Ollama-backend
tracing::info!("Alustetaan Ollama-yhteyttä...");
let llm = match inference::LlmEngine::load() {
Ok(engine) => {
tracing::info!("LLM valmis inferenssiin!");
// Varmistetaan malli (ollama pull) — odotetaan kunnes valmis
match engine.ensure_model().await {
Ok(()) => tracing::info!("Ollama valmis inferenssiin!"),
Err(e) => tracing::warn!("Mallin lataus: {} — yritetään silti", e),
}
Some(engine)
}
Err(e) => {
tracing::warn!("LLM-lataus epäonnistui: {} — toimitaan ilman inferenssiä", e);
tracing::warn!("Ollama-alustus epäonnistui: {} — toimitaan ilman inferenssiä", e);
None
}
};
@@ -324,12 +328,13 @@ async fn main() {
if !prompt.is_empty() && msg_model.starts_with("qwen-coder") {
if let Some(ref mut engine) = llm {
if let Some(ref engine) = llm {
busy = true;
let max_tokens = task.get("max_tokens").and_then(|v| v.as_u64()).unwrap_or(512) as usize;
tracing::info!("Generoidaan (task_id: {}, max_tokens: {}): \"{}\"", task_id, max_tokens, prompt);
tracing::info!("Generoidaan (task_id: {}, max_tokens: {}): \"{}\"", task_id, max_tokens, &prompt[..prompt.len().min(100)]);
match engine.generate(prompt, max_tokens) {
let model_name = engine.model_name();
match engine.generate(prompt, max_tokens).await {
Ok(result) => {
tracing::info!(
"Tulos: {} tokenia | {:.0}ms | {:.1} tok/s | \"{}\"",
@@ -342,7 +347,7 @@ async fn main() {
let done = json!({
"type": "llm_done",
"prompt": prompt,
"model": "Qwen2.5-Coder-0.5B (native/GPU)",
"model": format!("{} (Ollama)", model_name),
"response": result.text,
"tokens_generated": result.tokens_generated,
"duration_ms": result.duration_ms,

View File

@@ -2282,10 +2282,12 @@ Files: ${Object.keys(generatedFiles).join(', ')}`;
if (sub === 'models') {
termLog(' <span style="color:#d29922">Selain (kpn load):</span>', '#c9d1d9');
termLog(' qwen-coder Qwen2.5-Coder:0.5B <span style="color:#8b949e">~990 MB | WASM ~0.4 tok/s</span>');
termLog(' <span style="color:#3fb950">Natiivi (Docker GPU/CPU):</span>', '#c9d1d9');
termLog(' qwen-coder Qwen2.5-Coder:0.5B <span style="color:#8b949e">~990 MB | ~8 tok/s</span>');
termLog(' Käyttö: kpn run coder "&lt;prompti&gt;"', '#8b949e');
termLog(' qwen-coder:0.5b <span style="color:#8b949e">~990 MB | WASM ~0.4 tok/s</span>');
termLog(' <span style="color:#3fb950">Natiivi (Ollama + GPU):</span>', '#c9d1d9');
termLog(' qwen2.5-coder:7b <span style="color:#8b949e">~4.7 GB | NVIDIA ~80 tok/s | AMD ~40 tok/s | Apple ~30 tok/s</span>');
termLog(' qwen2.5-coder:3b <span style="color:#8b949e">~1.9 GB | NVIDIA ~120 tok/s</span>');
termLog(' qwen2.5-coder:1.5b <span style="color:#8b949e">~1 GB | NVIDIA ~150 tok/s</span>');
termLog(' Vaihda malli: <span style="color:#58a6ff">OLLAMA_MODEL=qwen2.5-coder:7b</span>', '#8b949e');
termLog(' Hub reitittää automaattisesti nopeimmalle solmulle', '#8b949e');
return;
}