Ollama-integraatio: GPU-inferenssi NVIDIA/AMD/Apple, ei Candle-rajoitteita
- docker-compose: Ollama-container GPU:lla + persistent volume malleille
- native-node: Candle poistettu, kutsuu Ollaman HTTP API:a (async)
- Dockerfile: yksinkertaistettu, ei CUDA SDK:ta (Ollama hoitaa GPU:n)
- Tukee kaikkia malleja: qwen2.5-coder:1.5b/3b/7b/14b/32b
- OLLAMA_MODEL ympäristömuuttujalla vaihdetaan malli
- kpn models näyttää Ollama-mallit nopeustiedoilla

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -285,15 +285,19 @@ async fn main() {
|
||||
}
|
||||
}
|
||||
|
||||
// Ladataan LLM-malli
|
||||
tracing::info!("Ladataan LLM-mallia...");
|
||||
let mut llm = match inference::LlmEngine::load() {
|
||||
// Ollama-backend
|
||||
tracing::info!("Alustetaan Ollama-yhteyttä...");
|
||||
let llm = match inference::LlmEngine::load() {
|
||||
Ok(engine) => {
|
||||
tracing::info!("LLM valmis inferenssiin!");
|
||||
// Varmistetaan malli (ollama pull) — odotetaan kunnes valmis
|
||||
match engine.ensure_model().await {
|
||||
Ok(()) => tracing::info!("Ollama valmis inferenssiin!"),
|
||||
Err(e) => tracing::warn!("Mallin lataus: {} — yritetään silti", e),
|
||||
}
|
||||
Some(engine)
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("LLM-lataus epäonnistui: {} — toimitaan ilman inferenssiä", e);
|
||||
tracing::warn!("Ollama-alustus epäonnistui: {} — toimitaan ilman inferenssiä", e);
|
||||
None
|
||||
}
|
||||
};
|
||||
@@ -324,12 +328,13 @@ async fn main() {
|
||||
|
||||
if !prompt.is_empty() && msg_model.starts_with("qwen-coder") {
|
||||
|
||||
if let Some(ref mut engine) = llm {
|
||||
if let Some(ref engine) = llm {
|
||||
busy = true;
|
||||
let max_tokens = task.get("max_tokens").and_then(|v| v.as_u64()).unwrap_or(512) as usize;
|
||||
tracing::info!("Generoidaan (task_id: {}, max_tokens: {}): \"{}\"", task_id, max_tokens, prompt);
|
||||
tracing::info!("Generoidaan (task_id: {}, max_tokens: {}): \"{}\"", task_id, max_tokens, &prompt[..prompt.len().min(100)]);
|
||||
|
||||
match engine.generate(prompt, max_tokens) {
|
||||
let model_name = engine.model_name();
|
||||
match engine.generate(prompt, max_tokens).await {
|
||||
Ok(result) => {
|
||||
tracing::info!(
|
||||
"Tulos: {} tokenia | {:.0}ms | {:.1} tok/s | \"{}\"",
|
||||
@@ -342,7 +347,7 @@ async fn main() {
|
||||
let done = json!({
|
||||
"type": "llm_done",
|
||||
"prompt": prompt,
|
||||
"model": "Qwen2.5-Coder-0.5B (native/GPU)",
|
||||
"model": format!("{} (Ollama)", model_name),
|
||||
"response": result.text,
|
||||
"tokens_generated": result.tokens_generated,
|
||||
"duration_ms": result.duration_ms,
|
||||
|
||||
Reference in New Issue
Block a user