From eb6f720fcc1858f5b4b4d2709212fc3f2809a5b5 Mon Sep 17 00:00:00 2001
From: jaakko
Date: Tue, 7 Apr 2026 13:18:07 +0300
Subject: [PATCH] Wasm tokenize_js() exportti oppaan live-tokenizeria varten
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lisätty #[wasm_bindgen] tokenize_js(text) → JSON-funktio joka lataa
tokenizerin IndexedDB:stä tai HuggingFacesta tarvittaessa.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 network-poc/node/src/lib.rs | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/network-poc/node/src/lib.rs b/network-poc/node/src/lib.rs
index 52505ad..2d01da6 100644
--- a/network-poc/node/src/lib.rs
+++ b/network-poc/node/src/lib.rs
@@ -118,6 +118,55 @@ async fn run_ai_tensor_inference(difficulty: usize) -> String {
     format!("PoC {} Matmul ({}x{}) >> {}", backend_name, active_workload_size, active_workload_size, result)
 }
 
+/// JS export: tokenizes `text` and returns the metrics as a JSON string.
+///
+/// The tokenizer definition is read from IndexedDB (key `tokenizer.json`).
+/// When no usable cached copy exists it is downloaded from HuggingFace,
+/// validated by parsing, and only then written back to IndexedDB.
+///
+/// # Errors
+///
+/// Returns a `JsValue` string when the download fails (including a
+/// non-success HTTP status) or the downloaded tokenizer cannot be parsed.
+#[wasm_bindgen]
+pub async fn tokenize_js(text: String) -> Result<String, JsValue> {
+    const CACHE_KEY: &str = "tokenizer.json";
+    const TOKENIZER_URL: &str =
+        "https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/resolve/main/tokenizer.json";
+
+    /// Wraps a network failure in the error string handed back to JS.
+    fn download_error(e: reqwest::Error) -> JsValue {
+        JsValue::from_str(&format!("Tokenizer-lataus epäonnistui: {}", e))
+    }
+
+    // A failed IndexedDB read is deliberately treated as a cache miss.
+    let cached = storage::load_from_idb(CACHE_KEY).await.unwrap_or(None);
+    if let Some(bytes) = cached {
+        // An unparsable cached copy falls through to a fresh download
+        // instead of failing on every later call.
+        if let Ok(tokenizer) = tokenizers::Tokenizer::from_bytes(&bytes) {
+            return Ok(tokenize_text(&tokenizer, &text).to_string());
+        }
+    }
+
+    // Try to load from the network. `error_for_status` rejects 4xx/5xx
+    // responses so an error page is never mistaken for tokenizer data.
+    let bytes = reqwest::get(TOKENIZER_URL)
+        .await
+        .and_then(|resp| resp.error_for_status())
+        .map_err(download_error)?
+        .bytes()
+        .await
+        .map_err(download_error)?;
+
+    // Parse before caching so invalid data is never persisted.
+    let tokenizer = tokenizers::Tokenizer::from_bytes(&bytes)
+        .map_err(|e| JsValue::from_str(&format!("Tokenizer-parsinta: {}", e)))?;
+    // Caching is best-effort: a failed write must not fail the tokenization.
+    let _ = storage::save_to_idb(CACHE_KEY, &bytes).await;
+    Ok(tokenize_text(&tokenizer, &text).to_string())
+}
+
 /// Tokenisoi yhden tekstin ja palauttaa metriikat
 fn tokenize_text(tokenizer: &tokenizers::Tokenizer, text: &str) -> serde_json::Value {
     let char_count = text.chars().count();