candle_transformers/
utils.rs

1//! Apply penalty and repeat_kv
2
3use candle::{Result, Tensor};
4
5pub fn apply_repeat_penalty(logits: &Tensor, penalty: f32, context: &[u32]) -> Result<Tensor> {
6    let device = logits.device();
7    let mut logits = logits.to_dtype(candle::DType::F32)?.to_vec1::<f32>()?;
8    let mut already_seen = std::collections::HashSet::new();
9    for token_id in context {
10        if already_seen.contains(token_id) {
11            continue;
12        }
13        already_seen.insert(token_id);
14        if let Some(logit) = logits.get_mut(*token_id as usize) {
15            if *logit >= 0. {
16                *logit /= penalty
17            } else {
18                *logit *= penalty
19            }
20        }
21    }
22    let logits_len = logits.len();
23    Tensor::from_vec(logits, logits_len, device)
24}
25
26/// Repeats a key or value tensor for grouped query attention
27/// The input tensor should have a shape `(batch, num_kv_heads, seq_len, head_dim)`,
28pub fn repeat_kv(xs: Tensor, n_rep: usize) -> Result<Tensor> {
29    if n_rep == 1 {
30        Ok(xs)
31    } else {
32        let (b_sz, n_kv_head, seq_len, head_dim) = xs.dims4()?;
33        // Using cat is faster than a broadcast as it avoids going through a potentially
34        // strided copy.
35        // https://github.com/huggingface/candle/pull/2043
36        Tensor::cat(&vec![&xs; n_rep], 2)?.reshape((b_sz, n_kv_head * n_rep, seq_len, head_dim))
37    }
38}