candle_transformers/models/chinese_clip/
mod.rs

//! Chinese Contrastive Language-Image Pre-Training (Chinese-CLIP)
//!
//! Chinese-CLIP is a CLIP (Contrastive Language-Image Pre-Training) architecture trained on
//! pairs of images and their related texts.
5//!
6//! - 💻 [GH Link](https://github.com/OFA-Sys/Chinese-CLIP)
7//! - 💻 Transformers Python [reference implementation](https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py)
8//!
9use candle::{Module, Result, Tensor, D};
10use candle_nn as nn;
11
12use text_model::ChineseClipTextTransformer;
13use vision_model::ChineseClipVisionTransformer;
14
15pub mod text_model;
16pub mod vision_model;
17
/// Activation functions selectable for the Chinese-CLIP encoders.
///
/// The variant is normally obtained from the `hidden_act` string of a model
/// configuration (see the `From<String>` implementation) and applied through
/// the `Module` implementation.
#[derive(Clone, Copy, Debug)]
pub enum Activation {
    /// Configuration name `"quick_gelu"`; evaluated as `x * sigmoid(1.702 * x)`.
    QuickGelu,
    /// Configuration name `"gelu"`; evaluated with `Tensor::gelu_erf`.
    Gelu,
    /// Configuration name `"gelu_new"`; evaluated with `Tensor::gelu`.
    GeluNew,
    /// Configuration name `"relu"`; evaluated with `Tensor::relu`.
    Relu,
}
25
26impl From<String> for Activation {
27    fn from(value: String) -> Self {
28        match value.as_str() {
29            "quick_gelu" => Activation::QuickGelu,
30            "gelu" => Activation::Gelu,
31            "gelu_new" => Activation::GeluNew,
32            "relu" => Activation::Relu,
33            _ => panic!("Invalid activation function: {}", value),
34        }
35    }
36}
37
38impl Module for Activation {
39    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
40        match self {
41            Activation::QuickGelu => xs * nn::ops::sigmoid(&(xs * 1.702f64)?)?,
42            Activation::Gelu => xs.gelu_erf(),
43            Activation::GeluNew => xs.gelu(),
44            Activation::Relu => xs.relu(),
45        }
46    }
47}
48
49#[derive(Clone, Debug)]
50pub struct ChineseClipConfig {
51    pub text_config: text_model::ChineseClipTextConfig,
52    pub vision_config: vision_model::ChineseClipVisionConfig,
53    pub projection_dim: usize,
54    pub logit_scale_init_value: f32,
55    pub image_size: usize,
56}
57
58impl ChineseClipConfig {
59    /// referer: https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/blob/main/config.json
60    pub fn clip_vit_base_patch16() -> Self {
61        let text_config = text_model::ChineseClipTextConfig::clip_vit_base_patch16();
62        let vision_config = vision_model::ChineseClipVisionConfig::clip_vit_base_patch16();
63
64        Self {
65            text_config,
66            vision_config,
67            projection_dim: 512,
68            logit_scale_init_value: 2.6592,
69            image_size: 512,
70        }
71    }
72}
73
74#[derive(Clone, Debug)]
75pub enum EncoderConfig {
76    Text(text_model::ChineseClipTextConfig),
77    Vision(vision_model::ChineseClipVisionConfig),
78}
79
80impl EncoderConfig {
81    pub fn embed_dim(&self) -> usize {
82        match self {
83            Self::Text(c) => c.hidden_size,
84            Self::Vision(c) => c.hidden_size,
85        }
86    }
87
88    pub fn num_attention_heads(&self) -> usize {
89        match self {
90            Self::Text(c) => c.num_attention_heads,
91            Self::Vision(c) => c.num_attention_heads,
92        }
93    }
94
95    pub fn intermediate_size(&self) -> usize {
96        match self {
97            Self::Text(c) => c.intermediate_size,
98            Self::Vision(c) => c.intermediate_size,
99        }
100    }
101
102    pub fn num_hidden_layers(&self) -> usize {
103        match self {
104            Self::Text(c) => c.num_hidden_layers,
105            Self::Vision(c) => c.num_hidden_layers,
106        }
107    }
108
109    pub fn activation(&self) -> Activation {
110        match self {
111            Self::Text(c) => c.hidden_act,
112            Self::Vision(c) => c.hidden_act,
113        }
114    }
115
116    pub fn layer_norm_eps(&self) -> f64 {
117        match self {
118            Self::Text(c) => c.layer_norm_eps,
119            Self::Vision(c) => c.layer_norm_eps,
120        }
121    }
122}
123
124#[derive(Clone, Debug)]
125pub struct ChineseClipModel {
126    text_model: ChineseClipTextTransformer,
127    vision_model: ChineseClipVisionTransformer,
128    visual_projection: nn::Linear,
129    text_projection: nn::Linear,
130    logit_scale: Tensor,
131}
132
133impl ChineseClipModel {
134    pub fn new(vs: nn::VarBuilder, c: &ChineseClipConfig) -> Result<Self> {
135        let text_model = ChineseClipTextTransformer::new(vs.pp("text_model"), &c.text_config)?;
136
137        let vision_model =
138            ChineseClipVisionTransformer::new(vs.pp("vision_model"), &c.vision_config)?;
139
140        let vision_embed_dim = c.vision_config.hidden_size;
141        let vision_projection = nn::linear_no_bias(
142            vision_embed_dim,
143            c.projection_dim,
144            vs.pp("visual_projection"),
145        )?;
146
147        let text_embed_dim = c.text_config.hidden_size;
148        let text_projection =
149            nn::linear_no_bias(text_embed_dim, c.projection_dim, vs.pp("text_projection"))?;
150
151        let logit_scale = if vs.contains_tensor("logit_scale") {
152            vs.get(&[], "logit_scale")?
153        } else {
154            Tensor::new(&[c.logit_scale_init_value], vs.device())?
155        };
156
157        Ok(Self {
158            text_model,
159            vision_model,
160            visual_projection: vision_projection,
161            text_projection,
162            logit_scale,
163        })
164    }
165
166    pub fn get_text_features(
167        &self,
168        input_ids: &Tensor,
169        token_type_ids: Option<&Tensor>,
170        attention_mask: Option<&Tensor>,
171    ) -> Result<Tensor> {
172        let output = self
173            .text_model
174            .forward(input_ids, token_type_ids, attention_mask)?
175            .contiguous()?;
176        self.text_projection.forward(&output)
177    }
178
179    pub fn get_image_features(&self, pixel_values: &Tensor) -> Result<Tensor> {
180        pixel_values
181            .apply(&self.vision_model)?
182            .apply(&self.visual_projection)
183    }
184
185    pub fn forward(
186        &self,
187        pixel_values: &Tensor,
188        input_ids: &Tensor,
189        token_type_ids: Option<&Tensor>,
190        attention_mask: Option<&Tensor>,
191    ) -> Result<(Tensor, Tensor)> {
192        let image_features = self.get_image_features(pixel_values)?;
193        let text_features = self.get_text_features(input_ids, token_type_ids, attention_mask)?;
194
195        let image_features_normalized = div_l2_norm(&image_features)?;
196        let text_features_normalized = div_l2_norm(&text_features)?;
197
198        let logits_per_text = text_features_normalized.matmul(&image_features_normalized.t()?)?;
199        let logit_scale = self.logit_scale.exp()?;
200        let logits_per_text = logits_per_text.broadcast_mul(&logit_scale)?;
201        let logits_per_image = logits_per_text.t()?;
202        Ok((logits_per_text, logits_per_image))
203    }
204}
205
206pub fn div_l2_norm(v: &Tensor) -> Result<Tensor> {
207    let l2_norm = v.sqr()?.sum_keepdim(D::Minus1)?.sqrt()?;
208    v.broadcast_div(&l2_norm)
209}