candle_transformers/models/chinese_clip/
mod.rs1use candle::{Module, Result, Tensor, D};
10use candle_nn as nn;
11
12use text_model::ChineseClipTextTransformer;
13use vision_model::ChineseClipVisionTransformer;
14
15pub mod text_model;
16pub mod vision_model;
17
/// Activation functions selectable for the Chinese-CLIP encoders.
///
/// A value of this type is applied to a tensor through its [`Module`]
/// implementation.
#[derive(Clone, Copy, Debug)]
pub enum Activation {
    /// `x * sigmoid(1.702 * x)`.
    QuickGelu,
    /// Dispatches to `Tensor::gelu_erf`.
    Gelu,
    /// Dispatches to `Tensor::gelu`.
    GeluNew,
    /// Dispatches to `Tensor::relu`.
    Relu,
}
25
26impl From<String> for Activation {
27 fn from(value: String) -> Self {
28 match value.as_str() {
29 "quick_gelu" => Activation::QuickGelu,
30 "gelu" => Activation::Gelu,
31 "gelu_new" => Activation::GeluNew,
32 "relu" => Activation::Relu,
33 _ => panic!("Invalid activation function: {}", value),
34 }
35 }
36}
37
38impl Module for Activation {
39 fn forward(&self, xs: &Tensor) -> Result<Tensor> {
40 match self {
41 Activation::QuickGelu => xs * nn::ops::sigmoid(&(xs * 1.702f64)?)?,
42 Activation::Gelu => xs.gelu_erf(),
43 Activation::GeluNew => xs.gelu(),
44 Activation::Relu => xs.relu(),
45 }
46 }
47}
48
49#[derive(Clone, Debug)]
50pub struct ChineseClipConfig {
51 pub text_config: text_model::ChineseClipTextConfig,
52 pub vision_config: vision_model::ChineseClipVisionConfig,
53 pub projection_dim: usize,
54 pub logit_scale_init_value: f32,
55 pub image_size: usize,
56}
57
58impl ChineseClipConfig {
59 pub fn clip_vit_base_patch16() -> Self {
61 let text_config = text_model::ChineseClipTextConfig::clip_vit_base_patch16();
62 let vision_config = vision_model::ChineseClipVisionConfig::clip_vit_base_patch16();
63
64 Self {
65 text_config,
66 vision_config,
67 projection_dim: 512,
68 logit_scale_init_value: 2.6592,
69 image_size: 512,
70 }
71 }
72}
73
74#[derive(Clone, Debug)]
75pub enum EncoderConfig {
76 Text(text_model::ChineseClipTextConfig),
77 Vision(vision_model::ChineseClipVisionConfig),
78}
79
80impl EncoderConfig {
81 pub fn embed_dim(&self) -> usize {
82 match self {
83 Self::Text(c) => c.hidden_size,
84 Self::Vision(c) => c.hidden_size,
85 }
86 }
87
88 pub fn num_attention_heads(&self) -> usize {
89 match self {
90 Self::Text(c) => c.num_attention_heads,
91 Self::Vision(c) => c.num_attention_heads,
92 }
93 }
94
95 pub fn intermediate_size(&self) -> usize {
96 match self {
97 Self::Text(c) => c.intermediate_size,
98 Self::Vision(c) => c.intermediate_size,
99 }
100 }
101
102 pub fn num_hidden_layers(&self) -> usize {
103 match self {
104 Self::Text(c) => c.num_hidden_layers,
105 Self::Vision(c) => c.num_hidden_layers,
106 }
107 }
108
109 pub fn activation(&self) -> Activation {
110 match self {
111 Self::Text(c) => c.hidden_act,
112 Self::Vision(c) => c.hidden_act,
113 }
114 }
115
116 pub fn layer_norm_eps(&self) -> f64 {
117 match self {
118 Self::Text(c) => c.layer_norm_eps,
119 Self::Vision(c) => c.layer_norm_eps,
120 }
121 }
122}
123
124#[derive(Clone, Debug)]
125pub struct ChineseClipModel {
126 text_model: ChineseClipTextTransformer,
127 vision_model: ChineseClipVisionTransformer,
128 visual_projection: nn::Linear,
129 text_projection: nn::Linear,
130 logit_scale: Tensor,
131}
132
133impl ChineseClipModel {
134 pub fn new(vs: nn::VarBuilder, c: &ChineseClipConfig) -> Result<Self> {
135 let text_model = ChineseClipTextTransformer::new(vs.pp("text_model"), &c.text_config)?;
136
137 let vision_model =
138 ChineseClipVisionTransformer::new(vs.pp("vision_model"), &c.vision_config)?;
139
140 let vision_embed_dim = c.vision_config.hidden_size;
141 let vision_projection = nn::linear_no_bias(
142 vision_embed_dim,
143 c.projection_dim,
144 vs.pp("visual_projection"),
145 )?;
146
147 let text_embed_dim = c.text_config.hidden_size;
148 let text_projection =
149 nn::linear_no_bias(text_embed_dim, c.projection_dim, vs.pp("text_projection"))?;
150
151 let logit_scale = if vs.contains_tensor("logit_scale") {
152 vs.get(&[], "logit_scale")?
153 } else {
154 Tensor::new(&[c.logit_scale_init_value], vs.device())?
155 };
156
157 Ok(Self {
158 text_model,
159 vision_model,
160 visual_projection: vision_projection,
161 text_projection,
162 logit_scale,
163 })
164 }
165
166 pub fn get_text_features(
167 &self,
168 input_ids: &Tensor,
169 token_type_ids: Option<&Tensor>,
170 attention_mask: Option<&Tensor>,
171 ) -> Result<Tensor> {
172 let output = self
173 .text_model
174 .forward(input_ids, token_type_ids, attention_mask)?
175 .contiguous()?;
176 self.text_projection.forward(&output)
177 }
178
179 pub fn get_image_features(&self, pixel_values: &Tensor) -> Result<Tensor> {
180 pixel_values
181 .apply(&self.vision_model)?
182 .apply(&self.visual_projection)
183 }
184
185 pub fn forward(
186 &self,
187 pixel_values: &Tensor,
188 input_ids: &Tensor,
189 token_type_ids: Option<&Tensor>,
190 attention_mask: Option<&Tensor>,
191 ) -> Result<(Tensor, Tensor)> {
192 let image_features = self.get_image_features(pixel_values)?;
193 let text_features = self.get_text_features(input_ids, token_type_ids, attention_mask)?;
194
195 let image_features_normalized = div_l2_norm(&image_features)?;
196 let text_features_normalized = div_l2_norm(&text_features)?;
197
198 let logits_per_text = text_features_normalized.matmul(&image_features_normalized.t()?)?;
199 let logit_scale = self.logit_scale.exp()?;
200 let logits_per_text = logits_per_text.broadcast_mul(&logit_scale)?;
201 let logits_per_image = logits_per_text.t()?;
202 Ok((logits_per_text, logits_per_image))
203 }
204}
205
206pub fn div_l2_norm(v: &Tensor) -> Result<Tensor> {
207 let l2_norm = v.sqr()?.sum_keepdim(D::Minus1)?.sqrt()?;
208 v.broadcast_div(&l2_norm)
209}