candle_transformers/models/llava/
utils.rs

1pub fn get_anyres_image_grid_shape(
2    image_size: (u32, u32),
3    grid_pinpoints: &[(u32, u32)],
4    patch_size: u32,
5) -> (u32, u32) {
6    let (width, height) = select_best_resolution(image_size, grid_pinpoints);
7    (width / patch_size, height / patch_size)
8}
9
/// Selects, from `possible_resolutions`, the `(width, height)` candidate that best fits an
/// image whose size is `original_size` (also `(width, height)`).
///
/// For every candidate the image is scaled uniformly (aspect ratio preserved) so that it
/// fits inside the candidate. Two quantities are then compared:
///
/// * the *effective resolution*: the pixel count of the scaled image, capped at the pixel
///   count of the original image so that upscaling never counts as extra detail;
/// * the *wasted resolution*: the candidate's pixel count minus the effective resolution.
///
/// The candidate with the largest effective resolution wins; ties are broken by the
/// smallest wasted resolution, and on a full tie the earliest candidate is kept.
///
/// Returns `(0, 0)` when `possible_resolutions` is empty.
pub fn select_best_resolution(
    original_size: (u32, u32),
    possible_resolutions: &[(u32, u32)],
) -> (u32, u32) {
    let (original_width, original_height) = original_size;
    // `f64` represents every `u32` exactly, unlike `f32` which loses precision above 2^24.
    let original_width_f = f64::from(original_width);
    let original_height_f = f64::from(original_height);
    // Areas are computed in `u64`: the product of two `u32` dimensions can exceed `u32::MAX`.
    let original_area = u64::from(original_width) * u64::from(original_height);

    let mut best_fit = (0, 0);
    let mut max_effective_resolution = 0_u64;
    let mut min_wasted_resolution = u64::MAX;

    for &(width, height) in possible_resolutions {
        // Largest uniform scale at which the image still fits inside the candidate.
        let scale =
            (f64::from(width) / original_width_f).min(f64::from(height) / original_height_f);
        // Float-to-integer `as` casts truncate toward zero, saturate at the integer bounds
        // and map NaN to 0, so a zero-sized original cannot cause a panic here.
        let downscaled_width = (original_width_f * scale) as u32;
        let downscaled_height = (original_height_f * scale) as u32;
        let scaled_area = u64::from(downscaled_width) * u64::from(downscaled_height);
        let candidate_area = u64::from(width) * u64::from(height);

        // Cap at the original area: pixels created by upscaling carry no new information.
        let effective_resolution = scaled_area.min(original_area);
        // The scaled image fits inside the candidate; `saturating_sub` is purely defensive.
        let wasted_resolution = candidate_area.saturating_sub(effective_resolution);

        let is_better = effective_resolution > max_effective_resolution
            || (effective_resolution == max_effective_resolution
                && wasted_resolution < min_wasted_resolution);
        if is_better {
            best_fit = (width, height);
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
        }
    }
    best_fit
}