@@ -1743,7 +1743,8 @@ struct clip_model_loader {
1743
1743
1744
1744
if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
1745
1745
|| ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
1746
- || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL) {
1746
+ || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
1747
+ || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
1747
1748
n_layer += 1 ;
1748
1749
}
1749
1750
@@ -2856,7 +2857,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
2856
2857
}
2857
2858
return true ;
2858
2859
}
2859
- else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
2860
+ else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx-> proj_type == PROJECTOR_TYPE_QWEN25VL ) {
2860
2861
clip_image_u8 resized;
2861
2862
auto patch_size = clip_get_patch_size (ctx) * 2 ;
2862
2863
int nx = ceil ((float )img->nx / patch_size) * patch_size;
@@ -3255,7 +3256,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3255
3256
else {
3256
3257
// non-minicpmv models
3257
3258
3258
- if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
3259
+ if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx-> proj_type == PROJECTOR_TYPE_QWEN25VL ) {
3259
3260
// pw * ph = number of tokens output by ViT after apply patch merger
3260
3261
// ipw * ipw = number of vision token been processed inside ViT
3261
3262
const int merge_ratio = 2 ;
@@ -3395,7 +3396,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3395
3396
}
3396
3397
}
3397
3398
3398
- if (use_window_attn && ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3399
+ if (use_window_attn && ( ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx-> proj_type == PROJECTOR_TYPE_QWEN25VL) ) {
3399
3400
struct ggml_tensor * window_idx = ggml_graph_get_tensor (gf, " window_idx" );
3400
3401
struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor (gf, " inv_window_idx" );
3401
3402
struct ggml_tensor * window_mask = ggml_graph_get_tensor (gf, " window_mask" );
0 commit comments