Skip to content

Commit 9d537f4

Browse files
ketyi authored and Lucaskabela committed
[Bugfix][Model] Fix DeepSeek-OCR TensorSchema crash on empty images_crop (vllm-project#36670)
Signed-off-by: István Ketykó <istvan.ketyko@gmail.com>
1 parent 5e0b37f commit 9d537f4

2 files changed

Lines changed: 135 additions & 4 deletions

File tree

Lines changed: 134 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,134 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
"""
4+
Regression test for DeepSeek-OCR TensorSchema validation with empty images_crop.
5+
6+
When using the Gundam preset (BASE_SIZE=1024, IMAGE_SIZE=640, CROP_MODE=True),
7+
images that are small enough to not require cropping produce an empty
8+
images_crop tensor with shape (0, 3, 640, 640). The _parse_and_validate_image_input
9+
method must correctly read image_size from this tensor's shape rather than
10+
falling back to base_size, which would cause a TensorSchema mismatch.
11+
12+
Run with:
13+
pytest tests/models/multimodal/processing/test_deepseek_ocr.py -v
14+
"""
15+
16+
import pytest
17+
from PIL import Image
18+
from transformers import AutoTokenizer
19+
20+
from vllm.model_executor.models.deepseek_ocr import DeepseekOCRImagePixelInputs
21+
from vllm.transformers_utils.processors.deepseek_ocr import DeepseekOCRProcessor
22+
23+
MODEL_ID = "deepseek-ai/DeepSeek-OCR"
24+
25+
26+
@pytest.fixture(scope="module")
def processor():
    """Return a module-scoped DeepseekOCRProcessor for the tests below.

    The tokenizer is obtained with ``AutoTokenizer.from_pretrained(MODEL_ID)``
    and handed to the processor, so every test in this module shares a
    single processor instance.
    """
    return DeepseekOCRProcessor(tokenizer=AutoTokenizer.from_pretrained(MODEL_ID))
31+
32+
33+
class TestDeepseekOCREmptyImagesCrop:
    """Exercise DeepseekOCRImagePixelInputs validation on processor output,
    with particular attention to an images_crop tensor that holds no crops."""

    # Prompt shared by every test; contains a single image placeholder.
    _PROMPT = "<image>\nDescribe this image."

    def _run_processor(self, processor, image):
        """Feed one image through the processor.

        Returns a ``(pixel_values, images_crop, images_spatial_crop)`` tuple
        read from the processor output.
        """
        outputs = processor(prompt=self._PROMPT, images=[image])
        return (
            outputs["pixel_values"],
            outputs["images_crop"],
            outputs["images_spatial_crop"],
        )

    @staticmethod
    def _build_schema(pixel_values, images_crop, images_spatial_crop, bindings):
        """Construct the schema object; validation errors propagate to the caller."""
        return DeepseekOCRImagePixelInputs(
            type="pixel_values",
            data=pixel_values,
            images_crop=images_crop,
            images_spatial_crop=images_spatial_crop,
            resolve_bindings=bindings,
        )

    def test_empty_images_crop_small_image(self, processor):
        """An image no larger than 640px yields an empty images_crop, and
        building the schema from it must succeed.

        The earlier implementation only trusted the tensor shape when
        ``numel() > 0``; for an empty tensor it substituted base_size (1024)
        for image_size, which disagreed with the tensor's real last
        dimension of 640.
        """
        # 100x100 fits within IMAGE_SIZE (640) on both sides, so no tiles.
        tiny = Image.new("RGB", (100, 100), color="red")
        pixel_values, images_crop, images_spatial_crop = self._run_processor(
            processor, tiny
        )

        # The processor is expected to emit zero crops for this input.
        assert images_crop.shape[0] == 0

        base_size = pixel_values.shape[-1]
        if images_crop is None:
            image_size = base_size
        else:
            image_size = images_crop.shape[-1]

        # Constructing the schema must not raise ValueError.
        bindings = {
            "base_size": base_size,
            "image_size": image_size,
        }
        schema = self._build_schema(
            pixel_values, images_crop, images_spatial_crop, bindings
        )

        assert schema.data.shape == (1, 3, 1024, 1024)
        assert schema.images_crop.shape == (0, 3, 640, 640)

    def test_populated_images_crop_large_image(self, processor):
        """An image larger than 640px yields a non-empty images_crop."""
        # 1200x800 exceeds IMAGE_SIZE (640), so crop tiles are generated.
        big = Image.new("RGB", (1200, 800), color="blue")
        pixel_values, images_crop, images_spatial_crop = self._run_processor(
            processor, big
        )

        assert images_crop.shape[0] > 0

        bindings = {
            "base_size": pixel_values.shape[-1],
            "image_size": images_crop.shape[-1],
        }
        schema = self._build_schema(
            pixel_values, images_crop, images_spatial_crop, bindings
        )

        assert schema.data.shape == (1, 3, 1024, 1024)
        assert schema.images_crop.shape[-1] == 640

    def test_mismatched_image_size_raises(self, processor):
        """Validation must still reject an image_size binding that does not
        match the images_crop tensor."""
        tiny = Image.new("RGB", (100, 100), color="green")
        pixel_values, images_crop, images_spatial_crop = self._run_processor(
            processor, tiny
        )

        with pytest.raises(ValueError, match="images_crop"):
            self._build_schema(
                pixel_values,
                images_crop,
                images_spatial_crop,
                {
                    "base_size": 1024,
                    "image_size": 1024,  # Intentionally wrong: tensor has 640
                },
            )

vllm/model_executor/models/deepseek_ocr.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -452,10 +452,7 @@ def _parse_and_validate_image_input(
452452
# support arbitrary resolutions via pos-encoding interpolation,
453453
# so Tiny/Small/Base/Large variants all work with the same weights.
454454
base_size = pixel_values.shape[-1]
455-
if images_crop is not None and images_crop.numel() > 0:
456-
image_size = images_crop.shape[-1]
457-
else:
458-
image_size = base_size
455+
image_size = images_crop.shape[-1] if images_crop is not None else base_size
459456

460457
return DeepseekOCRImagePixelInputs(
461458
type="pixel_values",

0 commit comments

Comments
 (0)