[Bugfix][Model] Fix DeepSeek-OCR TensorSchema crash on empty images_crop (vllm-project#36670)

ketyi · Lucaskabela · commit 9d537f4e6661 · 2026-03-17T13:54:28.000-07:00
Signed-off-by: István Ketykó <istvan.ketyko@gmail.com>
diff --git a/tests/models/multimodal/processing/test_deepseek_ocr.py b/tests/models/multimodal/processing/test_deepseek_ocr.py
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Regression test for DeepSeek-OCR TensorSchema validation with empty images_crop.
+
+When using the Gundam preset (BASE_SIZE=1024, IMAGE_SIZE=640, CROP_MODE=True),
+images that are small enough to not require cropping produce an empty
+images_crop tensor with shape (0, 3, 640, 640). The _parse_and_validate_image_input
+method must correctly read image_size from this tensor's shape rather than
+falling back to base_size, which would cause a TensorSchema mismatch.
+
+Run with:
+  pytest tests/models/multimodal/processing/test_deepseek_ocr.py -v
+"""
+
+import pytest
+from PIL import Image
+from transformers import AutoTokenizer
+
+from vllm.model_executor.models.deepseek_ocr import DeepseekOCRImagePixelInputs
+from vllm.transformers_utils.processors.deepseek_ocr import DeepseekOCRProcessor
+
+MODEL_ID = "deepseek-ai/DeepSeek-OCR"
+
+
+@pytest.fixture(scope="module")
+def processor():
+    """Build a module-scoped DeepseekOCRProcessor using the MODEL_ID tokenizer."""
+    hf_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    return DeepseekOCRProcessor(tokenizer=hf_tokenizer)
+
+
+class TestDeepseekOCREmptyImagesCrop:
+    """Check DeepseekOCRImagePixelInputs validation for empty and populated crops."""
+
+    def test_empty_images_crop_small_image(self, processor):
+        """A small image (both sides <= 640px) yields an empty images_crop
+        tensor, which must still pass TensorSchema validation.
+
+        Previously, the model code only read image_size from the crop tensor
+        when ``numel() > 0``. For an empty tensor it fell back to base_size
+        (1024), which mismatched the tensor's actual trailing dim of 640.
+        """
+        # Small image: both dims <= IMAGE_SIZE (640) → no crops
+        small_image = Image.new("RGB", (100, 100), color="red")
+
+        result = processor(
+            prompt="<image>\nDescribe this image.",
+            images=[small_image],
+        )
+
+        pixel_values = result["pixel_values"]
+        images_crop = result["images_crop"]
+        images_spatial_crop = result["images_spatial_crop"]
+
+        # Processor must produce an empty crop tensor for a small image
+        assert images_crop.shape[0] == 0
+
+        base_size = pixel_values.shape[-1]  # mirrors the model's binding logic
+        image_size = images_crop.shape[-1] if images_crop is not None else base_size
+
+        # Must NOT raise ValueError now that image_size comes from the crop shape
+        schema = DeepseekOCRImagePixelInputs(
+            type="pixel_values",
+            data=pixel_values,
+            images_crop=images_crop,
+            images_spatial_crop=images_spatial_crop,
+            resolve_bindings={
+                "base_size": base_size,
+                "image_size": image_size,
+            },
+        )
+
+        assert schema.data.shape == (1, 3, 1024, 1024)
+        assert schema.images_crop.shape == (0, 3, 640, 640)
+
+    def test_populated_images_crop_large_image(self, processor):
+        """A large image (>640px) yields non-empty images_crop that validates."""
+        # Large image: exceeds IMAGE_SIZE (640) → dynamic crop tiles
+        large_image = Image.new("RGB", (1200, 800), color="blue")
+
+        result = processor(
+            prompt="<image>\nDescribe this image.",
+            images=[large_image],
+        )
+
+        pixel_values = result["pixel_values"]
+        images_crop = result["images_crop"]
+        images_spatial_crop = result["images_spatial_crop"]
+
+        assert images_crop.shape[0] > 0
+
+        base_size = pixel_values.shape[-1]
+        image_size = images_crop.shape[-1]
+
+        schema = DeepseekOCRImagePixelInputs(
+            type="pixel_values",
+            data=pixel_values,
+            images_crop=images_crop,
+            images_spatial_crop=images_spatial_crop,
+            resolve_bindings={
+                "base_size": base_size,
+                "image_size": image_size,
+            },
+        )
+
+        assert schema.data.shape == (1, 3, 1024, 1024)
+        assert schema.images_crop.shape[-1] == 640
+
+    def test_mismatched_image_size_raises(self, processor):
+        """A deliberately wrong image_size binding must still be rejected
+        by TensorSchema validation, even when images_crop is empty."""
+        small_image = Image.new("RGB", (100, 100), color="green")
+
+        result = processor(
+            prompt="<image>\nDescribe this image.",
+            images=[small_image],
+        )
+
+        pixel_values = result["pixel_values"]
+        images_crop = result["images_crop"]
+        images_spatial_crop = result["images_spatial_crop"]
+
+        with pytest.raises(ValueError, match="images_crop"):
+            DeepseekOCRImagePixelInputs(
+                type="pixel_values",
+                data=pixel_values,
+                images_crop=images_crop,
+                images_spatial_crop=images_spatial_crop,
+                resolve_bindings={
+                    "base_size": 1024,
+                    "image_size": 1024,  # Wrong! Tensor has 640
+                },
+            )
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
@@ -452,10 +452,7 @@ def _parse_and_validate_image_input(
         # support arbitrary resolutions via pos-encoding interpolation,
         # so Tiny/Small/Base/Large variants all work with the same weights.
         base_size = pixel_values.shape[-1]
-        if images_crop is not None and images_crop.numel() > 0:
-            image_size = images_crop.shape[-1]
-        else:
-            image_size = base_size
+        image_size = images_crop.shape[-1] if images_crop is not None else base_size
 
         return DeepseekOCRImagePixelInputs(
             type="pixel_values",