
Commit c5b7363

Add streaming support and custom GenerationStopper support for ApiVlmModel

Signed-off-by: Christoph Auer <[email protected]>
1 parent: 1c781a1

3 files changed: +147 −16 lines

docling/datamodel/pipeline_options_vlm_model.py

Lines changed: 3 additions & 0 deletions
@@ -104,3 +104,6 @@ class ApiVlmOptions(BaseVlmOptions):
     timeout: float = 60
     concurrency: int = 1
     response_format: ResponseFormat
+
+    stop_strings: List[str] = []
+    custom_stopping_criteria: List[Union[GenerationStopper]] = []
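
Not part of the diff: a minimal sketch of what a custom stopper for the new custom_stopping_criteria field could look like. It assumes only what the streaming helper further down actually calls on a GenerationStopper, namely should_stop() and lookback_tokens(); the class name and the heuristic are invented for illustration.

from docling.models.utils.generation_utils import GenerationStopper


class LoopMarkerStopper(GenerationStopper):  # hypothetical example class
    """Abort generation once a degenerate repetition marker shows up in the partial output."""

    def should_stop(self, text: str) -> bool:
        # 'text' is the trailing window of the streamed output (see api_image_request_streaming).
        return text.count("<page_break>") > 3  # invented heuristic

    def lookback_tokens(self) -> int:
        return 500  # size of the trailing window handed to should_stop()


# Either an instance or the bare class may be listed; api_vlm_model.py (next file)
# normalizes both forms before streaming, e.g.:
# vlm_options.custom_stopping_criteria = [LoopMarkerStopper()]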

docling/models/api_vlm_model.py

Lines changed: 45 additions & 15 deletions
@@ -1,12 +1,18 @@
 from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor

+from transformers import StoppingCriteria
+
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
 from docling.exceptions import OperationNotAllowed
 from docling.models.base_model import BasePageModel
-from docling.utils.api_image_request import api_image_request
+from docling.models.utils.generation_utils import GenerationStopper
+from docling.utils.api_image_request import (
+    api_image_request,
+    api_image_request_streaming,
+)
 from docling.utils.profiling import TimeRecorder


@@ -41,19 +47,43 @@ def _vlm_request(page):
             assert page._backend is not None
             if not page._backend.is_valid():
                 return page
-            else:
-                with TimeRecorder(conv_res, "vlm"):
-                    assert page.size is not None

-                    hi_res_image = page.get_image(
-                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
-                    )
-                    assert hi_res_image is not None
-                    if hi_res_image:
-                        if hi_res_image.mode != "RGB":
-                            hi_res_image = hi_res_image.convert("RGB")
+            with TimeRecorder(conv_res, "vlm"):
+                assert page.size is not None
+
+                hi_res_image = page.get_image(
+                    scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                )
+                assert hi_res_image is not None
+                if hi_res_image and hi_res_image.mode != "RGB":
+                    hi_res_image = hi_res_image.convert("RGB")

-                    prompt = self.vlm_options.build_prompt(page.parsed_page)
+                prompt = self.vlm_options.build_prompt(page.parsed_page)
+
+                if self.vlm_options.custom_stopping_criteria:
+                    # Instantiate any GenerationStopper classes before passing to streaming
+                    instantiated_stoppers = []
+                    for criteria in self.vlm_options.custom_stopping_criteria:
+                        if isinstance(criteria, GenerationStopper):
+                            instantiated_stoppers.append(criteria)
+                        elif isinstance(criteria, type) and issubclass(
+                            criteria, GenerationStopper
+                        ):
+                            instantiated_stoppers.append(criteria())
+                        # Skip non-GenerationStopper criteria (should have been caught in validation)
+
+                    # Streaming path with early abort support
+                    page_tags = api_image_request_streaming(
+                        image=hi_res_image,
+                        prompt=prompt,
+                        url=self.vlm_options.url,
+                        timeout=self.timeout,
+                        headers=self.vlm_options.headers,
+                        generation_stoppers=instantiated_stoppers,
+                        **self.params,
+                    )
+                else:
+                    # Non-streaming fallback (existing behavior)
                     page_tags = api_image_request(
                         image=hi_res_image,
                         prompt=prompt,
@@ -63,10 +93,10 @@ def _vlm_request(page):
                         **self.params,
                     )

-                    page_tags = self.vlm_options.decode_response(page_tags)
-                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+                page_tags = self.vlm_options.decode_response(page_tags)
+                page.predictions.vlm_response = VlmPrediction(text=page_tags)

-                    return page
+                return page

         with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
             yield from executor.map(_vlm_request, page_batch)
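
As a side note (not in the diff): tracing the normalization loop above shows that custom_stopping_criteria accepts both already-constructed stoppers and bare GenerationStopper subclasses, and that both end up as instances. A small self-contained illustration, using a made-up NoopStopper:

from docling.models.utils.generation_utils import GenerationStopper


class NoopStopper(GenerationStopper):  # made-up subclass used only for illustration
    def should_stop(self, text: str) -> bool:
        return False

    def lookback_tokens(self) -> int:
        return 1


criteria_in = [NoopStopper(), NoopStopper]  # an instance and a class, both accepted
instantiated = []
for criteria in criteria_in:
    if isinstance(criteria, GenerationStopper):
        instantiated.append(criteria)      # instances are kept as-is
    elif isinstance(criteria, type) and issubclass(criteria, GenerationStopper):
        instantiated.append(criteria())    # classes are instantiated with no arguments

assert all(isinstance(s, GenerationStopper) for s in instantiated)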

docling/utils/api_image_request.py

Lines changed: 99 additions & 1 deletion
@@ -1,13 +1,15 @@
 import base64
+import json
 import logging
 from io import BytesIO
-from typing import Dict, Optional
+from typing import Dict, List, Optional

 import requests
 from PIL import Image
 from pydantic import AnyUrl

 from docling.datamodel.base_models import OpenAiApiResponse
+from docling.models.utils.generation_utils import GenerationStopper

 _log = logging.getLogger(__name__)

@@ -59,3 +61,99 @@ def api_image_request(
     api_resp = OpenAiApiResponse.model_validate_json(r.text)
     generated_text = api_resp.choices[0].message.content.strip()
     return generated_text
+
+
+def api_image_request_streaming(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    *,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    generation_stoppers: List[GenerationStopper] = [],
+    **params,
+) -> str:
+    """
+    Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).
+    Parses SSE lines: 'data: {json}\\n\\n', terminated by 'data: [DONE]'.
+    Accumulates text and calls stopper.should_stop(window) as partials arrive.
+    If a stopper triggers, the HTTP connection is closed to abort server-side generation.
+    """
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_b64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                },
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        "stream": True,  # critical for SSE streaming
+        **params,
+    }
+
+    # Some servers require Accept: text/event-stream for SSE.
+    # It's safe to set; OpenAI-compatible servers tolerate it.
+    hdrs = {"Accept": "text/event-stream", **(headers or {})}
+
+    # Stream the HTTP response
+    with requests.post(
+        str(url), headers=hdrs, json=payload, timeout=timeout, stream=True
+    ) as r:
+        if not r.ok:
+            _log.error(f"Error calling the API (streaming). Response was {r.text}")
+        r.raise_for_status()
+
+        full_text = []
+        for raw_line in r.iter_lines(decode_unicode=True):
+            if not raw_line:  # keep-alives / blank lines
+                continue
+            if not raw_line.startswith("data:"):
+                # Some proxies inject comments; ignore anything not starting with 'data:'
+                continue
+
+            data = raw_line[len("data:") :].strip()
+            if data == "[DONE]":
+                break
+
+            try:
+                obj = json.loads(data)
+            except json.JSONDecodeError:
+                _log.info("Skipping non-JSON SSE chunk: %r", data[:200])
+                continue
+
+            # OpenAI-compatible delta format:
+            # obj["choices"][0]["delta"]["content"] may be None or missing (e.g., tool calls)
+            try:
+                delta = obj["choices"][0].get("delta") or {}
+                piece = delta.get("content") or ""
+            except (KeyError, IndexError) as e:
+                _log.debug("Unexpected SSE chunk shape: %s", e)
+                piece = ""
+
+            if piece:
+                full_text.append(piece)
+                for stopper in generation_stoppers:
+                    # Respect the stopper's lookback window. A simple string window
+                    # works for regex-based stoppers; no tokenizer needed.
+                    lookback = max(1, stopper.lookback_tokens())
+                    window = "".join(full_text)[-lookback:]
+                    if stopper.should_stop(window):
+                        # Closing the socket signals cancellation to vLLM/OpenAI-compatible
+                        # servers, which abort the request when the client disconnects.
+                        try:
+                            r.close()
+                        finally:
+                            break
+
+    return "".join(full_text)
