docs/source/en/auto_docstring.md (2 changes: 1 addition & 1 deletion)

@@ -292,7 +292,7 @@ The `@auto_docstring` decorator automatically generates docstrings by:

 8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the `TypedDict` and adds each parameter to the function's docstring.
 
-Currently only supported for [`FastImageProcessorKwargs`].
+Currently only supported for [`ImagesKwargs`].
 
 ## Best practices
 
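To make the unrolling concrete, here is a minimal, self-contained sketch of the idea: pull per-parameter documentation out of a TypedDict's docstring and append it to the decorated function's docstring. The names `ImagesKwargsSketch` and `unroll_kwargs_docs` are illustrative stand-ins, not the transformers implementation, which resolves the TypedDict from the `Unpack[...]` annotation and handles indentation and templating.

```python
from typing import Optional, TypedDict

from typing_extensions import Unpack  # `typing.Unpack` on recent Python versions


class ImagesKwargsSketch(TypedDict, total=False):
    """
    do_resize (`bool`, *optional*):
        Whether to resize the input image.
    size (`dict[str, int]`, *optional*):
        Target size as `{"height": h, "width": w}`.
    """

    do_resize: Optional[bool]
    size: Optional[dict[str, int]]


def unroll_kwargs_docs(func):
    # Append the TypedDict's docstring (which documents each key) to the
    # function's docstring, so the `**kwargs` entries appear in rendered docs.
    func.__doc__ = (func.__doc__ or "") + "\n" + (ImagesKwargsSketch.__doc__ or "")
    return func


@unroll_kwargs_docs
def preprocess(images, **kwargs: Unpack[ImagesKwargsSketch]):
    """Preprocess an image or a batch of images."""
    return images


print(preprocess.__doc__)  # docstring now lists do_resize and size
```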
src/transformers/image_processing_utils.py (9 changes: 6 additions & 3 deletions)

@@ -20,7 +20,8 @@

 from .image_processing_base import BatchFeature, ImageProcessingMixin
 from .image_transforms import center_crop, normalize, rescale
-from .image_utils import ChannelDimension, get_image_size
+from .image_utils import ChannelDimension, ImageInput, get_image_size
+from .processing_utils import ImagesKwargs, Unpack
 from .utils import logging
 from .utils.import_utils import requires

@@ -36,6 +37,8 @@

 @requires(backends=("vision",))
 class BaseImageProcessor(ImageProcessingMixin):
+    valid_kwargs = ImagesKwargs
+
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
@@ -46,9 +49,9 @@ def is_fast(self) -> bool:
         """
         return False
 
-    def __call__(self, images, **kwargs) -> BatchFeature:
+    def __call__(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
         """Preprocess an image or a batch of images."""
-        return self.preprocess(images, **kwargs)
+        return self.preprocess(images, *args, **kwargs)
 
     def preprocess(self, images, **kwargs) -> BatchFeature:
         raise NotImplementedError("Each image processor must implement its own preprocess method")
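Typing `__call__` with `Unpack[ImagesKwargs]` is mostly a static-analysis win: callers get completion and keyword checking without any runtime change. A small sketch with a stand-in TypedDict (not the real `ImagesKwargs`, which has many more fields):

```python
from typing import Optional, TypedDict

from typing_extensions import Unpack  # `typing.Unpack` on recent Python versions


class KwargsSketch(TypedDict, total=False):
    do_resize: Optional[bool]
    rescale_factor: Optional[float]


class ProcessorSketch:
    def __call__(self, images, **kwargs: Unpack[KwargsSketch]):
        # Runtime behaviour is unchanged; the annotation is for type checkers.
        return {"images": images, **kwargs}


proc = ProcessorSketch()
proc(["img.png"], do_resize=True, rescale_factor=1 / 255)  # accepted
# proc(["img.png"], do_risize=True)  # mypy/pyright: unexpected keyword argument
```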
src/transformers/image_processing_utils_fast.py (37 changes: 6 additions & 31 deletions)

@@ -15,7 +15,7 @@
 from collections.abc import Iterable
 from copy import deepcopy
 from functools import lru_cache, partial
-from typing import Any, Optional, TypedDict, Union
+from typing import Any, Optional, Union
 
 import numpy as np

@@ -40,7 +40,7 @@
     validate_kwargs,
     validate_preprocess_arguments,
 )
-from .processing_utils import Unpack
+from .processing_utils import ImagesKwargs, Unpack
 from .utils import (
     TensorType,
     auto_docstring,
@@ -165,28 +165,6 @@ def divide_to_patches(
     return patches
 
 
-class DefaultFastImageProcessorKwargs(TypedDict, total=False):
-    do_resize: Optional[bool]
-    size: Optional[dict[str, int]]
-    default_to_square: Optional[bool]
-    resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]]
-    do_center_crop: Optional[bool]
-    crop_size: Optional[dict[str, int]]
-    do_rescale: Optional[bool]
-    rescale_factor: Optional[Union[int, float]]
-    do_normalize: Optional[bool]
-    image_mean: Optional[Union[float, list[float]]]
-    image_std: Optional[Union[float, list[float]]]
-    do_pad: Optional[bool]
-    pad_size: Optional[dict[str, int]]
-    do_convert_rgb: Optional[bool]
-    return_tensors: Optional[Union[str, TensorType]]
-    data_format: Optional[ChannelDimension]
-    input_data_format: Optional[Union[str, ChannelDimension]]
-    device: Optional["torch.device"]
-    disable_grouping: Optional[bool]
-
-
 @auto_docstring
 class BaseImageProcessorFast(BaseImageProcessor):
     resample = None
@@ -208,10 +186,10 @@ class BaseImageProcessorFast(BaseImageProcessor):
     input_data_format = None
     device = None
     model_input_names = ["pixel_values"]
-    valid_kwargs = DefaultFastImageProcessorKwargs
+    valid_kwargs = ImagesKwargs
     unused_kwargs = None
 
-    def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]):
+    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
         super().__init__(**kwargs)
         kwargs = self.filter_out_unused_kwargs(kwargs)
         size = kwargs.pop("size", self.size)
@@ -730,11 +708,8 @@ def _validate_preprocess_kwargs(
             data_format=data_format,
         )
 
-    def __call__(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
-        return self.preprocess(images, *args, **kwargs)
-
     @auto_docstring
-    def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
+    def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
         # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same
         validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names)
         # Set default kwargs from self. This ensures that if a kwarg is not provided
@@ -767,7 +742,7 @@ def _preprocess_image_like_inputs(
         do_convert_rgb: bool,
         input_data_format: ChannelDimension,
         device: Optional[Union[str, "torch.device"]] = None,
-        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
+        **kwargs: Unpack[ImagesKwargs],
     ) -> BatchFeature:
         """
         Preprocess image-like inputs.
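A note on the `valid_kwargs` class attribute the refactor leans on: since `ImagesKwargs` is a TypedDict, the set of accepted keyword names can be read off its annotations, which is what lets one class serve both static typing and runtime validation. A rough sketch of that mechanism, with stand-in names; the real validation is the `validate_kwargs` helper shown in the diff:

```python
import warnings
from typing import Optional, TypedDict


class KwargsSketch(TypedDict, total=False):
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]


class FastProcessorSketch:
    # Subclasses only need to point `valid_kwargs` at a (possibly extended)
    # TypedDict; the accepted key names fall out of its annotations.
    valid_kwargs = KwargsSketch

    @property
    def _valid_kwargs_names(self) -> list[str]:
        return list(self.valid_kwargs.__annotations__)

    def preprocess(self, images, **kwargs):
        unexpected = set(kwargs) - set(self._valid_kwargs_names)
        if unexpected:
            warnings.warn(f"Unused or unrecognized kwargs: {sorted(unexpected)}")
        return {"images": images, **kwargs}


proc = FastProcessorSketch()
proc.preprocess(["img.png"], do_resize=True)  # fine
proc.preprocess(["img.png"], do_risize=True)  # warns about the typo
```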
src/transformers/models/aria/modular_aria.py (2 changes: 0 additions & 2 deletions)

@@ -959,8 +959,6 @@ def __call__(
         self,
         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
         images: Optional[ImageInput] = None,
-        audio=None,
-        videos=None,
         **kwargs: Unpack[AriaProcessorKwargs],
     ) -> BatchFeature:
         """
src/transformers/models/aria/processing_aria.py (2 changes: 0 additions & 2 deletions)

@@ -85,8 +85,6 @@ def __call__(
         self,
         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
         images: Optional[ImageInput] = None,
-        audio=None,
-        videos=None,
         **kwargs: Unpack[AriaProcessorKwargs],
     ) -> BatchFeature:
         """
src/transformers/models/aya_vision/processing_aya_vision.py (11 changes: 1 addition & 10 deletions)

@@ -19,18 +19,11 @@

 from ...image_processing_utils import BatchFeature
 from ...image_utils import ImageInput, make_flat_list_of_images
-from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 
 
-class AyaVisionImagesKwargs(ImagesKwargs, total=False):
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
-
-
 class AyaVisionProcessorKwargs(ProcessingKwargs, total=False):
-    images_kwargs: AyaVisionImagesKwargs
     _defaults = {
         "text_kwargs": {
             "padding_side": "left",
@@ -140,8 +133,6 @@ def __call__(
         self,
         images: Optional[ImageInput] = None,
         text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        audio=None,
-        videos=None,
         **kwargs: Unpack[AyaVisionProcessorKwargs],
     ) -> BatchFeature:
         """
src/transformers/models/beit/image_processing_beit.py (13 changes: 13 additions & 0 deletions)

@@ -33,6 +33,7 @@
     valid_images,
     validate_preprocess_arguments,
 )
+from ...processing_utils import ImagesKwargs
 from ...utils import (
     TensorType,
     filter_out_non_signature_kwargs,
@@ -54,6 +55,17 @@
 logger = logging.get_logger(__name__)
 
 
+class BeitImageProcessorKwargs(ImagesKwargs):
+    r"""
+    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
+        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
+        is used for background, and background itself is not included in all classes of a dataset (e.g.
+        ADE20k). The background label will be replaced by 255.
+    """
+
+    do_reduce_labels: Optional[bool]
+
+
 @requires(backends=("vision",))
 class BeitImageProcessor(BaseImageProcessor):
     r"""
@@ -99,6 +111,7 @@ class BeitImageProcessor(BaseImageProcessor):
     """
 
     model_input_names = ["pixel_values"]
+    valid_kwargs = BeitImageProcessorKwargs
 
     @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS)
     def __init__(
src/transformers/models/beit/image_processing_beit_fast.py (21 changes: 5 additions & 16 deletions)

@@ -21,7 +21,6 @@
 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import (
     BaseImageProcessorFast,
-    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -40,6 +39,7 @@
     auto_docstring,
     is_torchvision_v2_available,
 )
+from .image_processing_beit import BeitImageProcessorKwargs
 
 
 if is_torchvision_v2_available():
@@ -48,17 +48,6 @@
     from torchvision.transforms import functional as F
 
 
-class BeitFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
-    r"""
-    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
-        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
-        is used for background, and background itself is not included in all classes of a dataset (e.g.
-        ADE20k). The background label will be replaced by 255.
-    """
-
-    do_reduce_labels: Optional[bool]
-
-
 @auto_docstring
 class BeitImageProcessorFast(BaseImageProcessorFast):
     resample = PILImageResampling.BICUBIC
@@ -72,9 +61,9 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
     do_reduce_labels = False
-    valid_kwargs = BeitFastImageProcessorKwargs
+    valid_kwargs = BeitImageProcessorKwargs
 
-    def __init__(self, **kwargs: Unpack[BeitFastImageProcessorKwargs]):
+    def __init__(self, **kwargs: Unpack[BeitImageProcessorKwargs]):
         super().__init__(**kwargs)
 
     def reduce_label(self, labels: list["torch.Tensor"]):
@@ -92,7 +81,7 @@ def preprocess(
         self,
         images: ImageInput,
         segmentation_maps: Optional[ImageInput] = None,
-        **kwargs: Unpack[BeitFastImageProcessorKwargs],
+        **kwargs: Unpack[BeitImageProcessorKwargs],
     ) -> BatchFeature:
         r"""
         segmentation_maps (`ImageInput`, *optional*):
@@ -107,7 +96,7 @@ def _preprocess_image_like_inputs(
         do_convert_rgb: bool,
         input_data_format: ChannelDimension,
         device: Optional[Union[str, "torch.device"]] = None,
-        **kwargs: Unpack[BeitFastImageProcessorKwargs],
+        **kwargs: Unpack[BeitImageProcessorKwargs],
     ) -> BatchFeature:
         """
         Preprocess image-like inputs.
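The BEiT files show the pattern the PR applies model by model: the kwargs class now lives once in the slow processor's module, extends the shared `ImagesKwargs`, and is imported by the fast processor instead of each file defining its own variant. A minimal sketch of that inheritance with stand-in classes, not the transformers definitions:

```python
from typing import Optional, TypedDict


class ImagesKwargsSketch(TypedDict, total=False):
    # Shared, model-agnostic preprocessing options.
    do_resize: Optional[bool]
    do_normalize: Optional[bool]


class BeitKwargsSketch(ImagesKwargsSketch, total=False):
    # Model-specific addition: BEiT's segmentation maps use label reduction.
    do_reduce_labels: Optional[bool]


# Slow and fast processors can now share one definition:
class SlowProcessorSketch:
    valid_kwargs = BeitKwargsSketch


class FastProcessorSketch:
    valid_kwargs = BeitKwargsSketch


# TypedDict merges inherited annotations, so the full key set is visible:
print(list(BeitKwargsSketch.__annotations__))
# ['do_resize', 'do_normalize', 'do_reduce_labels']
```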
src/transformers/models/blip/processing_blip.py (3 changes: 0 additions & 3 deletions)

@@ -36,7 +36,6 @@ class BlipProcessorKwargs(ProcessingKwargs, total=False):
             "return_length": False,
             "verbose": True,
         },
-        "images_kwargs": {},
     }


@@ -67,8 +66,6 @@ def __call__(
         self,
         images: Optional[ImageInput] = None,
         text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
-        audio=None,
-        videos=None,
         **kwargs: Unpack[BlipProcessorKwargs],
     ) -> BatchEncoding:
         """
src/transformers/models/blip_2/processing_blip_2.py (3 changes: 0 additions & 3 deletions)

@@ -41,7 +41,6 @@ class Blip2ProcessorKwargs(ProcessingKwargs, total=False):
             "return_length": False,
             "verbose": True,
         },
-        "images_kwargs": {},
     }


@@ -81,8 +80,6 @@ def __call__(
         self,
         images: Optional[ImageInput] = None,
         text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
-        audio=None,
-        videos=None,
         **kwargs: Unpack[Blip2ProcessorKwargs],
     ) -> BatchEncoding:
         """
src/transformers/models/bridgetower/image_processing_bridgetower.py

@@ -35,6 +35,7 @@
     valid_images,
     validate_preprocess_arguments,
 )
+from ...processing_utils import ImagesKwargs
 from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging


@@ -122,6 +123,10 @@ def get_resize_output_image_size(
     return new_height, new_width
 
 
+class BridgeTowerImageProcessorKwargs(ImagesKwargs):
+    size_divisor: Optional[int]
+
+
 class BridgeTowerImageProcessor(BaseImageProcessor):
     r"""
     Constructs a BridgeTower image processor.
@@ -169,6 +174,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
     """
 
     model_input_names = ["pixel_values", "pixel_mask"]
+    valid_kwargs = BridgeTowerImageProcessorKwargs
 
     def __init__(
         self,
src/transformers/models/bridgetower/image_processing_bridgetower_fast.py

@@ -22,7 +22,6 @@
 from ...image_processing_utils_fast import (
     BaseImageProcessorFast,
     BatchFeature,
-    DefaultFastImageProcessorKwargs,
     ImageInput,
     SizeDict,
     TensorType,
@@ -32,6 +31,7 @@
 )
 from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
 from ...utils import auto_docstring, is_torchvision_v2_available
+from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs
 
 
 if is_torchvision_v2_available():
@@ -90,17 +90,6 @@ def get_resize_output_image_size(
     return new_height, new_width
 
 
-class BridgeTowerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
-    """
-    Args:
-        size_divisor (`int`, *optional*, defaults to 32):
-            The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize`
-            is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method.
-    """
-
-    size_divisor: Optional[int]
-
-
 @auto_docstring
 class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
     resample = PILImageResampling.BICUBIC
@@ -115,14 +104,14 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
     do_normalize = True
     do_pad = True
     size_divisor = 32
-    valid_kwargs = BridgeTowerFastImageProcessorKwargs
+    valid_kwargs = BridgeTowerImageProcessorKwargs
     model_input_names = ["pixel_values", "pixel_mask"]
 
-    def __init__(self, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]):
+    def __init__(self, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]):
         super().__init__(**kwargs)
 
     @auto_docstring
-    def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)
 
     def resize(
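For reference, the `size_divisor` option whose docstring was removed above controls divisibility of the resized output: height and width are pushed to a multiple of the divisor (32 by default) so later patch or window operations divide evenly. A sketch of the arithmetic only, assuming ceil-rounding; this is not the transformers resize code:

```python
import math


def round_to_divisor(height: int, width: int, size_divisor: int = 32) -> tuple[int, int]:
    # Round each dimension up to the nearest multiple of `size_divisor`.
    new_height = math.ceil(height / size_divisor) * size_divisor
    new_width = math.ceil(width / size_divisor) * size_divisor
    return new_height, new_width


print(round_to_divisor(289, 300))  # (320, 320), both divisible by 32
```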
src/transformers/models/bridgetower/processing_bridgetower.py

@@ -16,17 +16,10 @@
 Processor class for BridgeTower.
 """
 
-from typing import Optional
-
-from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
-
-
-class BridgeTowerImagesKwargs(ImagesKwargs):
-    size_divisor: Optional[int]
+from ...processing_utils import ProcessingKwargs, ProcessorMixin
 
 
 class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False):
-    images_kwargs: BridgeTowerImagesKwargs
     _defaults = {
         "text_kwargs": {
             "add_special_tokens": True,