huggingface
diff --git a/‎docs/source/en/_toctree.yml‎
Lines changed: 2 additions & 0 deletions b/‎docs/source/en/_toctree.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/source/en/model_doc/internvl.md‎
Lines changed: 57 additions & 0 deletions b/‎docs/source/en/model_doc/internvl.md‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎src/transformers/__init__.py‎
Lines changed: 14 additions & 0 deletions b/‎src/transformers/__init__.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎src/transformers/models/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎src/transformers/models/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/transformers/models/auto/configuration_auto.py‎
Lines changed: 2 additions & 0 deletions b/‎src/transformers/models/auto/configuration_auto.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/transformers/models/auto/modeling_auto.py‎
Lines changed: 2 additions & 0 deletions b/‎src/transformers/models/auto/modeling_auto.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/transformers/models/auto/processing_auto.py‎
Lines changed: 1 addition & 0 deletions b/‎src/transformers/models/auto/processing_auto.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/transformers/models/auto/tokenization_auto.py‎
Lines changed: 1 addition & 0 deletions b/‎src/transformers/models/auto/tokenization_auto.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/transformers/models/internvl/__init__.py‎
Lines changed: 30 additions & 0 deletions b/‎src/transformers/models/internvl/__init__.py‎
Lines changed: 30 additions & 0 deletions
@@ -886,6 +886,8 @@
         title: InstructBLIP
       - local: model_doc/instructblipvideo
         title: InstructBlipVideo
+      - local: model_doc/internvl
+        title: InternVL
       - local: model_doc/kosmos-2
         title: KOSMOS-2
       - local: model_doc/layoutlm
 
@@ -0,0 +1,57 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# InternVL
+
+## Overview
+
+The InternVL model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+<INSERT SHORT SUMMARY HERE>
+
+The abstract from the paper is the following:
+
+*<INSERT PAPER ABSTRACT HERE>*
+
+Tips:
+
+<INSERT TIPS ABOUT MODEL HERE>
+
+This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
+The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
+
+
+## InternVLConfig
+
+[[autodoc]] InternVLConfig
+
+## InternVLModel
+
+[[autodoc]] InternVLModel
+    - forward
+
+## InternVLImageProcessor
+
+[[autodoc]] InternVLImageProcessor
+    - preprocess
+
+## InternVLImageProcessorFast
+
+[[autodoc]] InternVLImageProcessorFast
+    - preprocess
+
+## InternVLProcessor
+
+[[autodoc]] InternVLProcessor
@@ -695,6 +695,7 @@
         "Pix2StructVisionConfig",
     ],
     "models.pixtral": ["PixtralProcessor", "PixtralVisionConfig"],
+    "models.internvl": ["InternVLProcessor", "InternVLConfig"],
     "models.plbart": ["PLBartConfig"],
     "models.poolformer": ["PoolFormerConfig"],
     "models.pop2piano": ["Pop2PianoConfig"],
@@ -1266,6 +1267,7 @@
     _import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"])
     _import_structure["models.pix2struct"].extend(["Pix2StructImageProcessor"])
     _import_structure["models.pixtral"].append("PixtralImageProcessor")
+    _import_structure["models.internvl"].append("InternVLImageProcessor")
     _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
     _import_structure["models.pvt"].extend(["PvtImageProcessor"])
     _import_structure["models.qwen2_5_vl"].extend(["Qwen2_5_VLImageProcessor"])
@@ -1304,6 +1306,7 @@
     _import_structure["models.deformable_detr"].append("DeformableDetrImageProcessorFast")
     _import_structure["models.detr"].append("DetrImageProcessorFast")
     _import_structure["models.pixtral"].append("PixtralImageProcessorFast")
+    _import_structure["models.internvl"].append("InternVLImageProcessorFast")
     _import_structure["models.qwen2_vl"].append("Qwen2VLImageProcessorFast")
     _import_structure["models.rt_detr"].append("RTDetrImageProcessorFast")
     _import_structure["models.vit"].append("ViTImageProcessorFast")
@@ -3225,6 +3228,7 @@
         ]
     )
     _import_structure["models.pixtral"].extend(["PixtralPreTrainedModel", "PixtralVisionModel"])
+    _import_structure["models.internvl"].extend(["InternVLPreTrainedModel", "InternVLModel"])
     _import_structure["models.plbart"].extend(
         [
             "PLBartForCausalLM",
@@ -5589,6 +5593,10 @@
         InstructBlipVideoQFormerConfig,
         InstructBlipVideoVisionConfig,
     )
+    from .models.internvl import (
+        InternVLConfig,
+        InternVLProcessor,
+    )
     from .models.jamba import JambaConfig
     from .models.jetmoe import JetMoeConfig
     from .models.kosmos2 import (
@@ -6348,6 +6356,7 @@
         from .models.idefics3 import Idefics3ImageProcessor
         from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor
         from .models.instructblipvideo import InstructBlipVideoImageProcessor
+        from .models.internvl import InternVLImageProcessor
         from .models.layoutlmv2 import (
             LayoutLMv2FeatureExtractor,
             LayoutLMv2ImageProcessor,
@@ -6419,6 +6428,7 @@
         from .image_processing_utils_fast import BaseImageProcessorFast
         from .models.deformable_detr import DeformableDetrImageProcessorFast
         from .models.detr import DetrImageProcessorFast
+        from .models.internvl import InternVLImageProcessorFast
         from .models.pixtral import PixtralImageProcessorFast
         from .models.qwen2_vl import Qwen2VLImageProcessorFast
         from .models.rt_detr import RTDetrImageProcessorFast
@@ -7491,6 +7501,10 @@
             InstructBlipVideoQFormerModel,
             InstructBlipVideoVisionModel,
         )
+        from .models.internvl import (
+            InternVLModel,
+            InternVLPreTrainedModel,
+        )
         from .models.jamba import (
             JambaForCausalLM,
             JambaForSequenceClassification,
 
@@ -130,6 +130,7 @@
     informer,
     instructblip,
     instructblipvideo,
+    internvl,
     jamba,
     jetmoe,
     kosmos2,
 
@@ -151,6 +151,7 @@
         ("informer", "InformerConfig"),
         ("instructblip", "InstructBlipConfig"),
         ("instructblipvideo", "InstructBlipVideoConfig"),
+        ("internvl", "InternVLConfig"),
         ("jamba", "JambaConfig"),
         ("jetmoe", "JetMoeConfig"),
         ("jukebox", "JukeboxConfig"),
@@ -478,6 +479,7 @@
         ("informer", "Informer"),
         ("instructblip", "InstructBLIP"),
         ("instructblipvideo", "InstructBlipVideo"),
+        ("internvl", "InternVL"),
         ("jamba", "Jamba"),
         ("jetmoe", "JetMoe"),
         ("jukebox", "Jukebox"),
 
@@ -143,6 +143,7 @@
         ("ijepa", "IJepaModel"),
         ("imagegpt", "ImageGPTModel"),
         ("informer", "InformerModel"),
+        ("internvl", "InternVLModel"),
         ("jamba", "JambaModel"),
         ("jetmoe", "JetMoeModel"),
         ("jukebox", "JukeboxModel"),
@@ -815,6 +816,7 @@
         ("idefics2", "Idefics2ForConditionalGeneration"),
         ("idefics3", "Idefics3ForConditionalGeneration"),
         ("instructblip", "InstructBlipForConditionalGeneration"),
+        ("internvl", "LlavaForConditionalGeneration"),
         ("kosmos-2", "Kosmos2ForConditionalGeneration"),
         ("llava", "LlavaForConditionalGeneration"),
         ("llava_next", "LlavaNextForConditionalGeneration"),
 
@@ -71,6 +71,7 @@
         ("idefics3", "Idefics3Processor"),
         ("instructblip", "InstructBlipProcessor"),
         ("instructblipvideo", "InstructBlipVideoProcessor"),
+        ("internvl", "PixtralProcessor"),
         ("kosmos-2", "Kosmos2Processor"),
         ("layoutlmv2", "LayoutLMv2Processor"),
         ("layoutlmv3", "LayoutLMv3Processor"),
 
@@ -235,6 +235,7 @@
             ("idefics3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
             ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
+            ("internvl", (None, "InternVLTokenizer" if is_tokenizers_available() else None)),
             (
                 "jamba",
                 (
 
@@ -0,0 +1,30 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_internvl import *
+    from .image_processing_internvl import *
+    from .image_processing_internvl_fast import *
+    from .modeling_internvl import *
+    from .processing_internvl import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
Original file line number	Diff line number	Diff line change
`@@ -235,6 +235,7 @@`
`235`	`235`	`("idefics3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),`
`236`	`236`	`("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),`
`237`	`237`	`("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),`
	`238`	`+ ("internvl", (None, "InternVLTokenizer" if is_tokenizers_available() else None)),`
`238`	`239`	`(`
`239`	`240`	`"jamba",`
`240`	`241`	`(`