Skip to content

Commit b8d9dcb

Browse files
committed
initial commit
1 parent ec7afad commit b8d9dcb

18 files changed

+3904
-0
lines changed

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -886,6 +886,8 @@
886886
title: InstructBLIP
887887
- local: model_doc/instructblipvideo
888888
title: InstructBlipVideo
889+
- local: model_doc/internvl
890+
title: InternVL
889891
- local: model_doc/kosmos-2
890892
title: KOSMOS-2
891893
- local: model_doc/layoutlm
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4+
the License. You may obtain a copy of the License at
5+
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
8+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9+
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10+
specific language governing permissions and limitations under the License.
11+
12+
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13+
rendered properly in your Markdown viewer.
14+
15+
-->
16+
17+
# InternVL
18+
19+
## Overview
20+
21+
The InternVL model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
22+
<INSERT SHORT SUMMARY HERE>
23+
24+
The abstract from the paper is the following:
25+
26+
*<INSERT PAPER ABSTRACT HERE>*
27+
28+
Tips:
29+
30+
<INSERT TIPS ABOUT MODEL HERE>
31+
32+
This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
33+
The original code can be found [here](https://github.com/OpenGVLab/InternVL).
34+
35+
36+
## InternVLConfig
37+
38+
[[autodoc]] InternVLConfig
39+
40+
## InternVLModel
41+
42+
[[autodoc]] InternVLModel
43+
- forward
44+
45+
## InternVLImageProcessor
46+
47+
[[autodoc]] InternVLImageProcessor
48+
- preprocess
49+
50+
## InternVLImageProcessorFast
51+
52+
[[autodoc]] InternVLImageProcessorFast
53+
- preprocess
54+
55+
## InternVLProcessor
56+
57+
[[autodoc]] InternVLProcessor

src/transformers/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,7 @@
695695
"Pix2StructVisionConfig",
696696
],
697697
"models.pixtral": ["PixtralProcessor", "PixtralVisionConfig"],
698+
"models.internvl": ["InternVLProcessor", "InternVLConfig"],
698699
"models.plbart": ["PLBartConfig"],
699700
"models.poolformer": ["PoolFormerConfig"],
700701
"models.pop2piano": ["Pop2PianoConfig"],
@@ -1266,6 +1267,7 @@
12661267
_import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"])
12671268
_import_structure["models.pix2struct"].extend(["Pix2StructImageProcessor"])
12681269
_import_structure["models.pixtral"].append("PixtralImageProcessor")
1270+
_import_structure["models.internvl"].append("InternVLImageProcessor")
12691271
_import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
12701272
_import_structure["models.pvt"].extend(["PvtImageProcessor"])
12711273
_import_structure["models.qwen2_5_vl"].extend(["Qwen2_5_VLImageProcessor"])
@@ -1304,6 +1306,7 @@
13041306
_import_structure["models.deformable_detr"].append("DeformableDetrImageProcessorFast")
13051307
_import_structure["models.detr"].append("DetrImageProcessorFast")
13061308
_import_structure["models.pixtral"].append("PixtralImageProcessorFast")
1309+
_import_structure["models.internvl"].append("InternVLImageProcessorFast")
13071310
_import_structure["models.qwen2_vl"].append("Qwen2VLImageProcessorFast")
13081311
_import_structure["models.rt_detr"].append("RTDetrImageProcessorFast")
13091312
_import_structure["models.vit"].append("ViTImageProcessorFast")
@@ -3225,6 +3228,7 @@
32253228
]
32263229
)
32273230
_import_structure["models.pixtral"].extend(["PixtralPreTrainedModel", "PixtralVisionModel"])
3231+
_import_structure["models.internvl"].extend(["InternVLPreTrainedModel", "InternVLModel"])
32283232
_import_structure["models.plbart"].extend(
32293233
[
32303234
"PLBartForCausalLM",
@@ -5589,6 +5593,10 @@
55895593
InstructBlipVideoQFormerConfig,
55905594
InstructBlipVideoVisionConfig,
55915595
)
5596+
from .models.internvl import (
5597+
InternVLConfig,
5598+
InternVLProcessor,
5599+
)
55925600
from .models.jamba import JambaConfig
55935601
from .models.jetmoe import JetMoeConfig
55945602
from .models.kosmos2 import (
@@ -6348,6 +6356,7 @@
63486356
from .models.idefics3 import Idefics3ImageProcessor
63496357
from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor
63506358
from .models.instructblipvideo import InstructBlipVideoImageProcessor
6359+
from .models.internvl import InternVLImageProcessor
63516360
from .models.layoutlmv2 import (
63526361
LayoutLMv2FeatureExtractor,
63536362
LayoutLMv2ImageProcessor,
@@ -6419,6 +6428,7 @@
64196428
from .image_processing_utils_fast import BaseImageProcessorFast
64206429
from .models.deformable_detr import DeformableDetrImageProcessorFast
64216430
from .models.detr import DetrImageProcessorFast
6431+
from .models.internvl import InternVLImageProcessorFast
64226432
from .models.pixtral import PixtralImageProcessorFast
64236433
from .models.qwen2_vl import Qwen2VLImageProcessorFast
64246434
from .models.rt_detr import RTDetrImageProcessorFast
@@ -7491,6 +7501,10 @@
74917501
InstructBlipVideoQFormerModel,
74927502
InstructBlipVideoVisionModel,
74937503
)
7504+
from .models.internvl import (
7505+
InternVLModel,
7506+
InternVLPreTrainedModel,
7507+
)
74947508
from .models.jamba import (
74957509
JambaForCausalLM,
74967510
JambaForSequenceClassification,

src/transformers/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@
130130
informer,
131131
instructblip,
132132
instructblipvideo,
133+
internvl,
133134
jamba,
134135
jetmoe,
135136
kosmos2,

src/transformers/models/auto/configuration_auto.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@
151151
("informer", "InformerConfig"),
152152
("instructblip", "InstructBlipConfig"),
153153
("instructblipvideo", "InstructBlipVideoConfig"),
154+
("internvl", "InternVLConfig"),
154155
("jamba", "JambaConfig"),
155156
("jetmoe", "JetMoeConfig"),
156157
("jukebox", "JukeboxConfig"),
@@ -478,6 +479,7 @@
478479
("informer", "Informer"),
479480
("instructblip", "InstructBLIP"),
480481
("instructblipvideo", "InstructBlipVideo"),
482+
("internvl", "InternVL"),
481483
("jamba", "Jamba"),
482484
("jetmoe", "JetMoe"),
483485
("jukebox", "Jukebox"),

src/transformers/models/auto/modeling_auto.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@
143143
("ijepa", "IJepaModel"),
144144
("imagegpt", "ImageGPTModel"),
145145
("informer", "InformerModel"),
146+
("internvl", "InternVLModel"),
146147
("jamba", "JambaModel"),
147148
("jetmoe", "JetMoeModel"),
148149
("jukebox", "JukeboxModel"),
@@ -815,6 +816,7 @@
815816
("idefics2", "Idefics2ForConditionalGeneration"),
816817
("idefics3", "Idefics3ForConditionalGeneration"),
817818
("instructblip", "InstructBlipForConditionalGeneration"),
819+
("internvl", "LlavaForConditionalGeneration"),
818820
("kosmos-2", "Kosmos2ForConditionalGeneration"),
819821
("llava", "LlavaForConditionalGeneration"),
820822
("llava_next", "LlavaNextForConditionalGeneration"),

src/transformers/models/auto/processing_auto.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
("idefics3", "Idefics3Processor"),
7272
("instructblip", "InstructBlipProcessor"),
7373
("instructblipvideo", "InstructBlipVideoProcessor"),
74+
("internvl", "InternVLProcessor"),  # use the processor this model package exports, not the Pixtral template leftover
7475
("kosmos-2", "Kosmos2Processor"),
7576
("layoutlmv2", "LayoutLMv2Processor"),
7677
("layoutlmv3", "LayoutLMv3Processor"),

src/transformers/models/auto/tokenization_auto.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@
235235
("idefics3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
236236
("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
237237
("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
238+
("internvl", (None, "InternVLTokenizer" if is_tokenizers_available() else None)),
238239
(
239240
"jamba",
240241
(
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright 2025 The HuggingFace Team. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
from typing import TYPE_CHECKING
15+
16+
from ...utils import _LazyModule
17+
from ...utils.import_utils import define_import_structure
18+
19+
20+
# Under static type checking, star-import every submodule so the public names
# are visible to analyzers without going through the lazy module.
if TYPE_CHECKING:
21+
from .configuration_internvl import *
22+
from .image_processing_internvl import *
23+
from .image_processing_internvl_fast import *
24+
from .modeling_internvl import *
25+
from .processing_internvl import *
26+
else:
27+
import sys
28+
29+
_file = globals()["__file__"]
30+
# At runtime, replace this package's entry in sys.modules with a _LazyModule
# built from define_import_structure(_file).
# NOTE(review): presumably this defers submodule imports until first
# attribute access — confirm against utils.import_utils.
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

0 commit comments

Comments
 (0)