@@ -101,7 +101,6 @@ class DeepseekVLV2TextConfig(DeepseekV2Config):
101101
102102class DeepseekVLV2Config (PretrainedConfig ):
103103 model_type = "deepseek_vl_v2"
104- architectures : list [str ] | None = None
105104
106105 tile_tag : str = "2D"
107106 global_view_pos : str = "head"
@@ -114,26 +113,20 @@ def __init__(
114113 candidate_resolutions : tuple [tuple [int , int ]] = ((384 , 384 ),),
115114 ** kwargs ,
116115 ):
117- if "architectures" not in kwargs :
118- kwargs ["architectures" ] = ["DeepseekVLV2ForCausalLM" ]
116+ architectures = kwargs .setdefault ("architectures" , ["DeepseekVLV2ForCausalLM" ])
119117
120- vision_config = kwargs .pop ("vision_config" , {})
121- self .vision_config = VisionEncoderConfig (** vision_config )
122-
123- projector_config = kwargs .pop ("projector_config" , {})
124- self .projector_config = MlpProjectorConfig (** projector_config )
125-
126- language_config = kwargs .pop ("language_config" , {})
127- self .text_config = DeepseekVLV2TextConfig (** language_config )
118+ self .vision_config = VisionEncoderConfig (** kwargs .pop ("vision_config" , {}))
119+ self .projector_config = MlpProjectorConfig (** kwargs .pop ("projector_config" , {}))
120+ self .text_config = DeepseekVLV2TextConfig (** kwargs .pop ("language_config" , {}))
128121
129122 self .tile_tag = tile_tag
130123 self .global_view_pos = global_view_pos
131124 self .candidate_resolutions = candidate_resolutions
132125 self .vocab_size = self .text_config .vocab_size
133126
134127 # update model_type for OCR models
135- if "DeepseekOCRForCausalLM" in kwargs ["architectures" ] :
136- self . model_type = "deepseek_ocr"
137- elif "DeepseekOCR2ForCausalLM" in kwargs ["architectures" ] :
138- self . model_type = "deepseek_ocr2"
128+ if "DeepseekOCRForCausalLM" in architectures :
129+ kwargs ["model_type" ] = "deepseek_ocr"
130+ elif "DeepseekOCR2ForCausalLM" in architectures :
131+ kwargs ["model_type" ] = "deepseek_ocr2"
139132 super ().__init__ (** kwargs )
0 commit comments