Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def pytest_configure(config):
config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")

os.environ['DISABLE_SAFETENSORS_CONVERSION'] = 'true'
os.environ["DISABLE_SAFETENSORS_CONVERSION"] = "true"


def pytest_collection_modifyitems(items):
Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/align.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,6 @@ for label, score in zip(candidate_labels, probs):
## AlignConfig

[[autodoc]] AlignConfig
- from_text_vision_configs

## AlignTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/blip-2.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ If you're interested in submitting a resource to be included here, please feel f
## Blip2Config

[[autodoc]] Blip2Config
- from_vision_qformer_text_configs

## Blip2VisionConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/blip.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/exam
## BlipConfig

[[autodoc]] BlipConfig
- from_text_vision_configs

## BlipTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/chinese_clip.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ Currently, following scales of pretrained Chinese-CLIP models are available on
## ChineseCLIPConfig

[[autodoc]] ChineseCLIPConfig
- from_text_vision_configs

## ChineseCLIPTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/clap.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ print(f"Text embeddings: {text_features}")
## ClapConfig

[[autodoc]] ClapConfig
- from_text_audio_configs

## ClapTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/clip.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_
## CLIPConfig

[[autodoc]] CLIPConfig
- from_text_vision_configs

## CLIPTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/clipseg.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
## CLIPSegConfig

[[autodoc]] CLIPSegConfig
- from_text_vision_configs

## CLIPSegTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/clvp.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ Example :
## ClvpConfig

[[autodoc]] ClvpConfig
- from_sub_model_configs

## ClvpEncoderConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/groupvit.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
## GroupViTConfig

[[autodoc]] GroupViTConfig
- from_text_vision_configs

## GroupViTTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/instructblip.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
## InstructBlipConfig

[[autodoc]] InstructBlipConfig
- from_vision_qformer_text_configs

## InstructBlipVisionConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/instructblipvideo.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
## InstructBlipVideoConfig

[[autodoc]] InstructBlipVideoConfig
- from_vision_qformer_text_configs

## InstructBlipVideoVisionConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/metaclip_2.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_
## MetaClip2Config

[[autodoc]] MetaClip2Config
- from_text_vision_configs

## MetaClip2TextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/owlv2.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce
## Owlv2Config

[[autodoc]] Owlv2Config
- from_text_vision_configs

## Owlv2TextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/owlvit.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object de
## OwlViTConfig

[[autodoc]] OwlViTConfig
- from_text_vision_configs

## OwlViTTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/pix2struct.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ The original code can be found [here](https://github.com/google-research/pix2str
## Pix2StructConfig

[[autodoc]] Pix2StructConfig
- from_text_vision_configs

## Pix2StructTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/siglip.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
## SiglipConfig

[[autodoc]] SiglipConfig
- from_text_vision_configs

## SiglipTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/en/model_doc/xclip.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ If you're interested in submitting a resource to be included here, please feel f
## XCLIPConfig

[[autodoc]] XCLIPConfig
- from_text_vision_configs

## XCLIPTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ja/model_doc/align.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ ALIGNの使用を開始するのに役立つ公式のHugging Faceとコミュニ
## AlignConfig

[[autodoc]] AlignConfig
- from_text_vision_configs

## AlignTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ja/model_doc/altclip.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ Transformerエンコーダーに画像を与えるには、各画像を固定サ
## AltCLIPConfig

[[autodoc]] AltCLIPConfig
- from_text_vision_configs

## AltCLIPTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ja/model_doc/blip-2.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ BLIP-2 の使用を開始するのに役立つ公式 Hugging Face およびコ
## Blip2Config

[[autodoc]] Blip2Config
- from_vision_qformer_text_configs

## Blip2VisionConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ja/model_doc/blip.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ BLIP は、次のようなさまざまなマルチモーダル タスクを実
## BlipConfig

[[autodoc]] BlipConfig
- from_text_vision_configs

## BlipTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ja/model_doc/chinese_clip.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ Chinese-CLIP モデルは、[OFA-Sys](https://huggingface.co/OFA-Sys) によっ
## ChineseCLIPConfig

[[autodoc]] ChineseCLIPConfig
- from_text_vision_configs

## ChineseCLIPTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ja/model_doc/clap.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ CLAP (Contrastive Language-Audio Pretraining) は、さまざまな (音声、
## ClapConfig

[[autodoc]] ClapConfig
- from_text_audio_configs

## ClapTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ja/model_doc/clip.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ CLIP を使い始めるのに役立つ公式 Hugging Face およびコミュニ
## CLIPConfig

[[autodoc]] CLIPConfig
- from_text_vision_configs

## CLIPTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ja/model_doc/clipseg.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ CLIPSeg の使用を開始するのに役立つ、公式 Hugging Face および
## CLIPSegConfig

[[autodoc]] CLIPSegConfig
- from_text_vision_configs

## CLIPSegTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ja/model_doc/clvp.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ CLVP (Contrastive Language-Voice Pretrained Transformer) モデルは、James Be
## ClvpConfig

[[autodoc]] ClvpConfig
- from_sub_model_configs

## ClvpEncoderConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ko/model_doc/altclip.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ AltCLIP은 멀티모달 비전 및 언어 모델입니다. 이미지와 텍스
## AltCLIPConfig

[[autodoc]] AltCLIPConfig
- from_text_vision_configs

## AltCLIPTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ko/model_doc/blip-2.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ BLIP-2를 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티
## Blip2Config[[transformers.Blip2Config]]

[[autodoc]] Blip2Config
- from_vision_qformer_text_configs

## Blip2VisionConfig[[transformers.Blip2VisionConfig]]

Expand Down
1 change: 0 additions & 1 deletion docs/source/ko/model_doc/blip.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ BLIP은 여러 멀티모달 작업을 수행할 수 있는 모델입니다:
## BlipConfig[[transformers.BlipConfig]]

[[autodoc]] BlipConfig
- from_text_vision_configs

## BlipTextConfig[[transformers.BlipTextConfig]]

Expand Down
1 change: 0 additions & 1 deletion docs/source/ko/model_doc/clip.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,6 @@ CLIP을 시작하는 데 도움이 되는 Hugging Face와 community 자료 목
## CLIPConfig[[transformers.CLIPConfig]]

[[autodoc]] CLIPConfig
- from_text_vision_configs

## CLIPTextConfig[[transformers.CLIPTextConfig]]

Expand Down
1 change: 0 additions & 1 deletion docs/source/ko/model_doc/clipseg.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ CLIPSeg를 시작하는 데 도움이 될 Hugging Face 공식 자료와 커뮤
## CLIPSegConfig[[transformers.CLIPSegConfig]]

[[autodoc]] CLIPSegConfig
- from_text_vision_configs

## CLIPSegTextConfig[[transformers.CLIPSegTextConfig]]

Expand Down
1 change: 0 additions & 1 deletion docs/source/ko/model_doc/siglip.md
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,6 @@ PyTorch는 `torch.nn.functional`의 일부로 스케일된 점곱 어텐션(SDPA
## SiglipConfig

[[autodoc]] SiglipConfig
- from_text_vision_configs

## SiglipTextConfig

Expand Down
1 change: 0 additions & 1 deletion docs/source/ko/model_doc/xclip.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ X-CLIP을 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티
## XCLIPConfig[[xclipconfig]]

[[autodoc]] XCLIPConfig
- from_text_vision_configs

## XCLIPTextConfig[[xcliptextconfig]]

Expand Down
42 changes: 3 additions & 39 deletions src/transformers/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1142,11 +1142,11 @@ def _get_non_default_generation_parameters(self) -> dict[str, Any]:
non_default_generation_parameters = {}
decoder_attribute_name = None

# Composite models don't have a default config, use their decoder config as a fallback for default values
# Some composite models don't have a default config, use their decoder config as a fallback for default values
# If no known pattern is matched, then `default_config = None` -> check against the global generation defaults
try:
if not self.has_no_defaults_at_init:
default_config = self.__class__()
except ValueError:
else:
decoder_config = self.get_text_config(decoder=True)
if decoder_config is not self:
default_config = decoder_config.__class__()
Expand Down Expand Up @@ -1257,42 +1257,6 @@ def get_text_config(self, decoder=None, encoder=None) -> "PretrainedConfig":

return config_to_return

@classmethod
def from_text_vision_configs(cls, text_config, vision_config, **kwargs):
    r"""
    Build a composite model configuration out of a text sub-model configuration and a
    vision sub-model configuration.

    Deprecated in favor of constructing the config class directly, e.g.
    ``MyConfig(text_config=..., vision_config=...)``.

    Returns:
        [`PreTrainedConfig`]: An instance of a configuration object
    """

    deprecation_message = (
        "The `from_text_vision_configs` method is deprecated and will be removed in v4.60 of Transformers. Please instantiate "
        "the config class directly with `MyConfig(text_config=text_config, vision_config=vision_config, **kwargs)` instead."
    )
    warnings.warn(deprecation_message, FutureWarning)

    # Serialize each sub-config so the target class re-hydrates them itself.
    text_dict = text_config.to_dict()
    vision_dict = vision_config.to_dict()
    return cls(text_config=text_dict, vision_config=vision_dict, **kwargs)

@classmethod
def from_text_audio_configs(cls, text_config, audio_config, **kwargs):
    r"""
    Build a composite model configuration out of a text sub-model configuration and an
    audio sub-model configuration.

    Deprecated in favor of constructing the config class directly, e.g.
    ``MyConfig(text_config=..., audio_config=...)``.

    Returns:
        [`PreTrainedConfig`]: An instance of a configuration object
    """

    deprecation_message = (
        "The `from_text_audio_configs` method is deprecated and will be removed in v4.60 of Transformers. Please instantiate "
        "the config class directly with `MyConfig(text_config=text_config, audio_config=audio_config, **kwargs)` instead."
    )
    warnings.warn(deprecation_message, FutureWarning)

    # Serialize each sub-config so the target class re-hydrates them itself.
    text_dict = text_config.to_dict()
    audio_dict = audio_config.to_dict()
    return cls(text_config=text_dict, audio_config=audio_dict, **kwargs)


def get_configuration_file(configuration_files: list[str]) -> str:
"""
Expand Down
22 changes: 13 additions & 9 deletions src/transformers/models/aimv2/configuration_aimv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,21 +264,25 @@ class Aimv2Config(PretrainedConfig):
def __init__(
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
):
super().__init__(**kwargs)

self.projection_dim = projection_dim
self.logit_scale_init_value = logit_scale_init_value
self.max_logit_scale = 100.0
if text_config is None:
text_config = {}
text_config = Aimv2TextConfig()
logger.info("`text_config` is `None`. Initializing the `Aimv2TextConfig` with default values.")
elif isinstance(text_config, dict):
text_config = Aimv2TextConfig(**text_config)

if vision_config is None:
vision_config = {}
vision_config = Aimv2VisionConfig()
logger.info("`vision_config` is `None`. initializing the `Aimv2VisionConfig` with default values.")
elif isinstance(vision_config, dict):
vision_config = Aimv2VisionConfig(**vision_config)

self.text_config = Aimv2TextConfig(**text_config)
self.vision_config = Aimv2VisionConfig(**vision_config)
self.projection_dim = projection_dim
self.logit_scale_init_value = logit_scale_init_value
self.max_logit_scale = 100.0
self.text_config = text_config
self.vision_config = vision_config

super().__init__(**kwargs)


__all__ = ["Aimv2Config", "Aimv2VisionConfig", "Aimv2TextConfig"]
2 changes: 1 addition & 1 deletion src/transformers/models/aimv2/modular_aimv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,10 @@ class Aimv2Config(SiglipConfig):
def __init__(
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
):
super().__init__(text_config, vision_config, **kwargs)
self.projection_dim = projection_dim
self.logit_scale_init_value = logit_scale_init_value
self.max_logit_scale = 100.0
super().__init__(text_config, vision_config, **kwargs)

del self.initializer_factor

Expand Down
Loading