From 11322dbe5dd375d4181ab8aca03d0656e2bd4dae Mon Sep 17 00:00:00 2001 From: vasqu Date: Mon, 20 Oct 2025 16:48:15 +0200 Subject: [PATCH 1/9] fix --- src/transformers/models/clip/modeling_clip.py | 56 ++++++------------- 1 file changed, 17 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 3ed174ed11aa..86a5368ddb35 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -22,7 +22,7 @@ from torch import nn from ...activations import ACT2FN -from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask +from ...masking_utils import create_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel @@ -303,8 +303,8 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, + **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -317,15 +317,6 @@ def forward( queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) - # CLIP text model uses both `causal_attention_mask` and `attention_mask` - # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask` - if self.config._attn_implementation == "flash_attention_2": - self.is_causal = causal_attention_mask is not None - else: - if attention_mask is not None and causal_attention_mask is not None: - attention_mask = attention_mask + causal_attention_mask - elif causal_attention_mask is not None: - attention_mask = causal_attention_mask attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": @@ -337,10 +328,9 @@ def forward( keys, values, attention_mask, - is_causal=self.is_causal, scaling=self.scale, dropout=0.0 if not self.training else self.dropout, - output_attentions=output_attentions, + **kwargs, ) attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() @@ -379,8 +369,8 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, - causal_attention_mask: torch.Tensor, output_attentions: Optional[bool] = False, + **kwargs, ) -> tuple[torch.FloatTensor]: """ Args: @@ -398,8 +388,8 @@ def forward( hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -425,7 +415,7 @@ class CLIPPreTrainedModel(PreTrainedModel): _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True - _supports_attention_backend = True + _supports_attention_backend = False # kwargs are not supported throughout all modules def _init_weights(self, module): """Initialize the weights""" @@ -503,9 +493,9 @@ def forward( self, inputs_embeds, attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, 
+ **kwargs, ) -> BaseModelOutput: r""" Args: @@ -519,13 +509,6 @@ def forward( - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) - causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Causal mask for the text model. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under @@ -545,14 +528,15 @@ def forward( all_attentions = () if output_attentions else None hidden_states = inputs_embeds - for idx, encoder_layer in enumerate(self.layers): + for encoder_layer in self.layers: if output_hidden_states: encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( hidden_states, attention_mask, - causal_attention_mask, output_attentions=output_attentions, + **kwargs, ) hidden_states = layer_outputs[0] @@ -604,23 +588,20 @@ def forward( hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) - # CLIP's text model uses causal mask, prepare it here. - # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 - causal_attention_mask = _create_4d_causal_attention_mask( - input_shape, hidden_states.dtype, device=hidden_states.device + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + attention_mask=attention_mask, + cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device), + past_key_values=None, ) - # expand attention_mask - if attention_mask is not None and self.config._attn_implementation != "flash_attention_2": - # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] - attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) - encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + is_causal=True, ) last_hidden_state = encoder_outputs.last_hidden_state @@ -666,7 +647,6 @@ class CLIPTextModel(CLIPPreTrainedModel): input_modalities = "text" _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] - _supports_flash_attn = False # mask creation only accounts for sdpa/eager def __init__(self, config: CLIPTextConfig): super().__init__(config) @@ -825,7 +805,6 @@ def forward( class CLIPModel(CLIPPreTrainedModel): config: CLIPConfig _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"] - _supports_flash_attn = False # mask creation only accounts for sdpa/eager def __init__(self, config: CLIPConfig): super().__init__(config) @@ -1034,7 +1013,6 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel): config: CLIPTextConfig input_modalities = "text" - _supports_flash_attn = False _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] def __init__(self, config: CLIPTextConfig): From 764e63fa819d02d4e570e2529a36fd0050121114 Mon Sep 17 00:00:00 2001 From: vasqu Date: Mon, 20 Oct 2025 18:47:33 +0200 Subject: [PATCH 2/9] make kwargs fully passed and adjust with outputs xxx --- src/transformers/models/clip/modeling_clip.py | 177 ++++++------------ 1 file changed, 55 insertions(+), 122 deletions(-) diff --git 
a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 86a5368ddb35..2e07370b9e25 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -26,7 +26,17 @@ from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...utils import ModelOutput, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs, logging, torch_int +from ...processing_utils import Unpack +from ...utils import ( + ModelOutput, + TransformersKwargs, + auto_docstring, + can_return_tuple, + filter_out_non_signature_kwargs, + logging, + torch_int, +) +from ...utils.generic import check_model_inputs from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig @@ -260,8 +270,7 @@ def eager_attention_forward( attention_mask: Optional[torch.Tensor], scaling: float, dropout: float = 0.0, - output_attentions: bool = True, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ): attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling if attention_mask is not None: @@ -271,8 +280,6 @@ def eager_attention_forward( attn_output = torch.matmul(attn_weights, value) attn_output = attn_output.transpose(1, 2).contiguous() - if not output_attentions: - attn_weights = None return attn_output, attn_weights @@ -303,8 +310,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -333,11 +339,9 @@ def forward( **kwargs, ) - attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() + attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None return attn_output, attn_weights @@ -369,8 +373,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.FloatTensor]: """ Args: @@ -378,17 +381,13 @@ def forward( attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
""" residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( + hidden_states, _ = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, - output_attentions=output_attentions, **kwargs, ) hidden_states = residual + hidden_states @@ -398,12 +397,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states @auto_docstring @@ -412,10 +406,15 @@ class CLIPPreTrainedModel(PreTrainedModel): base_model_prefix = "clip" input_modalities = ["image", "text"] supports_gradient_checkpointing = True + accepts_loss_kwargs = False _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True - _supports_attention_backend = False # kwargs are not supported throughout all modules + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": CLIPEncoderLayer, + "attentions": CLIPAttention, + } def _init_weights(self, module): """Initialize the weights""" @@ -487,15 +486,12 @@ def __init__(self, config: CLIPConfig): super().__init__() self.config = config self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False def forward( self, inputs_embeds, attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutput: r""" Args: @@ -510,55 +506,27 @@ def forward( - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds for encoder_layer in self.layers: - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - layer_outputs = encoder_layer( + hidden_states = encoder_layer( hidden_states, attention_mask, - output_attentions=output_attentions, **kwargs, ) - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - return BaseModelOutput( last_hidden_state=hidden_states, - hidden_states=encoder_states, - attentions=all_attentions, ) -class CLIPTextTransformer(nn.Module): +class CLIPTextTransformer(CLIPPreTrainedModel): def __init__(self, config: CLIPTextConfig): - super().__init__() + super().__init__(config) self.config = config embed_dim = config.hidden_size + self.gradient_checkpointing = False + self.embeddings = CLIPTextEmbeddings(config) self.encoder = CLIPEncoder(config) self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @@ -566,20 +534,15 @@ def __init__(self, config: CLIPTextConfig): # For `pooled_output` computation self.eos_token_id = config.eos_token_id + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - if input_ids is None: raise ValueError("You have to specify input_ids") @@ -596,12 +559,12 @@ def forward( past_key_values=None, ) + kwargs.pop("is_causal", None) encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, is_causal=True, + **kwargs, ) last_hidden_state = encoder_outputs.last_hidden_state @@ -632,8 +595,6 @@ def forward( return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) @@ -667,8 +628,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: r""" Examples: @@ -690,35 +650,30 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) -class CLIPVisionTransformer(nn.Module): +class CLIPVisionTransformer(CLIPPreTrainedModel): def __init__(self, config: CLIPVisionConfig): - super().__init__() + 
super().__init__(config) self.config = config embed_dim = config.hidden_size + self.gradient_checkpointing = False self.embeddings = CLIPVisionEmbeddings(config) self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.encoder = CLIPEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = False, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -727,8 +682,7 @@ def forward( encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) last_hidden_state = encoder_outputs.last_hidden_state @@ -738,8 +692,6 @@ def forward( return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) @@ -768,9 +720,8 @@ def get_input_embeddings(self) -> nn.Module: def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: r""" Example: @@ -795,9 +746,8 @@ def forward( return self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, ) @@ -926,9 +876,8 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, return_loss: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], ) -> CLIPOutput: r""" return_loss (`bool`, *optional*): @@ -956,25 +905,17 @@ def forward( >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" - # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, ) text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) image_embeds = vision_outputs.pooler_output @@ -1039,8 +980,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> CLIPTextModelOutput: r""" Examples: @@ -1063,8 +1003,8 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + return_dict=True, + **kwargs, ) pooled_output = text_outputs.pooler_output text_embeds = self.text_projection(pooled_output) @@ -1102,9 +1042,8 @@ def get_input_embeddings(self) -> nn.Module: def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], ) -> CLIPVisionModelOutput: r""" Examples: @@ -1129,9 +1068,9 @@ def forward( vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=True, + **kwargs, ) pooled_output = vision_outputs.pooler_output image_embeds = self.visual_projection(pooled_output) @@ -1175,8 +1114,7 @@ def forward( self, pixel_values: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> ImageClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1184,15 +1122,10 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + return_dict=True, + **kwargs, ) sequence_output = outputs.last_hidden_state From 9132b3b47ed448d2cafdcb21b5bef2517837822d Mon Sep 17 00:00:00 2001 From: vasqu Date: Mon, 20 Oct 2025 19:09:47 +0200 Subject: [PATCH 3/9] propogate metaclip 2 --- .../models/metaclip_2/modeling_metaclip_2.py | 272 ++++++------------ .../models/metaclip_2/modular_metaclip_2.py | 82 +++--- 2 files changed, 120 insertions(+), 234 deletions(-) diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index c01b97d70bf6..ba9dd9ac30a2 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -12,7 +12,7 @@ from torch import nn from ...activations import ACT2FN -from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask +from ...masking_utils import create_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel @@ -160,8 +160,7 @@ def eager_attention_forward( attention_mask: Optional[torch.Tensor], scaling: float, dropout: float = 0.0, - output_attentions: bool = True, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ): attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling if attention_mask is not None: @@ -171,8 +170,6 @@ def eager_attention_forward( attn_output = torch.matmul(attn_weights, value) attn_output = attn_output.transpose(1, 2).contiguous() - if not output_attentions: - attn_weights = None return attn_output, attn_weights @@ -203,8 +200,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -217,15 +213,6 @@ def forward( queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) - # METACLIP_2 text model uses both `causal_attention_mask` and `attention_mask` - # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask` - if self.config._attn_implementation == "flash_attention_2": - self.is_causal = causal_attention_mask is not None - else: - if attention_mask is not None and causal_attention_mask is not None: - attention_mask = attention_mask + causal_attention_mask - elif causal_attention_mask is not None: - attention_mask = causal_attention_mask attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": @@ -237,17 +224,14 @@ def forward( keys, values, attention_mask, - is_causal=self.is_causal, scaling=self.scale, dropout=0.0 if not self.training else self.dropout, - 
output_attentions=output_attentions, + **kwargs, ) - attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() + attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None return attn_output, attn_weights @@ -266,16 +250,61 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states +class MetaClip2EncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Union[MetaClip2VisionConfig, MetaClip2TextConfig]): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = MetaClip2Attention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = MetaClip2MLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + **kwargs, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + @auto_docstring class MetaClip2PreTrainedModel(PreTrainedModel): config: MetaClip2Config base_model_prefix = "metaclip_2" input_modalities = ["image", "text"] supports_gradient_checkpointing = True + accepts_loss_kwargs = False _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": MetaClip2EncoderLayer, + "attentions": MetaClip2Attention, + } def _init_weights(self, module): """Initialize the weights""" @@ -334,56 +363,6 @@ def _init_weights(self, module): module.bias.data.zero_() -class MetaClip2EncoderLayer(GradientCheckpointingLayer): - def __init__(self, config: Union[MetaClip2VisionConfig, MetaClip2TextConfig]): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = MetaClip2Attention(config) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = MetaClip2MLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - causal_attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, - ) -> tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- """ - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, - ) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - class MetaClip2Encoder(nn.Module): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a @@ -397,15 +376,12 @@ def __init__(self, config: MetaClip2Config): super().__init__() self.config = config self.layers = nn.ModuleList([MetaClip2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False def forward( self, inputs_embeds, attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutput: r""" Args: @@ -420,61 +396,27 @@ def forward( - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Causal mask for the text model. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - layer_outputs = encoder_layer( + for encoder_layer in self.layers: + hidden_states = encoder_layer( hidden_states, attention_mask, - causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - return BaseModelOutput( last_hidden_state=hidden_states, - hidden_states=encoder_states, - attentions=all_attentions, ) -class MetaClip2TextTransformer(nn.Module): +class MetaClip2TextTransformer(MetaClip2PreTrainedModel): def __init__(self, config: MetaClip2TextConfig): - super().__init__() + super().__init__(config) self.config = config embed_dim = config.hidden_size + self.gradient_checkpointing = False + self.embeddings = MetaClip2TextEmbeddings(config) self.encoder = MetaClip2Encoder(config) self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @@ -489,7 +431,6 @@ def forward( input_ids, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: input_shape = input_ids.size() @@ -497,21 +438,19 @@ def forward( hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) - # CLIP's text model uses causal mask, prepare it here. 
- # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 - causal_attention_mask = _create_4d_causal_attention_mask( - input_shape, hidden_states.dtype, device=hidden_states.device + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + attention_mask=attention_mask, + cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device), + past_key_values=None, ) - # expand attention_mask - if attention_mask is not None and self.config._attn_implementation != "flash_attention_2": - # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] - attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) - + kwargs.pop("is_causal", None) encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, + is_causal=True, **kwargs, ) @@ -572,7 +511,6 @@ class MetaClip2TextModel(MetaClip2PreTrainedModel): input_modalities = "text" _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] - _supports_flash_attn = False # mask creation only accounts for sdpa/eager def __init__(self, config: MetaClip2TextConfig): super().__init__(config) @@ -593,8 +531,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: r""" Examples: @@ -616,8 +553,7 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) @@ -674,7 +610,6 @@ class MetaClip2TextModelWithProjection(MetaClip2PreTrainedModel): config: MetaClip2TextConfig input_modalities = "text" - _supports_flash_attn = False _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] def __init__(self, config: MetaClip2TextConfig): @@ -701,8 +636,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> MetaClip2TextModelOutput: r""" Examples: @@ -723,8 +657,8 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + return_dict=True, + **kwargs, ) pooled_output = text_outputs.pooler_output text_embeds = self.text_projection(pooled_output) @@ -837,7 +771,6 @@ class MetaClip2Model(MetaClip2PreTrainedModel): config: MetaClip2Config _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer", "MetaClip2VisionEmbeddings"] - _supports_flash_attn = False # mask creation only accounts for sdpa/eager def __init__(self, config: MetaClip2Config): super().__init__(config) @@ -881,8 +814,6 @@ def get_text_features( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -915,8 +846,6 @@ def get_text_features( def get_image_features( self, pixel_values: Optional[torch.FloatTensor] = 
None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, ) -> torch.FloatTensor: r""" @@ -959,9 +888,8 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, return_loss: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], ) -> MetaClip2Output: r""" return_loss (`bool`, *optional*): @@ -988,25 +916,17 @@ def forward( >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" - # Use METACLIP_2 model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, ) text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) image_embeds = vision_outputs.pooler_output @@ -1040,30 +960,26 @@ def forward( ) -class MetaClip2VisionTransformer(nn.Module): +class MetaClip2VisionTransformer(MetaClip2PreTrainedModel): def __init__(self, config: MetaClip2VisionConfig): - super().__init__() + super().__init__(config) self.config = config embed_dim = config.hidden_size + self.gradient_checkpointing = False self.embeddings = MetaClip2VisionEmbeddings(config) self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.encoder = MetaClip2Encoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = False, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1072,8 +988,7 @@ def forward( encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) last_hidden_state = encoder_outputs.last_hidden_state @@ -1083,8 +998,6 @@ def forward( return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) @@ -1149,9 +1062,8 @@ def get_input_embeddings(self) -> nn.Module: def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: 
Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1176,9 +1088,8 @@ def forward( return self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, ) @@ -1260,9 +1171,8 @@ def get_input_embeddings(self) -> nn.Module: def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], ) -> MetaClip2VisionModelOutput: r""" Examples: @@ -1286,9 +1196,9 @@ def forward( vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=True, + **kwargs, ) pooled_output = vision_outputs.pooler_output image_embeds = self.visual_projection(pooled_output) @@ -1332,8 +1242,7 @@ def forward( self, pixel_values: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> ImageClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1341,15 +1250,10 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + return_dict=True, + **kwargs, ) sequence_output = outputs.last_hidden_state diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index 6b80e73ef8a8..790f81e2a1fa 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -3,11 +3,10 @@ import torch from torch import nn -from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask +from ...masking_utils import create_causal_mask from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling -from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging from ...utils.generic import check_model_inputs from ..clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig from ..clip.modeling_clip import ( @@ -15,6 +14,7 @@ CLIPAttention, CLIPForImageClassification, CLIPModel, + CLIPPreTrainedModel, CLIPTextEmbeddings, CLIPTextModel, CLIPTextModelWithProjection, @@ -214,15 +214,8 @@ class MetaClip2MLP(CLIPMLP): @auto_docstring -class MetaClip2PreTrainedModel(PreTrainedModel): - config: MetaClip2Config +class 
MetaClip2PreTrainedModel(CLIPPreTrainedModel): base_model_prefix = "metaclip_2" - input_modalities = ["image", "text"] - supports_gradient_checkpointing = True - _supports_sdpa = True - _supports_flash_attn = True - _supports_flex_attn = True - _supports_attention_backend = True def _init_weights(self, module): """Initialize the weights""" @@ -289,7 +282,6 @@ def forward( input_ids, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: input_shape = input_ids.size() @@ -297,21 +289,19 @@ def forward( hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) - # CLIP's text model uses causal mask, prepare it here. - # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 - causal_attention_mask = _create_4d_causal_attention_mask( - input_shape, hidden_states.dtype, device=hidden_states.device + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + attention_mask=attention_mask, + cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device), + past_key_values=None, ) - # expand attention_mask - if attention_mask is not None and self.config._attn_implementation != "flash_attention_2": - # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] - attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) - + kwargs.pop("is_causal", None) encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, + is_causal=True, **kwargs, ) @@ -369,13 +359,14 @@ def __init__(self, config: MetaClip2TextConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple + @auto_docstring def forward( self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ): r""" Examples: @@ -396,8 +387,7 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) @@ -443,13 +433,14 @@ def __init__(self, config: MetaClip2TextConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple + @auto_docstring def forward( self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ): r""" Examples: @@ -469,8 +460,7 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) @@ -534,6 +524,8 @@ def __init__(self, config: MetaClip2Config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple + @auto_docstring def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -541,9 +533,8 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, return_loss: Optional[bool] = None, - output_attentions: Optional[bool] = None, - 
output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], ): r""" return_loss (`bool`, *optional*): @@ -576,9 +567,8 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, return_loss=return_loss, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, ) def get_text_features( @@ -586,8 +576,6 @@ def get_text_features( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, ): r""" Returns: @@ -609,15 +597,11 @@ def get_text_features( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, ) def get_image_features( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, ): r""" @@ -644,8 +628,6 @@ def get_image_features( ```""" return super().get_image_features( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, ) @@ -687,12 +669,13 @@ class MetaClip2VisionModel(CLIPVisionModel): >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" + @can_return_tuple + @auto_docstring def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], ): r""" Examples: @@ -716,9 +699,8 @@ def forward( ```""" return super().forward( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, ) @@ -758,12 +740,13 @@ class MetaClip2VisionModelWithProjection(CLIPVisionModelWithProjection): >>> image_embeds = outputs.image_embeds ```""" + @can_return_tuple + @auto_docstring def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], ): r""" Examples: @@ -786,9 +769,8 @@ def forward( ```""" return super().forward( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, ) From d025c56d0e0355fa0c09ea5bfc3c518a371534f5 Mon Sep 17 00:00:00 2001 From: vasqu Date: Mon, 20 Oct 2025 19:46:52 +0200 Subject: [PATCH 4/9] propagate mlcd and fix test --- src/transformers/models/mlcd/modeling_mlcd.py | 192 ++++++------------ src/transformers/models/mlcd/modular_mlcd.py | 187 ++++++----------- tests/models/mlcd/test_modeling_mlcd.py | 2 +- 3 files changed, 133 insertions(+), 248 deletions(-) diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index 76414a21225f..919fa27cd61a 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -25,12 +25,12 @@ import torch.nn as nn from ...activations import ACT2FN -from ...modeling_flash_attention_utils
import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, torch_int +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_int +from ...utils.generic import check_model_inputs from .configuration_mlcd import MLCDVisionConfig @@ -259,7 +259,7 @@ def forward( hidden_states: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, - **kwargs: Unpack[FlashAttentionKwargs], + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" batch_size, seq_length = hidden_states.shape[:-1] @@ -316,7 +316,7 @@ def forward( hidden_states: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.FloatTensor]: """ Args: @@ -328,18 +328,15 @@ def forward( Represents absolute positional embeddings for the query and key in the attention mechanism. attention_mask (`torch.FloatTensor`): Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. """ residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( + hidden_states, _ = self.self_attn( hidden_states=hidden_states, position_embeddings=position_embeddings, attention_mask=attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -348,12 +345,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states class MLCDEncoder(nn.Module): @@ -370,16 +362,13 @@ def __init__(self, config: MLCDVisionConfig): super().__init__() self.config = config self.layers = nn.ModuleList([MLCDEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False def forward( self, inputs_embeds: torch.FloatTensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple, BaseModelOutput]: r""" Args: @@ -395,58 +384,74 @@ def forward( - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. 
- return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ - - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - layer_outputs = encoder_layer( + for encoder_layer in self.layers: + hidden_states = encoder_layer( hidden_states=hidden_states, position_embeddings=position_embeddings, attention_mask=attention_mask, - output_attentions=output_attentions, + **kwargs, ) - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, - hidden_states=encoder_states, - attentions=all_attentions, ) -class MLCDVisionTransformer(nn.Module): +@auto_docstring +class MLCDPreTrainedModel(PreTrainedModel): + config: MLCDVisionConfig + base_model_prefix = "mlcd" + supports_gradient_checkpointing = True + accepts_loss_kwargs = False + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": MLCDEncoderLayer, + "attentions": MLCDAttention, + } + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, MLCDVisionEmbeddings): + factor = self.config.initializer_factor + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, MLCDAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, MLCDMLP): + factor = self.config.initializer_factor + in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, MLCDVisionTransformer): + factor = self.config.initializer_factor + pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor + nn.init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class 
MLCDVisionTransformer(MLCDPreTrainedModel): def __init__(self, config: MLCDVisionConfig): - super().__init__() + super().__init__(config) self.config = config embed_dim = config.hidden_size + self.gradient_checkpointing = False self.embeddings = MLCDVisionEmbeddings(config) self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @@ -455,20 +460,13 @@ def __init__(self, config: MLCDVisionConfig): self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2) self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2)) + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple, BaseModelOutputWithPooling]: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -486,66 +484,19 @@ def forward( encoder_outputs = self.encoder( inputs_embeds=hidden_states, position_embeddings=position_embeddings, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) last_hidden_state = encoder_outputs[0] pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) -@auto_docstring -class MLCDPreTrainedModel(PreTrainedModel): - config: MLCDVisionConfig - base_model_prefix = "mlcd" - supports_gradient_checkpointing = True - _supports_flash_attn = True - _supports_sdpa = True - - def _init_weights(self, module): - """Initialize the weights""" - factor = self.config.initializer_factor - if isinstance(module, MLCDVisionEmbeddings): - factor = self.config.initializer_factor - nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) - nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) - elif isinstance(module, MLCDAttention): - factor = self.config.initializer_factor - in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - out_proj_std = (module.embed_dim**-0.5) * factor - nn.init.normal_(module.q_proj.weight, std=in_proj_std) - nn.init.normal_(module.k_proj.weight, std=in_proj_std) - nn.init.normal_(module.v_proj.weight, std=in_proj_std) - nn.init.normal_(module.out_proj.weight, std=out_proj_std) - elif isinstance(module, MLCDMLP): - factor = self.config.initializer_factor - in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - fc_std = (2 * module.config.hidden_size) ** -0.5 * factor - nn.init.normal_(module.fc1.weight, std=fc_std) - nn.init.normal_(module.fc2.weight, std=in_proj_std) - elif isinstance(module, MLCDVisionTransformer): - factor = 
self.config.initializer_factor - pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor - nn.init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - elif isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - @auto_docstring( custom_intro=""" The vision model from M_L_C_D without any head or projection on top. @@ -566,13 +517,12 @@ def __init__(self, config: MLCDVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple @auto_docstring def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple, BaseModelOutputWithPooling]: r""" Example: @@ -596,17 +546,9 @@ def forward( >>> print(f"Number of attention layers: {len(outputs.attentions)}") >>> print(f"Attention shape: {outputs.attentions[0].shape}") ```""" - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - return self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py index ce85a149b594..2d94e37585d7 100644 --- a/src/transformers/models/mlcd/modular_mlcd.py +++ b/src/transformers/models/mlcd/modular_mlcd.py @@ -19,11 +19,11 @@ import torch.nn as nn from ...configuration_utils import PreTrainedConfig -from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import auto_docstring, logging +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.generic import check_model_inputs from ..clip.modeling_clip import ( CLIPMLP, CLIPAttention, @@ -206,7 +206,7 @@ def forward( hidden_states: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, - **kwargs: Unpack[FlashAttentionKwargs], + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: batch_size, seq_length = hidden_states.shape[:-1] @@ -258,7 +258,7 @@ def forward( hidden_states: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.FloatTensor]: """ Args: @@ -270,18 +270,15 @@ def forward( Represents absolute positional embeddings for the query and key in the attention mechanism. attention_mask (`torch.FloatTensor`): Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. 
- output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. """ residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( + hidden_states, _ = self.self_attn( hidden_states=hidden_states, position_embeddings=position_embeddings, attention_mask=attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -290,12 +287,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states class MLCDEncoder(CLIPEncoder): @@ -316,9 +308,7 @@ def forward( inputs_embeds: torch.FloatTensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple, BaseModelOutput]: r""" Args: @@ -334,107 +324,18 @@ def forward( - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - layer_outputs = encoder_layer( + for encoder_layer in self.layers: + hidden_states = encoder_layer( hidden_states=hidden_states, position_embeddings=position_embeddings, attention_mask=attention_mask, - output_attentions=output_attentions, + **kwargs, ) - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, - hidden_states=encoder_states, - attentions=all_attentions, - ) - - -class MLCDVisionTransformer(CLIPVisionTransformer): - def __init__(self, config: MLCDVisionConfig): - super().__init__(config) - self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2) - self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2)) - - @auto_docstring - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[tuple, BaseModelOutputWithPooling]: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - num_patches_height = pixel_values.shape[-2] // self.config.patch_size - num_patches_width = pixel_values.shape[-1] // self.config.patch_size - rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width) - rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device) - rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0) - emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) - position_embeddings = (emb.cos(), emb.sin()) - - hidden_states = self.embeddings(pixel_values) - hidden_states = self.pre_layrnorm(hidden_states) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - position_embeddings=position_embeddings, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - pooled_output = last_hidden_state[:, 0, :] - pooled_output = self.post_layernorm(pooled_output) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - 
attentions=encoder_outputs.attentions, ) @@ -443,8 +344,15 @@ class MLCDPreTrainedModel(PreTrainedModel): config: MLCDVisionConfig base_model_prefix = "mlcd" supports_gradient_checkpointing = True + accepts_loss_kwargs = False _supports_flash_attn = True _supports_sdpa = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": MLCDEncoderLayer, + "attentions": MLCDAttention, + } def _init_weights(self, module): """Initialize the weights""" @@ -478,14 +386,57 @@ def _init_weights(self, module): module.bias.data.zero_() +class MLCDVisionTransformer(CLIPVisionTransformer): + def __init__(self, config: MLCDVisionConfig): + super().__init__(config) + self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2) + self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2)) + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BaseModelOutputWithPooling]: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + num_patches_height = pixel_values.shape[-2] // self.config.patch_size + num_patches_width = pixel_values.shape[-1] // self.config.patch_size + rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width) + rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device) + rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0) + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + position_embeddings = (emb.cos(), emb.sin()) + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + position_embeddings=position_embeddings, + **kwargs, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + ) + + class MLCDVisionModel(CLIPVisionModel): + + @can_return_tuple @auto_docstring def forward( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple, BaseModelOutputWithPooling]: r""" Example: @@ -509,17 +460,9 @@ def forward( >>> print(f"Number of attention layers: {len(outputs.attentions)}") >>> print(f"Attention shape: {outputs.attentions[0].shape}") ```""" - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - return self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) diff --git a/tests/models/mlcd/test_modeling_mlcd.py b/tests/models/mlcd/test_modeling_mlcd.py index c88ab4f3aa8d..c3c9109ed5e1 100644 --- a/tests/models/mlcd/test_modeling_mlcd.py +++ b/tests/models/mlcd/test_modeling_mlcd.py @@ -146,7 +146,7 @@ class 
MLCDVisionModelIntegrationTest(unittest.TestCase): @slow def test_inference(self): model_name = "DeepGlint-AI/mlcd-vit-bigG-patch14-448" - model = MLCDVisionModel.from_pretrained(model_name).to(torch_device) + model = MLCDVisionModel.from_pretrained(model_name, attn_implementation="eager").to(torch_device) processor = AutoProcessor.from_pretrained(model_name) # process single image From 47bb91e2523aeca23c1c7d643ed35eda96277c9b Mon Sep 17 00:00:00 2001 From: vasqu Date: Mon, 20 Oct 2025 19:48:23 +0200 Subject: [PATCH 5/9] style --- src/transformers/models/mlcd/modular_mlcd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py index 2d94e37585d7..56b15a0de609 100644 --- a/src/transformers/models/mlcd/modular_mlcd.py +++ b/src/transformers/models/mlcd/modular_mlcd.py @@ -430,7 +430,6 @@ def forward( class MLCDVisionModel(CLIPVisionModel): - @can_return_tuple @auto_docstring def forward( From 94d916f8032255b3bb2b545938dbdb174a44937f Mon Sep 17 00:00:00 2001 From: vasqu Date: Mon, 20 Oct 2025 20:14:24 +0200 Subject: [PATCH 6/9] fix repo consistency, need to add ignore rules as those are building blocks --- .../models/metaclip_2/modeling_metaclip_2.py | 2 ++ .../models/metaclip_2/modular_metaclip_2.py | 2 ++ src/transformers/models/mlcd/modeling_mlcd.py | 2 +- src/transformers/models/mlcd/modular_mlcd.py | 1 + utils/check_repo.py | 10 ++++++++++ 5 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index ba9dd9ac30a2..d0db71812e21 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -1280,7 +1280,9 @@ def forward( "MetaClip2PreTrainedModel", "MetaClip2TextModel", "MetaClip2TextModelWithProjection", + "MetaClip2TextTransformer", "MetaClip2VisionModel", "MetaClip2VisionModelWithProjection", + "MetaClip2VisionTransformer", "MetaClip2ForImageClassification", ] diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index 790f81e2a1fa..2141ad80e672 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -786,7 +786,9 @@ class MetaClip2ForImageClassification(CLIPForImageClassification): "MetaClip2PreTrainedModel", "MetaClip2TextModel", "MetaClip2TextModelWithProjection", + "MetaClip2TextTransformer", "MetaClip2VisionModel", "MetaClip2VisionModelWithProjection", + "MetaClip2VisionTransformer", # noqa: F822 "MetaClip2ForImageClassification", ] diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index 919fa27cd61a..09423413e751 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -552,4 +552,4 @@ def forward( ) -__all__ = ["MLCDPreTrainedModel", "MLCDVisionModel"] +__all__ = ["MLCDPreTrainedModel", "MLCDVisionTransformer", "MLCDVisionModel"] diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py index 56b15a0de609..6dd065c16373 100644 --- a/src/transformers/models/mlcd/modular_mlcd.py +++ b/src/transformers/models/mlcd/modular_mlcd.py @@ -468,5 +468,6 @@ def forward( __all__ = [ "MLCDVisionConfig", "MLCDPreTrainedModel", + "MLCDVisionTransformer", "MLCDVisionModel", ] diff --git 
a/utils/check_repo.py b/utils/check_repo.py index 17f18659bd08..e6ee1dbdb77d 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -191,6 +191,11 @@ "BltLocalDecoder", # Building part of bigger (tested) model. Tested implicitly through BLTForCausalLM. "BltGlobalTransformer", # Building part of bigger (tested) model. Tested implicitly through BLTForCausalLM. "Florence2VisionBackbone", # Building part of bigger (tested) model. Tested implicitly through Florence2ForConditionalGeneration. + "CLIPTextTransformer", # Building part of bigger (tested) model. Tested implicitly through CLIPTextModel. + "CLIPVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through CLIPVisionModel. + "MetaClip2TextTransformer", # Building part of bigger (tested) model. Tested implicitly through MetaClip2TextModel. + "MetaClip2VisionTransformer", # Building part of bigger (tested) model. Tested implicitly through MetaClip2VisionModel. + "MLCDVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through MLCDVisionModel. ] ) @@ -393,6 +398,11 @@ "Qwen3OmniMoeTalkerModel", # Building part of a bigger model "Qwen3OmniMoeThinkerForConditionalGeneration", # Building part of a bigger model "Qwen3OmniMoeThinkerTextModel", # Building part of a bigger model + "CLIPTextTransformer", # Building part of bigger (tested) model. Tested implicitly through CLIPTextModel. + "CLIPVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through CLIPVisionModel. + "MetaClip2TextTransformer", # Building part of bigger (tested) model. Tested implicitly through MetaClip2TextModel. + "MetaClip2VisionTransformer", # Building part of bigger (tested) model. Tested implicitly through MetaClip2VisionModel. + "MLCDVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through MLCDVisionModel. 
] From 4edb4e260f642ceb51195ed25cb626eed5da8f91 Mon Sep 17 00:00:00 2001 From: vasqu Date: Mon, 20 Oct 2025 20:16:37 +0200 Subject: [PATCH 7/9] style --- docs/source/en/model_doc/clip.md | 10 + docs/source/en/model_doc/metaclip_2.md | 10 + docs/source/en/model_doc/mlcd.md | 5 + src/transformers/models/clip/modeling_clip.py | 2 + utils/check_repo.py | 371 +++++++++--------- 5 files changed, 214 insertions(+), 184 deletions(-) diff --git a/docs/source/en/model_doc/clip.md b/docs/source/en/model_doc/clip.md index 529194d32a37..1d26e91ddb28 100644 --- a/docs/source/en/model_doc/clip.md +++ b/docs/source/en/model_doc/clip.md @@ -129,6 +129,11 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_ - get_text_features - get_image_features +## CLIPTextTransformer + +[[autodoc]] CLIPTextTransformer + - forward + ## CLIPTextModel [[autodoc]] CLIPTextModel @@ -144,6 +149,11 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_ [[autodoc]] CLIPVisionModelWithProjection - forward +## CLIPVisionTransformer + +[[autodoc]] CLIPVisionTransformer + - forward + ## CLIPVisionModel [[autodoc]] CLIPVisionModel diff --git a/docs/source/en/model_doc/metaclip_2.md b/docs/source/en/model_doc/metaclip_2.md index ce17459b8d85..d27b828ee94b 100644 --- a/docs/source/en/model_doc/metaclip_2.md +++ b/docs/source/en/model_doc/metaclip_2.md @@ -104,6 +104,11 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_ - get_text_features - get_image_features +## MetaClip2TextTransformer + +[[autodoc]] MetaClip2TextTransformer + - forward + ## MetaClip2TextModel [[autodoc]] MetaClip2TextModel @@ -119,6 +124,11 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_ [[autodoc]] MetaClip2VisionModelWithProjection - forward +## MetaClip2VisionTransformer + +[[autodoc]] MetaClip2VisionTransformer + - forward + ## MetaClip2VisionModel [[autodoc]] MetaClip2VisionModel diff --git a/docs/source/en/model_doc/mlcd.md b/docs/source/en/model_doc/mlcd.md index 7ff2fb434da0..4822fd7df01e 100644 --- a/docs/source/en/model_doc/mlcd.md +++ b/docs/source/en/model_doc/mlcd.md @@ -77,5 +77,10 @@ print(f"Extracted features shape: {features.shape}") ## MLCDVisionModel +[[autodoc]] MLCDVisionTransformer + - forward + +## MLCDVisionModel + [[autodoc]] MLCDVisionModel - forward diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 2e07370b9e25..e8a3ff3e125b 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -1151,8 +1151,10 @@ def forward( "CLIPModel", "CLIPPreTrainedModel", "CLIPTextModel", + "CLIPTextTransformer", "CLIPTextModelWithProjection", "CLIPVisionModel", + "CLIPVisionTransformer", "CLIPVisionModelWithProjection", "CLIPForImageClassification", ] diff --git a/utils/check_repo.py b/utils/check_repo.py index e6ee1dbdb77d..e641daeea319 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -220,190 +220,193 @@ # Update this list for models that are not in any of the auto MODEL_XXX_MAPPING. Being in this list is an exception and # should **not** be the rule. 
-IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ - # models to ignore for model xxx mapping - "Aimv2TextModel", - "AlignTextModel", - "AlignVisionModel", - "ClapTextModel", - "ClapTextModelWithProjection", - "ClapAudioModel", - "ClapAudioModelWithProjection", - "Blip2TextModelWithProjection", - "Blip2VisionModelWithProjection", - "Blip2VisionModel", - "ErnieMForInformationExtraction", - "FastSpeech2ConformerHifiGan", - "FastSpeech2ConformerWithHifiGan", - "GitVisionModel", - "GraphormerModel", - "GraphormerForGraphClassification", - "BlipForImageTextRetrieval", - "BlipForQuestionAnswering", - "BlipVisionModel", - "BlipTextLMHeadModel", - "BlipTextModel", - "BrosSpadeEEForTokenClassification", - "BrosSpadeELForTokenClassification", - "Swin2SRForImageSuperResolution", - "BridgeTowerForImageAndTextRetrieval", - "BridgeTowerForMaskedLM", - "BridgeTowerForContrastiveLearning", - "CLIPSegForImageSegmentation", - "CLIPSegVisionModel", - "CLIPSegTextModel", - "EsmForProteinFolding", - "GPTSanJapaneseModel", - "TimeSeriesTransformerForPrediction", - "InformerForPrediction", - "AutoformerForPrediction", - "PatchTSTForPretraining", - "PatchTSTForPrediction", - "JukeboxVQVAE", - "JukeboxPrior", - "SamModel", - "Sam2Model", - "Sam2VideoModel", - "EdgeTamModel", - "EdgeTamVideoModel", - "SamHQModel", - "DPTForDepthEstimation", - "DecisionTransformerGPT2Model", - "GLPNForDepthEstimation", - "ViltForImagesAndTextClassification", - "ViltForImageAndTextRetrieval", - "ViltForTokenClassification", - "ViltForMaskedLM", - "PerceiverForMultimodalAutoencoding", - "PerceiverForOpticalFlow", - "SegformerDecodeHead", - "BeitForMaskedImageModeling", - "ChineseCLIPTextModel", - "ChineseCLIPVisionModel", - "CLIPTextModelWithProjection", - "CLIPVisionModelWithProjection", - "ClvpForCausalLM", - "ClvpModel", - "GroupViTTextModel", - "GroupViTVisionModel", - "DetrForSegmentation", - "Pix2StructVisionModel", - "Pix2StructTextModel", - "ConditionalDetrForSegmentation", - "DPRReader", - "FlaubertForQuestionAnswering", - "FlavaImageCodebook", - "FlavaTextModel", - "FlavaImageModel", - "FlavaMultimodalModel", - "GPT2DoubleHeadsModel", - "GPTSw3DoubleHeadsModel", - "InstructBlipVisionModel", - "InstructBlipQFormerModel", - "InstructBlipVideoVisionModel", - "InstructBlipVideoQFormerModel", - "LayoutLMForQuestionAnswering", - "LukeForMaskedLM", - "LukeForEntityClassification", - "LukeForEntityPairClassification", - "LukeForEntitySpanClassification", - "MgpstrModel", - "OpenAIGPTDoubleHeadsModel", - "OwlViTTextModel", - "OwlViTVisionModel", - "Owlv2TextModel", - "Owlv2VisionModel", - "OwlViTForObjectDetection", - "PatchTSMixerForPrediction", - "PatchTSMixerForPretraining", - "RagModel", - "RagSequenceForGeneration", - "RagTokenForGeneration", - "RealmEmbedder", - "RealmForOpenQA", - "RealmScorer", - "RealmReader", - "Wav2Vec2ForCTC", - "HubertForCTC", - "SEWForCTC", - "SEWDForCTC", - "XLMForQuestionAnswering", - "XLNetForQuestionAnswering", - "SeparableConv1D", - "VisualBertForRegionToPhraseAlignment", - "VisualBertForVisualReasoning", - "VisualBertForQuestionAnswering", - "VisualBertForMultipleChoice", - "XCLIPVisionModel", - "XCLIPTextModel", - "AltCLIPTextModel", - "AltCLIPVisionModel", - "AltRobertaModel", - "TvltForAudioVisualClassification", - "BarkCausalModel", - "BarkCoarseModel", - "BarkFineModel", - "BarkSemanticModel", - "MusicgenMelodyModel", - "MusicgenModel", - "MusicgenForConditionalGeneration", - "SpeechT5ForSpeechToSpeech", - "SpeechT5ForTextToSpeech", - "SpeechT5HifiGan", - "VitMatteForImageMatting", - 
"SeamlessM4TTextToUnitModel", - "SeamlessM4TTextToUnitForConditionalGeneration", - "SeamlessM4TCodeHifiGan", - "SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech - "TvpForVideoGrounding", - "SeamlessM4Tv2NARTextToUnitModel", - "SeamlessM4Tv2NARTextToUnitForConditionalGeneration", - "SeamlessM4Tv2CodeHifiGan", - "SeamlessM4Tv2ForSpeechToSpeech", # no auto class for speech-to-speech - "SegGptForImageSegmentation", - "SiglipVisionModel", - "SiglipTextModel", - "Siglip2VisionModel", - "Siglip2TextModel", - "ChameleonVQVAE", # no autoclass for VQ-VAE models - "VitPoseForPoseEstimation", - "CLIPTextModel", - "MetaClip2TextModel", - "MetaClip2TextModelWithProjection", - "MetaClip2VisionModel", - "MetaClip2VisionModelWithProjection", - "MoshiForConditionalGeneration", # no auto class for speech-to-speech - "Emu3VQVAE", # no autoclass for VQ-VAE models - "Emu3TextModel", # Building part of bigger (tested) model - "JanusVQVAE", # no autoclass for VQ-VAE models - "JanusVisionModel", # Building part of bigger (tested) model - "Qwen2_5OmniTalkerForConditionalGeneration", # Building part of a bigger model - "Qwen2_5OmniTalkerModel", # Building part of a bigger model - "Qwen2_5OmniThinkerForConditionalGeneration", # Building part of a bigger model - "Qwen2_5OmniThinkerTextModel", # Building part of a bigger model - "Qwen2_5OmniToken2WavModel", # Building part of a bigger model - "Qwen2_5OmniToken2WavBigVGANModel", # Building part of a bigger model - "Qwen2_5OmniToken2WavDiTModel", # Building part of a bigger model - "CsmBackboneModel", # Building part of a bigger model - "CsmDepthDecoderModel", # Building part of a bigger model - "CsmDepthDecoderForCausalLM", # Building part of a bigger model - "CsmForConditionalGeneration", # Building part of a bigger model - "BltPatcher", # Building part of a bigger model, tested implicitly through BltForCausalLM - "Florence2VisionBackbone", # Building part of a bigger model - "Qwen3OmniMoeCode2Wav", # Building part of a bigger model - "Qwen3OmniMoeCode2WavTransformerModel", # Building part of a bigger model - "Qwen3OmniMoeTalkerCodePredictorModel", # Building part of a bigger model - "Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration", # Building part of a bigger model - "Qwen3OmniMoeTalkerForConditionalGeneration", # Building part of a bigger model - "Qwen3OmniMoeTalkerModel", # Building part of a bigger model - "Qwen3OmniMoeThinkerForConditionalGeneration", # Building part of a bigger model - "Qwen3OmniMoeThinkerTextModel", # Building part of a bigger model - "CLIPTextTransformer", # Building part of bigger (tested) model. Tested implicitly through CLIPTextModel. - "CLIPVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through CLIPVisionModel. - "MetaClip2TextTransformer", # Building part of bigger (tested) model. Tested implicitly through MetaClip2TextModel. - "MetaClip2VisionTransformer", # Building part of bigger (tested) model. Tested implicitly through MetaClip2VisionModel. - "MLCDVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through MLCDVisionModel. 
-] +IGNORE_NON_AUTO_CONFIGURED = ( + PRIVATE_MODELS.copy() + + [ + # models to ignore for model xxx mapping + "Aimv2TextModel", + "AlignTextModel", + "AlignVisionModel", + "ClapTextModel", + "ClapTextModelWithProjection", + "ClapAudioModel", + "ClapAudioModelWithProjection", + "Blip2TextModelWithProjection", + "Blip2VisionModelWithProjection", + "Blip2VisionModel", + "ErnieMForInformationExtraction", + "FastSpeech2ConformerHifiGan", + "FastSpeech2ConformerWithHifiGan", + "GitVisionModel", + "GraphormerModel", + "GraphormerForGraphClassification", + "BlipForImageTextRetrieval", + "BlipForQuestionAnswering", + "BlipVisionModel", + "BlipTextLMHeadModel", + "BlipTextModel", + "BrosSpadeEEForTokenClassification", + "BrosSpadeELForTokenClassification", + "Swin2SRForImageSuperResolution", + "BridgeTowerForImageAndTextRetrieval", + "BridgeTowerForMaskedLM", + "BridgeTowerForContrastiveLearning", + "CLIPSegForImageSegmentation", + "CLIPSegVisionModel", + "CLIPSegTextModel", + "EsmForProteinFolding", + "GPTSanJapaneseModel", + "TimeSeriesTransformerForPrediction", + "InformerForPrediction", + "AutoformerForPrediction", + "PatchTSTForPretraining", + "PatchTSTForPrediction", + "JukeboxVQVAE", + "JukeboxPrior", + "SamModel", + "Sam2Model", + "Sam2VideoModel", + "EdgeTamModel", + "EdgeTamVideoModel", + "SamHQModel", + "DPTForDepthEstimation", + "DecisionTransformerGPT2Model", + "GLPNForDepthEstimation", + "ViltForImagesAndTextClassification", + "ViltForImageAndTextRetrieval", + "ViltForTokenClassification", + "ViltForMaskedLM", + "PerceiverForMultimodalAutoencoding", + "PerceiverForOpticalFlow", + "SegformerDecodeHead", + "BeitForMaskedImageModeling", + "ChineseCLIPTextModel", + "ChineseCLIPVisionModel", + "CLIPTextModelWithProjection", + "CLIPVisionModelWithProjection", + "ClvpForCausalLM", + "ClvpModel", + "GroupViTTextModel", + "GroupViTVisionModel", + "DetrForSegmentation", + "Pix2StructVisionModel", + "Pix2StructTextModel", + "ConditionalDetrForSegmentation", + "DPRReader", + "FlaubertForQuestionAnswering", + "FlavaImageCodebook", + "FlavaTextModel", + "FlavaImageModel", + "FlavaMultimodalModel", + "GPT2DoubleHeadsModel", + "GPTSw3DoubleHeadsModel", + "InstructBlipVisionModel", + "InstructBlipQFormerModel", + "InstructBlipVideoVisionModel", + "InstructBlipVideoQFormerModel", + "LayoutLMForQuestionAnswering", + "LukeForMaskedLM", + "LukeForEntityClassification", + "LukeForEntityPairClassification", + "LukeForEntitySpanClassification", + "MgpstrModel", + "OpenAIGPTDoubleHeadsModel", + "OwlViTTextModel", + "OwlViTVisionModel", + "Owlv2TextModel", + "Owlv2VisionModel", + "OwlViTForObjectDetection", + "PatchTSMixerForPrediction", + "PatchTSMixerForPretraining", + "RagModel", + "RagSequenceForGeneration", + "RagTokenForGeneration", + "RealmEmbedder", + "RealmForOpenQA", + "RealmScorer", + "RealmReader", + "Wav2Vec2ForCTC", + "HubertForCTC", + "SEWForCTC", + "SEWDForCTC", + "XLMForQuestionAnswering", + "XLNetForQuestionAnswering", + "SeparableConv1D", + "VisualBertForRegionToPhraseAlignment", + "VisualBertForVisualReasoning", + "VisualBertForQuestionAnswering", + "VisualBertForMultipleChoice", + "XCLIPVisionModel", + "XCLIPTextModel", + "AltCLIPTextModel", + "AltCLIPVisionModel", + "AltRobertaModel", + "TvltForAudioVisualClassification", + "BarkCausalModel", + "BarkCoarseModel", + "BarkFineModel", + "BarkSemanticModel", + "MusicgenMelodyModel", + "MusicgenModel", + "MusicgenForConditionalGeneration", + "SpeechT5ForSpeechToSpeech", + "SpeechT5ForTextToSpeech", + "SpeechT5HifiGan", + 
"VitMatteForImageMatting", + "SeamlessM4TTextToUnitModel", + "SeamlessM4TTextToUnitForConditionalGeneration", + "SeamlessM4TCodeHifiGan", + "SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech + "TvpForVideoGrounding", + "SeamlessM4Tv2NARTextToUnitModel", + "SeamlessM4Tv2NARTextToUnitForConditionalGeneration", + "SeamlessM4Tv2CodeHifiGan", + "SeamlessM4Tv2ForSpeechToSpeech", # no auto class for speech-to-speech + "SegGptForImageSegmentation", + "SiglipVisionModel", + "SiglipTextModel", + "Siglip2VisionModel", + "Siglip2TextModel", + "ChameleonVQVAE", # no autoclass for VQ-VAE models + "VitPoseForPoseEstimation", + "CLIPTextModel", + "MetaClip2TextModel", + "MetaClip2TextModelWithProjection", + "MetaClip2VisionModel", + "MetaClip2VisionModelWithProjection", + "MoshiForConditionalGeneration", # no auto class for speech-to-speech + "Emu3VQVAE", # no autoclass for VQ-VAE models + "Emu3TextModel", # Building part of bigger (tested) model + "JanusVQVAE", # no autoclass for VQ-VAE models + "JanusVisionModel", # Building part of bigger (tested) model + "Qwen2_5OmniTalkerForConditionalGeneration", # Building part of a bigger model + "Qwen2_5OmniTalkerModel", # Building part of a bigger model + "Qwen2_5OmniThinkerForConditionalGeneration", # Building part of a bigger model + "Qwen2_5OmniThinkerTextModel", # Building part of a bigger model + "Qwen2_5OmniToken2WavModel", # Building part of a bigger model + "Qwen2_5OmniToken2WavBigVGANModel", # Building part of a bigger model + "Qwen2_5OmniToken2WavDiTModel", # Building part of a bigger model + "CsmBackboneModel", # Building part of a bigger model + "CsmDepthDecoderModel", # Building part of a bigger model + "CsmDepthDecoderForCausalLM", # Building part of a bigger model + "CsmForConditionalGeneration", # Building part of a bigger model + "BltPatcher", # Building part of a bigger model, tested implicitly through BltForCausalLM + "Florence2VisionBackbone", # Building part of a bigger model + "Qwen3OmniMoeCode2Wav", # Building part of a bigger model + "Qwen3OmniMoeCode2WavTransformerModel", # Building part of a bigger model + "Qwen3OmniMoeTalkerCodePredictorModel", # Building part of a bigger model + "Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration", # Building part of a bigger model + "Qwen3OmniMoeTalkerForConditionalGeneration", # Building part of a bigger model + "Qwen3OmniMoeTalkerModel", # Building part of a bigger model + "Qwen3OmniMoeThinkerForConditionalGeneration", # Building part of a bigger model + "Qwen3OmniMoeThinkerTextModel", # Building part of a bigger model + "CLIPTextTransformer", # Building part of bigger (tested) model. Tested implicitly through CLIPTextModel. + "CLIPVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through CLIPVisionModel. + "MetaClip2TextTransformer", # Building part of bigger (tested) model. Tested implicitly through MetaClip2TextModel. + "MetaClip2VisionTransformer", # Building part of bigger (tested) model. Tested implicitly through MetaClip2VisionModel. + "MLCDVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through MLCDVisionModel. + ] +) # Update this list for models that have multiple model types for the same model doc. 
From bef3279f55347b541c75f4eb99bed592efe0e56f Mon Sep 17 00:00:00 2001 From: vasqu Date: Mon, 20 Oct 2025 20:18:21 +0200 Subject: [PATCH 8/9] oops --- docs/source/en/model_doc/mlcd.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/mlcd.md b/docs/source/en/model_doc/mlcd.md index 4822fd7df01e..4a603944f62d 100644 --- a/docs/source/en/model_doc/mlcd.md +++ b/docs/source/en/model_doc/mlcd.md @@ -75,7 +75,7 @@ print(f"Extracted features shape: {features.shape}") [[autodoc]] MLCDVisionConfig -## MLCDVisionModel +## MLCDVisionTransformer [[autodoc]] MLCDVisionTransformer - forward From d845e1a0cd6b336eeaf8a3538717f6e02aa80e93 Mon Sep 17 00:00:00 2001 From: vasqu Date: Mon, 20 Oct 2025 20:23:02 +0200 Subject: [PATCH 9/9] fix mlcd --- src/transformers/models/mlcd/modeling_mlcd.py | 6 +++--- src/transformers/models/mlcd/modular_mlcd.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index 09423413e751..e874070e4543 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -388,9 +388,9 @@ def forward( hidden_states = inputs_embeds for encoder_layer in self.layers: hidden_states = encoder_layer( - hidden_states=hidden_states, - position_embeddings=position_embeddings, - attention_mask=attention_mask, + hidden_states, + position_embeddings, + attention_mask, **kwargs, ) diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py index 6dd065c16373..c28f2b205c1e 100644 --- a/src/transformers/models/mlcd/modular_mlcd.py +++ b/src/transformers/models/mlcd/modular_mlcd.py @@ -328,9 +328,9 @@ def forward( hidden_states = inputs_embeds for encoder_layer in self.layers: hidden_states = encoder_layer( - hidden_states=hidden_states, - position_embeddings=position_embeddings, - attention_mask=attention_mask, + hidden_states, + position_embeddings, + attention_mask, **kwargs, )