@@ -59,7 +59,7 @@ def forward(self, image_features):
5959class FastVlmPreTrainedModel (PreTrainedModel ):
6060 config : FastVlmConfig
6161 base_model_prefix = "model"
62- input_modalities = [ "image" , "text" ]
62+ input_modalities = ( "image" , "text" )
6363 supports_gradient_checkpointing = True
6464 _skip_keys_device_placement = "past_key_values"
6565
@@ -195,12 +195,11 @@ def forward(
195195 ** kwargs : Unpack [TransformersKwargs ],
196196 ) -> Union [tuple , FastVlmModelOutputWithPast ]:
197197 r"""
198- vision_feature_select_strategy (`str`, *optional*):
199- The feature selection strategy used to select the vision feature from the vision backbone. Only "full" supported.
200-
201198 vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*):
202199 The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the
203200 corresponding indices will be concatenated to form the vision features. Only -1 supported.
201+ vision_feature_select_strategy (`str`, *optional*):
202+ The feature selection strategy used to select the vision feature from the vision backbone. Only "full" supported.
204203 """
205204 vision_feature_layer = (
206205 vision_feature_layer if vision_feature_layer is not None else self .config .vision_feature_layer
@@ -335,18 +334,16 @@ def forward(
335334 ** kwargs : Unpack [TransformersKwargs ],
336335 ) -> Union [tuple , FastVlmCausalLMOutputWithPast ]:
337336 r"""
337+ vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*):
338+ The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the
339+ corresponding indices will be concatenated to form the vision features. Only -1 supported.
340+ vision_feature_select_strategy (`str`, *optional*):
341+ The feature selection strategy used to select the vision feature from the vision backbone. Only "full" supported.
338342 labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
339343 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
340344 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
341345 (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
342346
343- vision_feature_select_strategy (`str`, *optional*):
344- The feature selection strategy used to select the vision feature from the vision backbone. Only "full" supported.
345-
346- vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*):
347- The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the
348- corresponding indices will be concatenated to form the vision features. Only -1 supported.
349-
350347 Example:
351348
352349 ```python
0 commit comments