Skip to content

Commit 3a6acad

Browse files
[Model] Enable encoder DP for MiniCPM-V (#23948)
Signed-off-by: zjy0516 <[email protected]>
Signed-off-by: Jiangyun Zhu <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
1 parent 5490d63 commit 3a6acad

File tree

2 files changed

+30
-15
lines changed

2 files changed

+30
-15
lines changed

docs/configuration/optimization.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
 Known supported models:
 
 - Llama4 (<gh-pr:18368>)
-- MiniCPM-V-4 (<gh-pr:23327>)
+- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
 - Qwen2.5-VL (<gh-pr:22742>)
 - Step3 (<gh-pr:22697>)

vllm/model_executor/models/minicpmv.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -977,6 +977,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
977977
instantiated.
978978
"""
979979

980+
supports_encoder_tp_data = True
981+
980982
@classmethod
981983
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
982984
if modality.startswith("image"):
@@ -990,6 +992,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
990992
config = vllm_config.model_config.hf_config
991993
multimodal_config = vllm_config.model_config.multimodal_config
992994
quant_config = vllm_config.quant_config
995+
self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
993996
super().__init__()
994997
# All MiniCPM-V models disable `tie_word_embeddings` but
995998
# `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot
@@ -1237,6 +1240,8 @@ def get_vision_hidden_states(
12371240

12381241
class MiniCPMV2_0(MiniCPMVBaseModel):
12391242

1243+
supports_encoder_tp_data = False
1244+
12401245
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
12411246
super().__init__(vllm_config=vllm_config, prefix=prefix)
12421247
assert self.version == (2, 0)
@@ -1351,9 +1356,12 @@ def init_vision_module(
13511356
quant_config: Optional[QuantizationConfig],
13521357
prefix: str = "",
13531358
) -> nn.Module:
1354-
model = Idefics2VisionTransformer(config.vision_config,
1355-
quant_config=quant_config,
1356-
prefix=prefix)
1359+
model = Idefics2VisionTransformer(
1360+
config.vision_config,
1361+
quant_config=quant_config,
1362+
prefix=prefix,
1363+
use_data_parallel=self.use_data_parallel,
1364+
)
13571365
if self.config.drop_vision_last_layer:
13581366
model.encoder.layers = model.encoder.layers[:-1]
13591367
return model
@@ -1441,9 +1449,12 @@ def init_vision_module(
14411449
quant_config: Optional[QuantizationConfig] = None,
14421450
prefix: str = "",
14431451
) -> nn.Module:
1444-
model = Idefics2VisionTransformer(config.vision_config,
1445-
quant_config=quant_config,
1446-
prefix=prefix)
1452+
model = Idefics2VisionTransformer(
1453+
config.vision_config,
1454+
quant_config=quant_config,
1455+
prefix=prefix,
1456+
use_data_parallel=self.use_data_parallel,
1457+
)
14471458
if self.config.drop_vision_last_layer:
14481459
model.encoder.layers = model.encoder.layers[:-1]
14491460
return model
@@ -1521,8 +1532,6 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
15211532
],
15221533
}
15231534

1524-
supports_encoder_tp_data = True
1525-
15261535
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
15271536
super().__init__(vllm_config=vllm_config, prefix=prefix)
15281537
assert self.version == (4, 0)
@@ -1546,9 +1555,12 @@ def init_vision_module(
15461555
prefix: str = "",
15471556
) -> nn.Module:
15481557
quant_config = self._maybe_ignore_quant_config(quant_config)
1549-
model = Idefics2VisionTransformer(config.vision_config,
1550-
quant_config=quant_config,
1551-
prefix=prefix)
1558+
model = Idefics2VisionTransformer(
1559+
config.vision_config,
1560+
quant_config=quant_config,
1561+
prefix=prefix,
1562+
use_data_parallel=self.use_data_parallel,
1563+
)
15521564
if self.config.drop_vision_last_layer:
15531565
model.encoder.layers = model.encoder.layers[:-1]
15541566
return model
@@ -1652,9 +1664,12 @@ def init_vision_module(
16521664
prefix: str = "",
16531665
) -> nn.Module:
16541666
quant_config = self._maybe_ignore_quant_config(quant_config)
1655-
model = Idefics2VisionTransformer(config.vision_config,
1656-
quant_config=quant_config,
1657-
prefix=prefix)
1667+
model = Idefics2VisionTransformer(
1668+
config.vision_config,
1669+
quant_config=quant_config,
1670+
prefix=prefix,
1671+
use_data_parallel=self.use_data_parallel,
1672+
)
16581673
if self.config.drop_vision_last_layer:
16591674
model.encoder.layers = model.encoder.layers[:-1]
16601675
return model

0 commit comments

Comments
 (0)