2 changes: 1 addition & 1 deletion open_source/deps/requirements_lock_torch_gpu_cuda12.txt
@@ -568,7 +568,7 @@ filelock==3.20.0 \
# huggingface-hub
# torch
# transformers
-flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl \
+flash-attn @ https://rtp-opensource.oss-cn-hangzhou.aliyuncs.com/rtp_llm/flash_attn-2.7.4.post1%2Bcu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl \
--hash=sha256:bfdb0f290cc3d21d0810ba49a360ef91090f62cdc1345ec6900447e0d12d99af
# via -r open_source/deps/requirements_torch_gpu_cuda12.txt
flashinfer-python==0.2.5 \
2 changes: 1 addition & 1 deletion open_source/deps/requirements_lock_torch_gpu_cuda12_9.txt
@@ -638,7 +638,7 @@ filelock==3.13.1 \
# huggingface-hub
# torch
# transformers
-flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl \
+flash-attn @ https://rtp-opensource.oss-cn-hangzhou.aliyuncs.com/rtp_llm/flash_attn-2.7.4.post1%2Bcu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl \
--hash=sha256:bfdb0f290cc3d21d0810ba49a360ef91090f62cdc1345ec6900447e0d12d99af
# via -r open_source/deps/requirements_torch_gpu_cuda12_9.txt
flashinfer-python==0.2.5 \
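
Editorial aside, not part of the diff: in both lock files the sha256 pin on the flash-attn wheel is unchanged, so only the download host moves from GitHub to the Aliyun OSS mirror and hash-checking still confirms the same artifact is installed. Below is a minimal Python sketch of that verification, roughly what pip does for requirements pinned with --hash; the local wheel path is hypothetical, the expected digest is the one pinned above.

import hashlib

def wheel_matches_lock(wheel_path: str, expected_sha256: str) -> bool:
    # Stream the wheel in 1 MiB chunks and compare the digest to the lock entry.
    digest = hashlib.sha256()
    with open(wheel_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_sha256

# Hypothetical local path; the expected hash is the one pinned in the lock files above.
# wheel_matches_lock(
#     "flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl",
#     "bfdb0f290cc3d21d0810ba49a360ef91090f62cdc1345ec6900447e0d12d99af",
# )
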
2 changes: 1 addition & 1 deletion open_source/deps/requirements_torch_gpu_cuda12.txt
@@ -4,7 +4,7 @@ autoawq>=0.2.9
datasets
https://mirrors.aliyun.com/pytorch-wheels/cu126/torch-2.6.0%2Bcu126-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=c55280b4da58e565d8a25e0e844dc27d0c96aaada7b90b4de70a45397faf604e
https://mirrors.aliyun.com/pytorch-wheels/cu126/torchvision-0.21.0%2Bcu126-cp310-cp310-linux_x86_64.whl#sha256=db4369a89b866b319c8dd73931c3e5f314aa535f7035ae2336ce9a26d7ace15a
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
+https://rtp-opensource.oss-cn-hangzhou.aliyuncs.com/rtp_llm/flash_attn-2.7.4.post1%2Bcu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
tensorrt==10.3.0
tensorrt-cu12-bindings==10.3.0
tensorrt-cu12-libs==10.3.0
2 changes: 1 addition & 1 deletion open_source/deps/requirements_torch_gpu_cuda12_9.txt
@@ -4,7 +4,7 @@ autoawq
datasets
https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp310-cp310-manylinux_2_28_x86_64.whl
https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp310-cp310-manylinux_2_28_x86_64.whl
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
+https://rtp-opensource.oss-cn-hangzhou.aliyuncs.com/rtp_llm/flash_attn-2.7.4.post1%2Bcu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
tensorrt==10.3.0
tensorrt-cu12-bindings==10.3.0
tensorrt-cu12-libs==10.3.0
1 change: 0 additions & 1 deletion rtp_llm/cpp/models/PyWrappedModel.cc
@@ -212,7 +212,6 @@ GptModelOutputs PyWrappedModel::forwardMicroBatched(const GptModelInputs& inputs
GptModelOutputs PyWrappedModel::forward(const GptModelInputs& inputs) {

py::gil_scoped_acquire gil;
-    printBufferDataDebug(*inputs.combo_position_ids, "forward inputs.combo_position_ids");
try {
RTP_LLM_LOG_DEBUG("Calling forward method on Python object instance.");

7 changes: 4 additions & 3 deletions rtp_llm/models_py/BUILD
@@ -14,7 +14,6 @@ py_library(
]),
deps = [
"//rtp_llm/models_py/distributed:deepep_wrapper",
"//rtp_llm/models_py/distributed:process_group_state",
],
visibility = ["//visibility:public"],
)
@@ -44,14 +43,16 @@ py_library(
name = "modules_cuda",
srcs = glob([
"modules/cuda/*.py",
-    ]),
+        "modules/cuda/**/*.py",
+    ])
)

py_library(
name = "modules_rocm",
srcs = glob([
"modules/rocm/*.py",
-    ]),
+        "modules/rocm/**/*.py",
+    ])
)

py_library(
11 changes: 1 addition & 10 deletions rtp_llm/models_py/distributed/BUILD
@@ -2,18 +2,9 @@ load("//bazel:arch_select.bzl", "deep_ep_py_deps")

deep_ep_py_deps()

-py_library(
-    name = "process_group_state",
-    srcs = ["process_group_state.py"],
-    deps = [
-        "//rtp_llm:torch",
-    ],
-    visibility = ["//visibility:public"],
-)
-
py_library(
name = "deepep_wrapper",
srcs = ["deepep_wrapper.py"],
srcs = ["deepep_wrapper.py", "deepep_initializer.py"],
deps = [
"//rtp_llm:torch",
] + select({
91 changes: 91 additions & 0 deletions rtp_llm/models_py/distributed/deepep_initializer.py
@@ -0,0 +1,91 @@
"""DeepEP initialization manager

Manages singleton initialization of DeepEP environment, ensuring thread safety.
"""

import logging
import threading
from typing import Optional

import torch
import torch.distributed

from rtp_llm.config.gpt_init_model_parameters import GptInitModelParameters

try:
import rtp_llm.models_py.distributed.deepep_wrapper as deepep_wrapper_module
except Exception as e:
logging.error(f"DeepEP is not supported on this device: {e}")
deepep_wrapper_module = None


class DeepEpInitializer:
"""Singleton class for managing DeepEP initialization state"""

_initialized: bool = False
_lock: threading.Lock = threading.Lock()

@classmethod
def supported(cls) -> bool:
return deepep_wrapper_module is not None

@classmethod
def ensure_initialized(
cls, config: GptInitModelParameters, timeout: Optional[int] = None
) -> None:
"""Ensure DeepEP environment is initialized (thread-safe)

Args:
config: Model initialization parameters
"""
if cls._initialized:
return

if not cls.supported():
raise RuntimeError("DeepEP is not supported on this device")

with cls._lock:
if cls._initialized:
return
cls._do_initialization(config, timeout)
cls._initialized = True

@classmethod
def get_deepep_wrapper(cls, config: GptInitModelParameters):
cls.ensure_initialized(config)
assert deepep_wrapper_module is not None
return deepep_wrapper_module.get_deepep_wrapper()

@classmethod
def _do_initialization(
cls, config: GptInitModelParameters, timeout: Optional[int]
) -> None:
"""Perform actual initialization logic

Args:
config: Model initialization parameters
"""
assert (
torch.distributed.is_initialized()
), "Distributed environment is not initialized"
assert deepep_wrapper_module is not None, "deepep_wrapper is not imported"
default_group = torch.distributed.group.WORLD
assert default_group is not None, "Default process group is not initialized"
deepep_wrapper_module.init_deepep_wrapper(group=default_group, params=config)

@classmethod
def is_initialized(cls) -> bool:
"""Check if initialized

Returns:
Whether initialized
"""
return cls._initialized

@classmethod
def reset(cls) -> None:
"""Reset initialization state (for testing only)"""
with cls._lock:
if cls._initialized and deepep_wrapper_module is not None:
deepep_wrapper_module.destroy_deepep_wrapper()
cls._initialized = False
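
Editorial aside, not part of the diff: the new DeepEpInitializer above is a lazily initialized, thread-safe singleton, so callers only need its class methods. Below is a minimal usage sketch assuming torch.distributed has already been initialized by the runtime and that config is a populated GptInitModelParameters; the helper name get_wrapper_for_model is hypothetical and only illustrates the intended call pattern.

import torch.distributed as dist

from rtp_llm.config.gpt_init_model_parameters import GptInitModelParameters
from rtp_llm.models_py.distributed.deepep_initializer import DeepEpInitializer


def get_wrapper_for_model(config: GptInitModelParameters):
    # Hypothetical helper: returns the shared DeepEP wrapper, or None if DeepEP
    # could not be imported on this device.
    if not DeepEpInitializer.supported():
        return None
    assert dist.is_initialized(), "call torch.distributed.init_process_group first"
    # get_deepep_wrapper() calls ensure_initialized() internally; repeated calls are
    # cheap because the singleton flag is checked before the lock is taken.
    return DeepEpInitializer.get_deepep_wrapper(config)
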
205 changes: 0 additions & 205 deletions rtp_llm/models_py/distributed/process_group_state.py

This file was deleted.
