|
 #
 
 import functools
-import json
 import os
 import signal
-import subprocess
-import sys
-import time
 from collections.abc import Sequence
-from typing import Any, Callable, Optional
+from typing import Callable
 
-import httpx
-import openai
-import requests
 import torch
 import torch.nn.functional as F
 from typing_extensions import ParamSpec
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.entrypoints.cli.serve import ServeSubcommand
-from vllm.model_executor.model_loader import get_model_loader
-from vllm.utils import FlexibleArgumentParser, get_open_port
 
 _P = ParamSpec("_P")
 
@@ -115,152 +104,3 @@ def check_embeddings_close(
                     f"\n{name_1}:\t{embeddings_1[:16]!r}")
 
         assert sim >= 1 - tol, fail_msg
-
-
-class RemoteOpenAIServer:
-    DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
-
-    def _start_server(self, model: str, vllm_serve_args: list[str],
-                      env_dict: Optional[dict[str, str]]) -> None:
-        """Subclasses override this method to customize server process launch
-        """
-        env = os.environ.copy()
-        # the current process might initialize npu,
-        # to be safe, we should use spawn method
-        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
-        if env_dict is not None:
-            env.update(env_dict)
-        self.proc: subprocess.Popen = subprocess.Popen(
-            ["vllm", "serve", model, *vllm_serve_args],
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-        )
-
-    def __init__(self,
-                 model: str,
-                 vllm_serve_args: list[str],
-                 *,
-                 env_dict: Optional[dict[str, str]] = None,
-                 seed: Optional[int] = 0,
-                 auto_port: bool = True,
-                 max_wait_seconds: Optional[float] = None,
-                 override_hf_configs: Optional[dict[str, Any]] = None) -> None:
-        if auto_port:
-            if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
-                raise ValueError("You have manually specified the port "
-                                 "when `auto_port=True`.")
-
-            # No need for a port if using unix sockets
-            if "--uds" not in vllm_serve_args:
-                # Don't mutate the input args
-                vllm_serve_args = vllm_serve_args + [
-                    "--port", str(get_open_port())
-                ]
-        if seed is not None:
-            if "--seed" in vllm_serve_args:
-                raise ValueError("You have manually specified the seed "
-                                 f"when `seed={seed}`.")
-
-            vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]
-
-        if override_hf_configs is not None:
-            vllm_serve_args = vllm_serve_args + [
-                "--hf-overrides",
-                json.dumps(override_hf_configs)
-            ]
-
-        parser = FlexibleArgumentParser(
-            description="vLLM's remote OpenAI server.")
-        subparsers = parser.add_subparsers(required=False, dest="subparser")
-        parser = ServeSubcommand().subparser_init(subparsers)
-        args = parser.parse_args(["--model", model, *vllm_serve_args])
-        self.uds = args.uds
-        if args.uds:
-            self.host = None
-            self.port = None
-        else:
-            self.host = str(args.host or 'localhost')
-            self.port = int(args.port)
-
-        self.show_hidden_metrics = \
-            args.show_hidden_metrics_for_version is not None
-
-        # download the model before starting the server to avoid timeout
-        is_local = os.path.isdir(model)
-        if not is_local:
-            engine_args = AsyncEngineArgs.from_cli_args(args)
-            model_config = engine_args.create_model_config()
-            load_config = engine_args.create_load_config()
-
-            model_loader = get_model_loader(load_config)
-            model_loader.download_model(model_config)
-
-        self._start_server(model, vllm_serve_args, env_dict)
-        max_wait_seconds = max_wait_seconds or 240
-        self._wait_for_server(url=self.url_for("health"),
-                              timeout=max_wait_seconds)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.proc.terminate()
-        try:
-            self.proc.wait(8)
-        except subprocess.TimeoutExpired:
-            # force kill if needed
-            self.proc.kill()
-
-    def _poll(self) -> Optional[int]:
-        """Subclasses override this method to customize process polling"""
-        return self.proc.poll()
-
-    def _wait_for_server(self, *, url: str, timeout: float):
-        # run health check
-        start = time.time()
-        client = (httpx.Client(transport=httpx.HTTPTransport(
-            uds=self.uds)) if self.uds else requests)
-        while True:
-            try:
-                if client.get(url).status_code == 200:
-                    break
-            except Exception:
-                # this exception can only be raised by requests.get,
-                # which means the server is not ready yet.
-                # the stack trace is not useful, so we suppress it
-                # by using `raise from None`.
-                result = self._poll()
-                if result is not None and result != 0:
-                    raise RuntimeError("Server exited unexpectedly.") from None
-
-                time.sleep(0.5)
-                if time.time() - start > timeout:
-                    raise RuntimeError(
-                        "Server failed to start in time.") from None
-
-    @property
-    def url_root(self) -> str:
-        return (f"http://{self.uds.split('/')[-1]}"
-                if self.uds else f"http://{self.host}:{self.port}")
-
-    def url_for(self, *parts: str) -> str:
-        return self.url_root + "/" + "/".join(parts)
-
-    def get_client(self, **kwargs):
-        if "timeout" not in kwargs:
-            kwargs["timeout"] = 600
-        return openai.OpenAI(
-            base_url=self.url_for("v1"),
-            api_key=self.DUMMY_API_KEY,
-            max_retries=0,
-            **kwargs,
-        )
-
-    def get_async_client(self, **kwargs):
-        if "timeout" not in kwargs:
-            kwargs["timeout"] = 600
-        return openai.AsyncOpenAI(base_url=self.url_for("v1"),
-                                  api_key=self.DUMMY_API_KEY,
-                                  max_retries=0,
-                                  **kwargs)