
Commit af69342

gc-fuhzjane and hzjane authored

Upgrade to vLLM 0.6.6 (#12796)

* init
* update engine init
* fix serving load_in_low_bit problem
* temp
* temp
* temp
* temp
* temp
* fix
* fixed
* done
* fix
* fix all arguments
* fix
* fix throughput script
* fix
* fix
* use official ipex-llm
* Fix readme
* fix

---------

Co-authored-by: hzjane <[email protected]>

1 parent f8ab833 · commit af69342

14 files changed: +1000 −907 lines
Lines changed: 89 additions & 63 deletions
@@ -1,4 +1,62 @@
-FROM intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04
+# First stage: build oneccl
+FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 AS build
+
+ARG http_proxy
+ARG https_proxy
+
+ENV TZ=Asia/Shanghai
+ENV PYTHONUNBUFFERED=1
+
+ARG PIP_NO_CACHE_DIR=false
+
+ADD ./ccl_torch.patch /tmp/
+
+RUN apt-get update && \
+apt-get install -y --no-install-recommends curl wget git libunwind8-dev vim less && \
+ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
+env DEBIAN_FRONTEND=noninteractive apt-get update && \
+# add-apt-repository requires gnupg, gpg-agent, software-properties-common
+apt-get install -y --no-install-recommends gnupg gpg-agent software-properties-common && \
+# Add Python 3.11 PPA repository
+add-apt-repository ppa:deadsnakes/ppa -y && \
+apt-get install -y --no-install-recommends python3.11 git curl wget && \
+rm /usr/bin/python3 && \
+ln -s /usr/bin/python3.11 /usr/bin/python3 && \
+ln -s /usr/bin/python3 /usr/bin/python && \
+apt-get install -y --no-install-recommends python3-pip python3.11-dev python3-wheel python3.11-distutils && \
+wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py && \
+# Install FastChat from source requires PEP 660 support
+python3 get-pip.py && \
+rm get-pip.py && \
+pip install --upgrade requests argparse urllib3 && \
+apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \
+# If we do not install this compute-runtime, we will fail the build later
+mkdir -p /tmp/neo && \
+cd /tmp/neo && \
+wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-core-2_2.5.6+18417_amd64.deb && \
+wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-opencl-2_2.5.6+18417_amd64.deb && \
+wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu-dbgsym_1.6.32224.5_amd64.ddeb && \
+wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu_1.6.32224.5_amd64.deb && \
+wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd-dbgsym_24.52.32224.5_amd64.ddeb && \
+wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd_24.52.32224.5_amd64.deb && \
+wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/libigdgmm12_22.5.5_amd64.deb && \
+dpkg -i *.deb && \
+pip install --pre --upgrade ipex-llm[xpu_2.6] --extra-index-url https://download.pytorch.org/whl/test/xpu && \
+mkdir /build && \
+cd /build && \
+git clone https://github.com/intel/torch-ccl.git && \
+cd torch-ccl && \
+git checkout ccl_torch2.5.0+xpu && \
+git submodule sync && \
+git submodule update --init --recursive && \
+# This patch will enable build torch-ccl with pytorch 2.6 environment
+git apply /tmp/ccl_torch.patch && \
+USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py bdist_wheel
+# File path: /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl
+
+FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04
+
+COPY --from=build /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl /opt/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl

ARG http_proxy
ARG https_proxy
@@ -11,22 +69,12 @@ ENV VLLM_RPC_TIMEOUT=100000

# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false
-ADD ./gradio_web_server.patch /tmp/gradio_web_server.patch
-ADD ./oneccl-binding.patch /tmp/oneccl-binding.patch

-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-rm /etc/apt/sources.list.d/intel-graphics.list && \
-wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-chmod 644 /usr/share/keyrings/intel-graphics.gpg && \
-apt-get update && \
+RUN apt-get update && \
apt-get install -y --no-install-recommends curl wget git libunwind8-dev vim less && \
ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
env DEBIAN_FRONTEND=noninteractive apt-get update && \
-# add-apt-repository requires gnupg, gpg-agent, software-properties-common
-apt-get install -y --no-install-recommends gnupg gpg-agent software-properties-common && \
+apt-get install -y --no-install-recommends gnupg gpg-agent software-properties-common kmod && \
# Add Python 3.11 PPA repository
add-apt-repository ppa:deadsnakes/ppa -y && \
apt-get install -y --no-install-recommends python3.11 git curl wget && \
@@ -35,81 +83,59 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
ln -s /usr/bin/python3 /usr/bin/python && \
apt-get install -y --no-install-recommends python3-pip python3.11-dev python3-wheel python3.11-distutils && \
wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py && \
-# Install FastChat from source requires PEP 660 support
python3 get-pip.py && \
rm get-pip.py && \
pip install --upgrade requests argparse urllib3 && \
-pip install --pre --upgrade ipex-llm[xpu,serving] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
+pip install --pre --upgrade ipex-llm[xpu_2.6] --extra-index-url https://download.pytorch.org/whl/test/xpu && \
pip install transformers_stream_generator einops tiktoken && \
pip install --upgrade colorama && \
-# Download all-in-one benchmark and examples
-git clone https://github.com/intel-analytics/ipex-llm && \
-# The following comment segment is used when building from source...
-# cd ipex-llm && \
-# git fetch origin pull/12338/head:local_pr && \
-# git checkout local_pr && \
-# pip uninstall -y ipex-llm && \
-# cd python/llm && \
-# python setup.py install && \
-# cd ../../../ && \
+git clone https://github.com/intel/ipex-llm.git && \
cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
+cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
+rm -rf ./ipex-llm && \
# Install vllm dependencies
pip install --upgrade fastapi && \
pip install --upgrade "uvicorn[standard]" && \
-# Download vLLM-Serving
-cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
-rm -rf ./ipex-llm && \
# Install torch-ccl
-cd /tmp/ && \
-pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30.post0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
-# Internal oneccl
-wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.6.5-release/oneccl_wks_installer_2024.0.0.6.5.sh && \
-bash oneccl_wks_installer_2024.0.0.6.5.sh && \
-git clone https://github.com/intel/torch-ccl -b v2.1.300+xpu && \
-cd torch-ccl && \
-patch -p1 < /tmp/oneccl-binding.patch && \
-USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py install && \
+pip install /opt/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl && \
+# install Internal oneccl
+cd /opt && \
+wget https://sourceforge.net/projects/oneccl-wks/files/2025.0.0.6.6-release/oneccl_wks_installer_2025.0.0.6.6.sh && \
+bash oneccl_wks_installer_2025.0.0.6.6.sh && \
apt-get update && \
apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \
-# apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \
+# Install compute runtime
mkdir -p /tmp/neo && \
cd /tmp/neo && \
-wget https://github.com/oneapi-src/level-zero/releases/download/v1.18.5/level-zero_1.18.5+u22.04_amd64.deb && \
-wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17791.9/intel-igc-core_1.0.17791.9_amd64.deb && \
-wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17791.9/intel-igc-opencl_1.0.17791.9_amd64.deb && \
-wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/intel-level-zero-gpu_1.6.31294.12_amd64.deb && \
-wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/intel-opencl-icd_24.39.31294.12_amd64.deb && \
-wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/libigdgmm12_22.5.2_amd64.deb && \
+wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-core-2_2.5.6+18417_amd64.deb && \
+wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-opencl-2_2.5.6+18417_amd64.deb && \
+wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu-dbgsym_1.6.32224.5_amd64.ddeb && \
+wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu_1.6.32224.5_amd64.deb && \
+wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd-dbgsym_24.52.32224.5_amd64.ddeb && \
+wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd_24.52.32224.5_amd64.deb && \
+wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/libigdgmm12_22.5.5_amd64.deb && \
dpkg -i *.deb && \
-rm -rf /tmp/neo && \
mkdir -p /llm && \
cd /llm && \
-git clone -b 0.6.2 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
+rm -rf /tmp/neo && \
+# Install vllm
+git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm && \
cd /llm/vllm && \
pip install setuptools-scm && \
pip install --upgrade cmake && \
VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -v /llm/vllm && \
-# pip install -r /llm/vllm/requirements-xpu.txt && \
-# VLLM_TARGET_DEVICE=xpu python setup.py install && \
pip install mpi4py fastapi uvicorn openai && \
pip install gradio==4.43.0 && \
-# pip install transformers==4.44.2 && \
-# patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch && \
-pip install ray && \
-patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch
+pip install ray

-COPY ./vllm_online_benchmark.py /llm/
-COPY ./vllm_offline_inference.py /llm/
+COPY ./vllm_online_benchmark.py /llm/
+COPY ./vllm_offline_inference.py /llm/
COPY ./vllm_offline_inference_vision_language.py /llm/
-COPY ./payload-1024.lua /llm/
-COPY ./start-vllm-service.sh /llm/
-COPY ./benchmark_vllm_throughput.py /llm/
-COPY ./benchmark_vllm_latency.py /llm/
-COPY ./start-fastchat-service.sh /llm/
-COPY ./start-pp_serving-service.sh /llm/
-COPY ./start-lightweight_serving-service.sh /llm/
-
-ENV LD_LIBRARY_PATH /usr/local/lib/python3.11/dist-packages/intel_extension_for_pytorch/lib/:/opt/intel/oneapi/tbb/2021.12/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/2021.12/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.12/lib:/opt/intel/oneapi/mkl/2024.1/lib:/opt/intel/oneapi/ippcp/2021.11/lib/:/opt/intel/oneapi/ipp/2021.11/lib:/opt/intel/oneapi/dpl/2022.5/lib:/opt/intel/oneapi/dnnl/2024.1/lib:/opt/intel/oneapi/debugger/2024.1/opt/debugger/lib:/opt/intel/oneapi/dal/2024.2/lib:/opt/intel/oneapi/compiler/2024.1/opt/oclfpga/host/linux64/lib:/opt/intel/oneapi/compiler/2024.1/opt/compiler/lib:/opt/intel/oneapi/compiler/2024.1/lib:/opt/intel/oneapi/ccl/2021.12/lib/
+COPY ./payload-1024.lua /llm/
+COPY ./start-vllm-service.sh /llm/
+COPY ./benchmark_vllm_throughput.py /llm/
+COPY ./benchmark_vllm_latency.py /llm/
+COPY ./start-pp_serving-service.sh /llm/

WORKDIR /llm/
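For anyone trying the updated multi-stage build locally, here is a minimal sketch of building the image. The Dockerfile location is an assumption (the commit does not show the file path here); the image tag mirrors the README example, and the `http_proxy`/`https_proxy` build args come from the Dockerfile above and are only needed behind a proxy:

```bash
# Minimal sketch; the directory below is assumed, not taken from this commit.
cd docker/llm/serving/xpu/docker

docker build \
  --build-arg http_proxy=$http_proxy \
  --build-arg https_proxy=$https_proxy \
  -t intelanalytics/ipex-llm-serving-xpu:latest \
  -f Dockerfile .
```

The first stage only produces the `oneccl_bind_pt` wheel, which the second stage copies into `/opt` and installs, so rebuilding the runtime stage does not require rebuilding torch-ccl.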

docker/llm/serving/xpu/docker/README.md

Lines changed: 1 addition & 81 deletions
@@ -18,7 +18,7 @@ To map the `xpu` into the container, you need to specify `--device=/dev/dri` whe
An example could be:
```bash
#/bin/bash
-export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.2.0-SNAPSHOT
+export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:latest

sudo docker run -itd \
--net=host \
@@ -59,86 +59,6 @@ To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to t
For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.


-#### FastChat serving engine
-
-To set up model serving using `IPEX-LLM` as backend using FastChat, you can refer to this [quickstart](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/fastchat_quickstart.html#) or follow these quick steps to deploy a demo.
-
-##### Quick Setup for FastChat with IPEX-LLM
-
-1. **Start the Docker Container**
-
-Run the following command to launch a Docker container with device access:
-
-```bash
-#/bin/bash
-export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:latest
-
-sudo docker run -itd \
---net=host \
---device=/dev/dri \
---name=demo-container \
-# Example: map host model directory to container
--v /LLM_MODELS/:/llm/models/ \
---shm-size="16g" \
-# Optional: set proxy if needed
--e http_proxy=... \
--e https_proxy=... \
--e no_proxy="127.0.0.1,localhost" \
-$DOCKER_IMAGE
-```
-
-2. **Start the FastChat Service**
-
-Enter the container and start the FastChat service:
-```bash
-#/bin/bash
-
-# This command assumes that you have mapped the host model directory to the container
-# and the model directory is /llm/models/
-# we take Yi-1.5-34B as an example, and you can replace it with your own model
-
-ps -ef | grep "fastchat" | awk '{print $2}' | xargs kill -9
-pip install -U gradio==4.43.0
-
-# start controller
-python -m fastchat.serve.controller &
-
-export USE_XETLA=OFF
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
-
-export TORCH_LLM_ALLREDUCE=0
-export CCL_DG2_ALLREDUCE=1
-# CCL needed environment variables
-export CCL_WORKER_COUNT=4
-# pin ccl worker to cores
-# export CCL_WORKER_AFFINITY=32,33,34,35
-export FI_PROVIDER=shm
-export CCL_ATL_TRANSPORT=ofi
-export CCL_ZE_IPC_EXCHANGE=sockets
-export CCL_ATL_SHM=1
-
-source /opt/intel/1ccl-wks/setvars.sh
-
-python -m ipex_llm.serving.fastchat.vllm_worker \
---model-path /llm/models/Yi-1.5-34B \
---device xpu \
---enforce-eager \
---disable-async-output-proc \
---distributed-executor-backend ray \
---dtype float16 \
---load-in-low-bit fp8 \
---tensor-parallel-size 4 \
---gpu-memory-utilization 0.9 \
---max-model-len 4096 \
---max-num-batched-tokens 8000 &
-
-sleep 120
-
-python -m fastchat.serve.gradio_web_server &
-```
-
-This quick setup allows you to deploy FastChat with IPEX-LLM efficiently.
-
#### vLLM serving engine

To run vLLM engine using `IPEX-LLM` as backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/DockerGuides/vllm_docker_quickstart.md).
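After a container is started from this image with the `docker run` example earlier in the README, a rough sketch of bringing up the vLLM service is below. The container name is a placeholder for whatever `--name` you passed, and `start-vllm-service.sh` is the helper script the Dockerfile copies into `/llm/`; review and edit it for your model before launching:

```bash
# Container name is an assumption; substitute the --name you used with docker run.
sudo docker exec -it ipex-llm-serving-xpu-container bash

# Inside the container: helper scripts live in /llm/ (the image's WORKDIR).
cd /llm
# Adjust the script (model path, served model name, parallelism) before starting.
bash start-vllm-service.sh
```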
