-
Notifications
You must be signed in to change notification settings - Fork 317
Expand file tree
/
Copy pathDockerfile
More file actions
64 lines (52 loc) · 2.28 KB
/
Dockerfile
File metadata and controls
64 lines (52 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# CUDA base pinned to an explicit tag for reproducibility.
FROM nvidia/cuda:12.9.1-base-ubuntu22.04

# Install Python tooling. --no-install-recommends keeps the layer small, and
# removing the apt lists in the SAME layer avoids baking the package index
# into the image (hadolint DL3009/DL3015).
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends python3-pip \
&& rm -rf /var/lib/apt/lists/*

# Register the CUDA forward-compat libraries with the dynamic linker so the
# image works on hosts whose driver is older than the CUDA 12.9 toolkit.
RUN ldconfig /usr/local/cuda-12.9/compat/
# Install vLLM 0.16.0 with FlashInfer, pulling PyTorch wheels from the CUDA
# 12.9 (cu129) index to match the nvidia/cuda:12.9.1 base image above.
# (The previous comment referenced CUDA 12.8 / vLLM 0.15.1 — it was stale.)
# --no-cache-dir keeps pip's download cache out of the image layer (DL3042).
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir "vllm[flashinfer]==0.16.0" --extra-index-url https://download.pytorch.org/whl/cu129
# Install additional Python dependencies (after vLLM to avoid PyTorch version conflicts)
# The BuildKit cache mount keeps pip's download cache on the build HOST, so
# rebuilds are fast without the cache ever entering an image layer.
COPY builder/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade -r /requirements.txt
# Setup for Option 2: Building the Image with the Model included.
# These are build args so one Dockerfile serves both the "bake the model in"
# and "download at runtime" workflows. ARG values are visible in
# `docker history` — never pass secrets this way (HF_TOKEN uses a secret
# mount further down).
ARG MODEL_NAME=""
ARG TOKENIZER_NAME=""
ARG BASE_PATH="/runpod-volume"
ARG QUANTIZATION=""
ARG MODEL_REVISION=""
ARG TOKENIZER_REVISION=""
ARG VLLM_NIGHTLY="false"

# Promote the build args to runtime env vars and point the Hugging Face
# caches at BASE_PATH (the network volume) so downloads persist.
# Comments are kept OUT of the line-continued ENV instruction — embedded
# comment lines inside continuations are parser-dependent and trip linters.
# - RAY_*: suppress Ray metrics agent warnings (not needed in containers).
# - TOKENIZERS_PARALLELISM / RAYON_NUM_THREADS: prevent a rayon thread-pool
#   panic in containers where ulimit -u < nproc (tokenizers uses Rust's
#   rayon, which tries to spawn one thread per CPU core).
# NOTE(review): HF_HOME conventionally points at the PARENT of the hub
# cache; here it is set to the hub dir itself — confirm this is intentional.
ENV MODEL_NAME=$MODEL_NAME \
    MODEL_REVISION=$MODEL_REVISION \
    TOKENIZER_NAME=$TOKENIZER_NAME \
    TOKENIZER_REVISION=$TOKENIZER_REVISION \
    BASE_PATH=$BASE_PATH \
    QUANTIZATION=$QUANTIZATION \
    HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \
    HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \
    HF_HOME="${BASE_PATH}/huggingface-cache/hub" \
    HF_HUB_ENABLE_HF_TRANSFER=0 \
    RAY_METRICS_EXPORT_ENABLED=0 \
    RAY_DISABLE_USAGE_STATS=1 \
    TOKENIZERS_PARALLELISM=false \
    RAYON_NUM_THREADS=4

# Make /src (the handler) and vLLM's workspace importable by python3.
ENV PYTHONPATH="/:/vllm-workspace"
# Optionally upgrade to the vLLM nightly wheel plus transformers from git
# (nightly vLLM often needs it). Enabled via --build-arg VLLM_NIGHTLY=true;
# by default this layer is a no-op.
# NOTE(review): nightly wheels and a git HEAD install are unpinned by
# design, so builds with VLLM_NIGHTLY=true are not reproducible.
# --no-cache-dir / --no-install-recommends / apt-list cleanup keep the
# layer from carrying caches into the image.
RUN if [ "${VLLM_NIGHTLY}" = "true" ]; then \
    pip install --no-cache-dir -U vllm --pre --index-url https://pypi.org/simple --extra-index-url https://wheels.vllm.ai/nightly && \
    apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/* && \
    pip install --no-cache-dir git+https://github.com/huggingface/transformers.git; \
    fi
# Application source (handler, model-download helper).
COPY src /src
# Optionally pre-download the model at build time (only when MODEL_NAME was
# passed as a build arg). The HF token arrives via a BuildKit secret mount,
# so it never lands in an image layer or in `docker history`; it is exported
# only for the lifetime of this single RUN.
RUN --mount=type=secret,id=HF_TOKEN,required=false \
if [ -f /run/secrets/HF_TOKEN ]; then \
export HF_TOKEN=$(cat /run/secrets/HF_TOKEN); \
fi && \
if [ -n "$MODEL_NAME" ]; then \
python3 /src/download_model.py; \
fi
# Start the handler.
# Exec (JSON-array) form so the handler runs as PID 1 and receives SIGTERM
# directly on `docker stop`.
# NOTE(review): no USER directive appears in this file, so the container
# runs as root — confirm whether a non-root user is feasible here.
CMD ["python3", "/src/handler.py"]