
AI模型从开发到部署,Docker已经成为事实标准。但AI应用的容器化比普通Web应用复杂得多——GPU支持、大模型文件、内存管理、推理优化……本文将手把手带你从零搭建一个生产级的AI应用Docker部署方案。
AI应用的基础镜像选择至关重要:
# NOTE: the three FROM lines below are alternative base images; pick one.
# Lightweight (CPU-only inference)
FROM python:3.11-slim
# GPU support (recommended)
FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04
# Full development environment
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04
# Environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
# System dependencies. curl is required by the HEALTHCHECK below; without it
# the check can never succeed and the container is always reported unhealthy.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3.11 \
        python3.11-venv \
        curl \
        libgl1-mesa-glx \
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*
# On Ubuntu 22.04 the system-wide python3/pip3 commands belong to the distro's
# default Python, not to python3.11. Create a 3.11 virtual environment and put
# it first on PATH so every later `python3` / `pip3` call uses Python 3.11.
RUN python3.11 -m venv /opt/venv
ENV PATH=/opt/venv/bin:${PATH}
# Working directory
WORKDIR /app
# Install Python dependencies first so this layer stays cached until
# requirements.txt changes
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
# Copy the application code
COPY . .
# Download the model at build time so the container does not download it at startup.
# NOTE(review): this layer is rebuilt whenever any copied file changes; if
# download_model.py is self-contained, copy and run it before `COPY . .`.
RUN python3 download_model.py
# Port the service listens on
EXPOSE 8000
# Health check. The start period gives the model time to load before failed
# probes count towards the retry limit; tune it to the model's load time.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1
# Start command
CMD ["python3", "app.py"]
# Compose definition for the GPU-backed inference service.
version: '3.8'
services:
  ai-app:
    build: .
    ports:
      - "8000:8000"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this service.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - MODEL_PATH=/app/models
    volumes:
      # NOTE(review): this bind mount replaces whatever the image holds at
      # /app/models; confirm that is intended if the model is baked into the image.
      - ./models:/app/models
      - ./logs:/app/logs
    restart: unless-stopped
    healthcheck:
      # Requires curl to be present inside the image.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Grace period for model loading before failed probes count as unhealthy.
      start_period: 120s
# Fragment for a service definition: reserve specific GPUs by ID instead of by count.
deploy:
  resources:
    reservations:
      devices:
        - driver: nvidia
          device_ids: ['0', '1']  # GPUs to expose to the container
          capabilities: [gpu]
from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import threading

app = FastAPI(title="AI Inference Service")

# Load the tokenizer and model once at import time; all requests share them.
model_name = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Serializes generation so only one request drives the model at a time. The
# handler now runs in a worker thread, so this keeps the one-at-a-time
# behaviour that running on the single event loop provided implicitly.
_generate_lock = threading.Lock()


@app.get("/health")
async def health():
    """Report that the service is up; polled by the container health checks."""
    return {"status": "healthy"}


@app.post("/generate")
def generate(prompt: str, max_tokens: int = 512):
    """Generate text for ``prompt`` and return it as JSON.

    Declared with plain ``def`` rather than ``async def``: ``model.generate``
    is a long blocking call, and running it on the event loop would stall
    every other request (including ``/health``) until it finishes. FastAPI
    executes plain ``def`` handlers in a worker thread instead.

    Args:
        prompt: Input text to feed to the model.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        A dict with the key ``"response"`` holding the decoded first output
        sequence.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with _generate_lock, torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            # temperature/top_p only take effect when sampling is enabled, so
            # request it explicitly instead of relying on the model's defaults.
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}


if __name__ == "__main__":
    # The container start command is `python3 app.py`, so the script itself has
    # to start the ASGI server; bind to all interfaces so the published port works.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
# Pinned Python dependencies installed by `pip3 install -r requirements.txt`.
# Web framework and ASGI server
fastapi==0.115.0
uvicorn==0.30.0
# Model loading and inference
transformers==4.44.0
torch==2.4.0
accelerate==0.33.0
# Build stage: install the Python dependencies on top of the CUDA devel image
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS builder
# The CUDA base image does not ship Python, so pip3 must be installed before use.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /build
COPY requirements.txt .
# --user installs into /root/.local, which is the only directory copied to the
# runtime stage.
RUN pip3 install --user --no-cache-dir -r requirements.txt
# Runtime stage: smaller CUDA runtime image plus the packages built above
FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04
# Install the interpreter here as well. Both stages use the same Ubuntu release,
# so the python3 version matches the one the packages were installed for.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        python3 \
    && rm -rf /var/lib/apt/lists/*
COPY --from=builder /root/.local /root/.local
ENV PATH=/root/.local/bin:$PATH
WORKDIR /app
COPY . .
CMD ["python3", "app.py"]
# Build on the official vLLM OpenAI-server image
FROM vllm/vllm-openai:latest
WORKDIR /app
COPY app.py /app/app.py
# NOTE(review): CMD holds only flags, so it is presumably appended to the
# image's ENTRYPOINT as server arguments - confirm against the image.
CMD ["--model", "Qwen/Qwen2.5-7B-Instruct", "--host", "0.0.0.0", "--port", "8000", "--tensor-parallel-size", "1"]
# Convert the model to GGUF format (CPU-friendly)
python3 convert_to_gguf.py --model ./model --outtype q4_k_m
# Use quantization to reduce memory usage
# 7B model: FP16 needs 14GB, INT4 needs only 4GB
from prometheus_client import Counter, Histogram
from prometheus_fastapi_instrumentator import Instrumentator

app = FastAPI()

# Attach the instrumentator to the app and expose its metrics through it.
instrumentator = Instrumentator()
instrumentator.instrument(app)
instrumentator.expose(app)

# Custom metrics
REQUEST_COUNT = Counter('inference_requests_total', 'Total inference requests')
INFERENCE_TIME = Histogram('inference_duration_seconds', 'Inference duration')


@app.post("/generate")
async def generate(prompt: str):
    # Count the request, then time the inference section.
    REQUEST_COUNT.inc()
    inference_timer = INFERENCE_TIME.time()
    with inference_timer:
        # Inference logic goes here
        pass
| 问题 | 原因 | 解决方案 |
|---|---|---|
| CUDA out of memory | 模型太大 | 使用量化、减小batch_size |
| 容器启动慢 | 模型加载时间长 | 预热脚本、模型缓存 |
| GPU不可用 | 宿主机驱动版本过低或未安装NVIDIA Container Toolkit | 安装NVIDIA Container Toolkit,并确保宿主机NVIDIA驱动版本满足容器内CUDA版本的最低要求 |
| 推理速度慢 | CPU瓶颈 | 使用vLLM、TensorRT |
Docker部署AI应用的关键在于:选择合适的基础镜像、正确配置GPU支持、优化模型加载和推理性能。掌握这些技巧后,你可以将任何AI应用快速部署到生产环境。
💡 独家建议:对于生产环境,强烈推荐使用vLLM替代原生Transformers进行推理,性能提升可达3-5倍。如果需要更高吞吐量,考虑使用Triton Inference Server。