#!/bin/bash

# -E: inherit ERR traps; -e: exit on error; -u: treat unset variables as errors;
# -o pipefail: a pipeline fails if any command in it fails
set -Eeuo pipefail

# Start vLLM server with Qwen model
# This script starts the vLLM OpenAI-compatible API server
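# Usage (script filename assumed): ./start-vllm.sh [MODEL] [HOST] [PORT]
# e.g. ./start-vllm.sh Qwen/Qwen3-8B 0.0.0.0 8080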

MODEL="${1:-Qwen/Qwen3-8B}"
HOST="${2:-127.0.0.1}"
PORT="${3:-8000}"

echo "Starting vLLM server..."
echo "Model: $MODEL"
echo "Host: $HOST"
echo "Port: $PORT"
# Report whether the token is present without printing the secret itself;
# using ${HF_TOKEN:-} also keeps `set -u` from aborting when it is unset.
if [ -n "${HF_TOKEN:-}" ]; then
    echo "Hugging Face Token: set"
else
    echo "Hugging Face Token: not set"
fi
echo ""
echo "Note: This will download the model if not already cached."
echo "Press Ctrl+C to stop the server."
echo ""

# Pre-create the cache directory used by the volume mount below.
mkdir -p ~/.cache/vllm

# see https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm
# Tool-calling and reasoning-parser flags follow the Qwen docs linked above.
podman run --name vllm --rm \
    --device nvidia.com/gpu=all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -v ~/.cache/vllm:/root/.cache/vllm \
    --env "HF_TOKEN=${HF_TOKEN:-}" \
    -p "$HOST:$PORT:8000" \
    --ipc=host \
    docker.io/vllm/vllm-openai:latest \
    "$MODEL" \
    --gpu-memory-utilization 0.95 \
    --max-model-len 16384 \
    --served-model-name qwen \
    --enable-auto-tool-choice \
    --tool-call-parser hermes \
    --reasoning-parser deepseek_r1
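
# The container serves vLLM's standard OpenAI-compatible routes. Once it is up,
# a quick smoke test from another terminal might look like this (host, port, and
# the "qwen" model name match the defaults and --served-model-name above):
#
#   curl http://127.0.0.1:8000/v1/models
#   curl http://127.0.0.1:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "qwen", "messages": [{"role": "user", "content": "Hello"}]}'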