#!/bin/bash
set -Eeuo pipefail

# Start vLLM server with Qwen model
# This script starts the vLLM OpenAI-compatible API server

MODEL="${1:-Qwen/Qwen3-8B}"
HOST="${2:-127.0.0.1}"
PORT="${3:-8000}"

# Fail early with a clear message if HF_TOKEN is unset; with `set -u`, an
# unchecked expansion would otherwise abort with a terse "unbound variable".
: "${HF_TOKEN:?HF_TOKEN must be set in the environment (Hugging Face access token)}"

echo "Starting vLLM server..."
echo "Model: $MODEL"
echo "Host: $HOST"
echo "Port: $PORT"
echo "Hugging Face Token: set (value hidden)"
echo ""
echo "Note: This will download the model if not already cached."
echo "Press Ctrl+C to stop the server."
echo ""

mkdir -p ~/.cache/vllm

# see https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm
podman run --name vllm --rm \
  --device nvidia.com/gpu=all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  -v ~/.cache/vllm:/root/.cache/vllm \
  --env "HF_TOKEN=$HF_TOKEN" \
  -p "$HOST:$PORT:8000" \
  --ipc=host \
  docker.io/vllm/vllm-openai:latest \
  "$MODEL" \
  --gpu-memory-utilization 0.95 \
  --max-model-len 16384 \
  --served-model-name qwen \
  --enable-auto-tool-choice \
  --tool-call-parser hermes \
  --reasoning-parser deepseek_r1
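
# Once the server logs that it is listening, you can smoke-test it from a
# second shell. A minimal sketch, assuming the defaults above (127.0.0.1:8000
# and the served model name "qwen"); vLLM exposes the OpenAI-compatible
# /v1/chat/completions endpoint:
#
#   curl "http://127.0.0.1:8000/v1/chat/completions" \
#     -H "Content-Type: application/json" \
#     -d '{"model": "qwen", "messages": [{"role": "user", "content": "Hello"}]}'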