#!/bin/bash
set -Eeuo pipefail

# Start vLLM server with Qwen model
# This script starts the vLLM OpenAI-compatible API server

MODEL="${1:-Qwen/Qwen3-8B}"
HOST="${2:-127.0.0.1}"
PORT="${3:-8000}"

# Fail early with a clear message if HF_TOKEN is unset; with `set -u`, an
# unchecked expansion would otherwise abort with a terse "unbound variable".
: "${HF_TOKEN:?HF_TOKEN must be set in the environment (Hugging Face access token)}"

echo "Starting vLLM server..."
echo "Model: $MODEL"
echo "Host: $HOST"
echo "Port: $PORT"
echo "Hugging Face Token: set (value hidden)"
echo ""
echo "Note: This will download the model if not already cached."
echo "Press Ctrl+C to stop the server."
echo ""

mkdir -p ~/.cache/vllm

# see https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm
podman run --name vllm --rm \
  --device nvidia.com/gpu=all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  -v ~/.cache/vllm:/root/.cache/vllm \
  --env "HF_TOKEN=$HF_TOKEN" \
  -p "$HOST:$PORT:8000" \
  --ipc=host \
  docker.io/vllm/vllm-openai:latest \
  "$MODEL" \
  --gpu-memory-utilization 0.95 \
  --max-model-len 16384 \
  --served-model-name qwen \
  --enable-auto-tool-choice \
  --tool-call-parser hermes \
  --reasoning-parser deepseek_r1
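
# Once the server logs that it is listening, you can smoke-test it from a
# second shell. A minimal sketch, assuming the defaults above (127.0.0.1:8000
# and the served model name "qwen"); vLLM exposes the OpenAI-compatible
# /v1/chat/completions endpoint:
#
#   curl "http://127.0.0.1:8000/v1/chat/completions" \
#     -H "Content-Type: application/json" \
#     -d '{"model": "qwen", "messages": [{"role": "user", "content": "Hello"}]}'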