A Proof of Concept of vLLM at the Edge with MCP calling
#!/bin/bash
set -Eeuo pipefail
# Start vLLM server with Qwen model
# This script starts the vLLM OpenAI-compatible API server
MODEL="${1:-Qwen/Qwen3-8B}"
HOST="${2:-127.0.0.1}"
PORT="${3:-8000}"
echo "Starting vLLM server..."
echo "Model: $MODEL"
echo "Host: $HOST"
echo "Port: $PORT"
echo "Hugging Face Token: $HF_TOKEN"
echo ""
echo "Note: This will download the model if not already cached."
echo "Press Ctrl+C to stop the server."
echo ""
mkdir -p ~/.cache/vllm
# see https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm
podman run --name vllm --rm --device nvidia.com/gpu=all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  -v ~/.cache/vllm:/root/.cache/vllm \
  --env "HF_TOKEN=${HF_TOKEN:-}" \
  -p "$HOST:$PORT:8000" --ipc=host \
  docker.io/vllm/vllm-openai:latest "$MODEL" \
  --gpu-memory-utilization 0.95 --max-model-len 16384 \
  --served-model-name qwen \
  --enable-auto-tool-choice --tool-call-parser hermes \
  --reasoning-parser deepseek_r1
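
Once the server is up, tool calling (the mechanism an MCP client builds on) can be exercised through the OpenAI-compatible API. The sketch below assumes the script's defaults (host 127.0.0.1, port 8000, served model name "qwen"); the get_weather tool is a hypothetical example standing in for a tool that an MCP server would expose.

# List the served models to confirm the server is reachable.
curl -s http://127.0.0.1:8000/v1/models

# Send a chat completion request with a tool definition; with
# --enable-auto-tool-choice the model may answer with a tool call.
curl -s http://127.0.0.1:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "qwen",
        "messages": [{"role": "user", "content": "What is the weather in Paris right now?"}],
        "tools": [{
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
              "type": "object",
              "properties": {"city": {"type": "string"}},
              "required": ["city"]
            }
          }
        }],
        "tool_choice": "auto"
      }'

If tool calling is wired up correctly, the response should contain a tool_calls entry naming get_weather with JSON arguments rather than a plain text answer; an MCP client would then execute the tool and feed the result back to the model.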