A Proof of Concept of vLLM at the Edge with MCP calling
#!/bin/bash
set -Eeuo pipefail
# Start vLLM server with Qwen model
# This script starts the vLLM OpenAI-compatible API server
MODEL="${1:-Qwen/Qwen3-8B}"
HOST="${2:-127.0.0.1}"
PORT="${3:-8000}"
echo "Starting vLLM server..."
echo "Model: $MODEL"
echo "Host: $HOST"
echo "Port: $PORT"
echo "Hugging Face Token: $HF_TOKEN"
echo ""
echo "Note: This will download the model if not already cached."
echo "Press Ctrl+C to stop the server."
echo ""
mkdir -p ~/.cache/vllm
# see https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm
podman run --name vllm --rm --device nvidia.com/gpu=all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  -v ~/.cache/vllm:/root/.cache/vllm \
  --env "HF_TOKEN=${HF_TOKEN:-}" \
  -p "$HOST:$PORT:8000" --ipc=host \
  docker.io/vllm/vllm-openai:latest "$MODEL" \
  --gpu-memory-utilization 0.95 --max-model-len 16384 \
  --served-model-name qwen \
  --enable-auto-tool-choice --tool-call-parser hermes \
  --reasoning-parser deepseek_r1
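
Once the server is up, tool calling (the mechanism an MCP client builds on) can be exercised through the OpenAI-compatible API. The sketch below assumes the script's defaults (host 127.0.0.1, port 8000, served model name "qwen"); the get_weather tool is a hypothetical example standing in for a tool that an MCP server would expose.

# List the served models to confirm the server is reachable.
curl -s http://127.0.0.1:8000/v1/models

# Send a chat completion request with a tool definition; with
# --enable-auto-tool-choice the model may answer with a tool call.
curl -s http://127.0.0.1:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "qwen",
        "messages": [{"role": "user", "content": "What is the weather in Paris right now?"}],
        "tools": [{
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
              "type": "object",
              "properties": {"city": {"type": "string"}},
              "required": ["city"]
            }
          }
        }],
        "tool_choice": "auto"
      }'

If tool calling is wired up correctly, the response should contain a tool_calls entry naming get_weather with JSON arguments rather than a plain text answer; an MCP client would then execute the tool and feed the result back to the model.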