commit 8b361316cc
8 changed files with 461 additions and 0 deletions
@@ -0,0 +1,2 @@
.claude
venv
@@ -0,0 +1,77 @@
# AI model at the Edge with MCP support

This is a Proof of Concept of a generative AI model (LLM) at the edge, with MCP server support.

The idea is for the client to ask the AI model about the weather in Paris. The weather data is provided by an MCP server.

There are three components in this project:

- the vLLM server, serving the Qwen/Qwen3-8B model;
- the Python client, calling vLLM;
- the MCP server, serving hardcoded weather data for a city over stdio.

The MCP server has to be declared to the AI model and is called by the client whenever the AI model requests it (a sketch of that declaration is shown below).
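
For reference, the client converts each MCP tool into the OpenAI function-calling format before sending it to vLLM (see `mcp_tools_to_openai_format` in `client/mcp_client.py`). A rough sketch of what the resulting `get_weather` declaration looks like (the exact parameter schema is generated by FastMCP from the tool's signature and may differ slightly):

```python
# Sketch of the tool declaration the client passes to vLLM's chat completions API.
# The real schema comes from the MCP server's inputSchema and may include extra fields.
get_weather_tool = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a specific city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}
```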

## Prerequisites

- Python 3.8 or higher
- 8GB+ RAM (for the model)
- Internet connection (for the first-time model download)
- Podman with GPU access (`start_server.sh` runs vLLM in a container with `--device nvidia.com/gpu=all`)
- A Hugging Face token exported as `HF_TOKEN` (used by `start_server.sh` when downloading the model)

## Step 1: Setup

```bash
./setup.sh
```

Wait for all dependencies to install.
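
Optionally, you can verify that the packages from `requirements.txt` installed correctly. A quick check to run inside the virtual environment:

```python
# Print the installed version of each dependency listed in requirements.txt.
from importlib.metadata import version

for pkg in ("vllm", "openai", "fastmcp", "mcp", "huggingface-hub"):
    print(pkg, version(pkg))
```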

## Step 2: Start the vLLM Server

Open a terminal and run:

```bash
./start_server.sh
```

Wait for the model to download and the server to start. You'll see:

```
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000
```

**Keep this terminal open!**
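
Optionally, you can confirm the API is reachable before moving on. A minimal check with the OpenAI Python client, assuming the defaults from `start_server.sh` (port 8000, served model name `qwen`):

```python
# List the models served by the local vLLM endpoint; expect a single entry named "qwen".
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1")
for model in client.models.list():
    print(model.id)
```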

## Step 3: Run the Client

Open a **new terminal** and run:

```bash
./run_demo.sh
```

You should see the client:

1. Connect to the MCP weather server
2. Ask about the weather in Paris
3. Let the LLM call the `get_weather` tool
4. Receive a natural language response with the weather data

You can also pass a custom question, for example: `./run_demo.sh "What's the weather like in Tokyo?"`

## Example Output

```
Connected to MCP server. Available tools:
  - get_weather: Get the current weather for a specific city.

User: What's the weather like in Paris?

Assistant wants to call 1 tool(s):

  Calling tool: get_weather
  Arguments: {'city': 'Paris'}
  Result: {"city": "Paris", "temperature": 15, ...}
```
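
The demo script wraps `client/mcp_client.py`, but the client class can also be driven directly. A minimal sketch, assuming it runs from the repository root with the virtual environment active and the vLLM server already up (the import path is illustrative and may need adjusting to the package layout):

```python
# Programmatic use of MCPWeatherClient (sketch): connect to the MCP weather
# server over stdio, then ask the model a question that triggers the tool.
import asyncio

from client.mcp_client import MCPWeatherClient  # illustrative import path

async def demo() -> None:
    client = MCPWeatherClient()  # defaults: http://127.0.0.1:8000/v1, model "qwen"
    async with client.connect_to_mcp("mcp-server/weather_server.py"):
        await client.chat("What's the weather like in Tokyo?")

asyncio.run(demo())
```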

## Authors

- Claude Code
- Nicolas Massé
@@ -0,0 +1,217 @@
#!/usr/bin/env python3
"""
MCP Client - Integrates vLLM with MCP weather server
"""

import asyncio
import json
import os
from typing import Optional
from contextlib import asynccontextmanager

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
from openai import OpenAI


class MCPWeatherClient:
    """Client that connects vLLM to the MCP weather server"""

    def __init__(self, vllm_url: str = "http://127.0.0.1:8000/v1", model: str = "qwen"):
        self.vllm_url = vllm_url
        self.model = model
        self.openai_client = OpenAI(
            api_key="EMPTY",  # vLLM doesn't need an API key
            base_url=vllm_url
        )
        self.mcp_session: Optional[ClientSession] = None
        self.available_tools = []

    @asynccontextmanager
    async def connect_to_mcp(self, server_script_path: str):
        """Connect to the MCP server via stdio"""
        server_params = StdioServerParameters(
            command="python",
            args=[server_script_path],
            env=None
        )

        async with stdio_client(server_params) as (read, write):
            async with ClientSession(read, write) as session:
                await session.initialize()
                self.mcp_session = session

                # List available tools
                tools_list = await session.list_tools()
                print("\nConnected to MCP server. Available tools:")
                for tool in tools_list.tools:
                    print(f"  - {tool.name}: {tool.description}")
                    self.available_tools.append(tool)

                yield session

    def mcp_tools_to_openai_format(self):
        """Convert MCP tools to the OpenAI function calling format"""
        openai_tools = []

        for tool in self.available_tools:
            # Convert the MCP tool schema to the OpenAI format
            tool_def = {
                "type": "function",
                "function": {
                    "name": tool.name,
                    "description": tool.description or "",
                    "parameters": tool.inputSchema if tool.inputSchema else {
                        "type": "object",
                        "properties": {},
                        "required": []
                    }
                }
            }
            openai_tools.append(tool_def)

        return openai_tools

    async def call_mcp_tool(self, tool_name: str, arguments: dict):
        """Call an MCP tool and return the result"""
        if not self.mcp_session:
            raise RuntimeError("MCP session not initialized")

        result = await self.mcp_session.call_tool(tool_name, arguments)
        return result

    async def chat(self, user_message: str, max_iterations: int = 5):
        """
        Run a chat interaction with the model, handling tool calls via MCP

        Args:
            user_message: The user's question/message
            max_iterations: Maximum number of turns to prevent infinite loops
        """
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant that can get weather information for cities. When asked about weather, use the get_weather tool."
            },
            {
                "role": "user",
                "content": user_message
            }
        ]

        tools = self.mcp_tools_to_openai_format()
        print(f"\nUser: {user_message}\n")

        for iteration in range(max_iterations):
            # Call the model
            response = self.openai_client.chat.completions.create(
                model=self.model,
                messages=messages,
                tools=tools if tools else None,
                tool_choice="auto" if tools else None
            )

            assistant_message = response.choices[0].message

            # Add the assistant response to the conversation history
            messages.append({
                "role": "assistant",
                "content": assistant_message.content,
                "tool_calls": [
                    {
                        "id": tc.id,
                        "type": tc.type,
                        "function": {
                            "name": tc.function.name,
                            "arguments": tc.function.arguments
                        }
                    }
                    for tc in (assistant_message.tool_calls or [])
                ]
            })

            # Check if the model wants to call tools
            if assistant_message.tool_calls:
                print(f"Assistant wants to call {len(assistant_message.tool_calls)} tool(s):\n")

                # Process each tool call
                for tool_call in assistant_message.tool_calls:
                    function_name = tool_call.function.name
                    function_args = json.loads(tool_call.function.arguments)

                    print(f"  Calling tool: {function_name}")
                    print(f"  Arguments: {function_args}")

                    # Call the MCP tool
                    mcp_result = await self.call_mcp_tool(function_name, function_args)

                    # Extract content from the MCP result
                    if mcp_result.content:
                        # Handle different content types
                        result_text = ""
                        for content in mcp_result.content:
                            if hasattr(content, 'text'):
                                result_text += content.text
                            else:
                                result_text += str(content)

                        print(f"  Result: {result_text}\n")

                        # Add the tool result to the messages
                        messages.append({
                            "role": "tool",
                            "tool_call_id": tool_call.id,
                            "content": result_text
                        })
                    else:
                        print("  Result: No content returned\n")
                        messages.append({
                            "role": "tool",
                            "tool_call_id": tool_call.id,
                            "content": "No result"
                        })

                # Continue the loop to get the final response
                continue

            # No tool calls, this is the final response
            if assistant_message.content:
                print(f"Assistant: {assistant_message.content}\n")
                return assistant_message.content
            else:
                print("Assistant: (no response)")
                return None

        print("\nReached maximum iterations")
        return None


async def main():
    """Main function to run the client"""
    import sys

    # Get the MCP server script path
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    mcp_server_path = os.path.join(project_root, "mcp-server", "weather_server.py")

    if not os.path.exists(mcp_server_path):
        print(f"Error: MCP server script not found at {mcp_server_path}")
        sys.exit(1)

    # Create the client
    client = MCPWeatherClient()

    # Connect to the MCP server and run the chat
    async with client.connect_to_mcp(mcp_server_path):
        # Example question about the weather
        question = "What's the weather like in Paris?"

        if len(sys.argv) > 1:
            question = " ".join(sys.argv[1:])

        await client.chat(question)


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
"""
MCP Weather Server - Provides hardcoded weather data for cities via stdio
"""

from fastmcp import FastMCP

# Initialize the MCP server
mcp = FastMCP("weather-server")

# Hardcoded weather data
WEATHER_DATA = {
    "paris": {
        "city": "Paris",
        "temperature": 15,
        "condition": "Partly cloudy",
        "humidity": 65,
        "wind_speed": 12
    },
    "london": {
        "city": "London",
        "temperature": 12,
        "condition": "Rainy",
        "humidity": 80,
        "wind_speed": 18
    },
    "new york": {
        "city": "New York",
        "temperature": 20,
        "condition": "Sunny",
        "humidity": 55,
        "wind_speed": 10
    },
    "tokyo": {
        "city": "Tokyo",
        "temperature": 18,
        "condition": "Clear",
        "humidity": 60,
        "wind_speed": 8
    }
}


@mcp.tool()
def get_weather(city: str) -> dict:
    """
    Get the current weather for a specific city.

    Args:
        city: The name of the city to get weather for

    Returns:
        A dictionary containing weather information including temperature,
        condition, humidity, and wind speed
    """
    city_lower = city.lower()

    if city_lower in WEATHER_DATA:
        return WEATHER_DATA[city_lower]
    else:
        return {
            "city": city,
            "error": f"Weather data not available for {city}",
            "available_cities": list(WEATHER_DATA.keys())
        }


if __name__ == "__main__":
    # Run the MCP server using stdio transport
    mcp.run(transport="stdio")
@@ -0,0 +1,5 @@
vllm>=0.6.0
openai>=1.0.0
fastmcp>=0.1.0
mcp>=1.0.0
huggingface-hub>=0.20.0
@@ -0,0 +1,35 @@
#!/bin/bash

# Demo script to run the MCP-enabled LLM client
# Make sure the vLLM server is running first!

cd "$(dirname "$0")"

# Activate the virtual environment if it exists
if [ -d "venv" ]; then
    source venv/bin/activate
fi

# Check if the vLLM server is running
if ! curl -s http://127.0.0.1:8000/health > /dev/null 2>&1; then
    echo "⚠️  Warning: vLLM server doesn't seem to be running on port 8000"
    echo "Please start it first with: ./start_server.sh"
    echo ""
    read -p "Continue anyway? (y/n) " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        exit 1
    fi
fi

# Run the client
echo "Starting MCP client..."
echo ""

if [ $# -eq 0 ]; then
    # Default question
    python client/mcp_client.py "What's the weather like in Paris?"
else
    # Custom question
    python client/mcp_client.py "$@"
fi
@@ -0,0 +1,30 @@
#!/bin/bash

set -e

# Check for Python
if ! command -v python3 &> /dev/null; then
    echo "❌ Python 3 is not installed"
    exit 1
fi

PYTHON_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[:2])))')
echo "✓ Python version: $PYTHON_VERSION"

# Create the virtual environment
if [ ! -d "venv" ]; then
    echo "📦 Creating the virtual environment..."
    python3 -m venv venv
fi

# Activate the environment
source venv/bin/activate

# Install dependencies
echo "📥 Installing dependencies..."
pip install --upgrade pip
pip install -r requirements.txt

echo ""
echo "✅ Installation complete!"
echo ""
@@ -0,0 +1,25 @@
#!/bin/bash

set -Eeuo pipefail

# Start the vLLM server with the Qwen model
# This script starts the vLLM OpenAI-compatible API server

MODEL="${1:-Qwen/Qwen3-8B}"
HOST="${2:-127.0.0.1}"
PORT="${3:-8000}"

echo "Starting vLLM server..."
echo "Model: $MODEL"
echo "Host: $HOST"
echo "Port: $PORT"
echo "Hugging Face Token: $HF_TOKEN"
echo ""
echo "Note: This will download the model if not already cached."
echo "Press Ctrl+C to stop the server."
echo ""

mkdir -p ~/.cache/vllm

# see https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm
podman run --name vllm --rm \
    --device nvidia.com/gpu=all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -v ~/.cache/vllm:/root/.cache/vllm \
    --env "HF_TOKEN=$HF_TOKEN" \
    -p "$HOST:$PORT:8000" \
    --ipc=host \
    docker.io/vllm/vllm-openai:latest \
    "$MODEL" \
    --gpu-memory-utilization 0.95 \
    --max-model-len 16384 \
    --served-model-name qwen \
    --enable-auto-tool-choice \
    --tool-call-parser hermes \
    --reasoning-parser deepseek_r1