diff --git a/config/tool/tool-audio-os.yaml b/config/tool/tool-audio-os.yaml
new file mode 100644
index 0000000..bf33f6a
--- /dev/null
+++ b/config/tool/tool-audio-os.yaml
@@ -0,0 +1,9 @@
+name: "tool-audio-os"
+tool_command: "python"
+args:
+  - "-m"
+  - "src.tool.mcp_servers.audio_mcp_server_os"
+env:
+  WHISPER_API_KEY: "${oc.env:WHISPER_API_KEY}"
+  WHISPER_BASE_URL: "${oc.env:WHISPER_BASE_URL}"
+  WHISPER_MODEL_NAME: "${oc.env:WHISPER_MODEL_NAME}"
\ No newline at end of file
diff --git a/config/tool/tool-image-video-os.yaml b/config/tool/tool-image-video-os.yaml
new file mode 100644
index 0000000..f9a61df
--- /dev/null
+++ b/config/tool/tool-image-video-os.yaml
@@ -0,0 +1,9 @@
+name: "tool-image-video-os"
+tool_command: "python"
+args:
+  - "-m"
+  - "src.tool.mcp_servers.vision_mcp_server_os"
+env:
+  VISION_API_KEY: "${oc.env:VISION_API_KEY}"
+  VISION_BASE_URL: "${oc.env:VISION_BASE_URL}"
+  VISION_MODEL_NAME: "${oc.env:VISION_MODEL_NAME}"
\ No newline at end of file
diff --git a/config/tool/tool-reasoning-os.yaml b/config/tool/tool-reasoning-os.yaml
new file mode 100644
index 0000000..c845d45
--- /dev/null
+++ b/config/tool/tool-reasoning-os.yaml
@@ -0,0 +1,9 @@
+name: "tool-reasoning-os"
+tool_command: "python"
+args:
+  - "-m"
+  - "src.tool.mcp_servers.reasoning_mcp_server_os"
+env:
+  REASONING_API_KEY: "${oc.env:REASONING_API_KEY}"
+  REASONING_BASE_URL: "${oc.env:REASONING_BASE_URL}"
+  REASONING_MODEL_NAME: "${oc.env:REASONING_MODEL_NAME}"
\ No newline at end of file
diff --git a/docs/mkdocs/docs/tool_audio_os.md b/docs/mkdocs/docs/tool_audio_os.md
new file mode 100644
index 0000000..e803846
--- /dev/null
+++ b/docs/mkdocs/docs/tool_audio_os.md
@@ -0,0 +1,149 @@
+# Audio Tools - Open Source (`audio_mcp_server_os.py`)
+
+The Audio MCP Server (Open Source) enables audio transcription using open-source Whisper models. It provides comprehensive audio-to-text conversion with support for multiple audio formats, local files, and URLs.
+
+!!! info "Available Functions"
+    This MCP server provides the following functions that agents can call:
+
+    - **Audio Transcription**: High-quality speech-to-text conversion
+    - **Multi-Format Support**: MP3, WAV, M4A, AAC, OGG, FLAC, and WMA formats
+    - **Flexible Input**: Local file paths and web URLs
+    - **Open-Source Model Support**: Whisper-Large-v3-Turbo with automatic processing
+
+---
+
+## Environment Variables
+
+!!! warning "Configuration Location"
+    The `audio_mcp_server_os.py` server reads environment variables that are passed through the `tool-audio-os.yaml` configuration file, not directly from the `.env` file.
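+
+The `${oc.env:...}` entries in `tool-audio-os.yaml` are OmegaConf environment-variable interpolations. As a minimal sketch of how they resolve (assuming OmegaConf loads this config and the variables are exported in your shell; the key value below is a placeholder):
+
+```python
+import os
+from omegaconf import OmegaConf
+
+os.environ.setdefault("WHISPER_API_KEY", "your_whisper_key")  # placeholder value
+cfg = OmegaConf.load("config/tool/tool-audio-os.yaml")
+env = OmegaConf.to_container(cfg, resolve=True)["env"]  # resolves ${oc.env:...}
+print(env["WHISPER_API_KEY"])
+```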
+
+**Open-Source Model Configuration:**
+
+- `WHISPER_API_KEY`: Required API key for the open-source Whisper service
+- `WHISPER_BASE_URL`: Base URL for the Whisper service API endpoint
+- `WHISPER_MODEL_NAME`: Model name (default: `openai/whisper-large-v3-turbo`)
+
+**Example Configuration:**
+```bash
+# API for Open-Source Audio Transcription Tool (for benchmark testing)
+WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo"
+WHISPER_API_KEY=your_whisper_key
+WHISPER_BASE_URL="https://your_whisper_base_url/v1"
+```
+
+---
+
+## Local Deployment
+
+### Using vLLM Server
+
+For optimal performance with the Whisper-Large-v3-Turbo model, deploy using vLLM:
+
+```bash
+pip install vllm==0.10.0
+pip install "vllm[audio]"
+vllm serve /path/to/whisper \
+    --served-model-name whisper-large-v3-turbo \
+    --task transcription
+```
+
+### Configuration for Local Deployment
+
+When using local deployment, configure your environment variables:
+
+```bash
+WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo"
+WHISPER_API_KEY="dummy_key"  # Not required for local deployment
+WHISPER_BASE_URL="http://localhost:8000/v1"
+```
+
+---
+
+## Function Reference
+
+The following function is provided by the `audio_mcp_server_os.py` MCP tool and can be called by agents:
+
+### `audio_transcription(audio_path_or_url: str)`
+
+Transcribe audio files to text using open-source Whisper models. Supports both local files and web URLs with automatic format detection and processing.
+
+**Parameters:**
+
+- `audio_path_or_url`: Local file path (accessible to the server) or web URL
+
+**Returns:**
+
+- `str`: The transcription of the audio file
+
+**Supported Audio Formats:**
+
+- MP3 (.mp3)
+- WAV (.wav)
+- M4A (.m4a)
+- AAC (.aac)
+- OGG (.ogg)
+- FLAC (.flac)
+- WMA (.wma)
+
+## Usage Examples
+
+### Local File Transcription
+```python
+# Local file transcription
+result = audio_transcription(
+    audio_path_or_url="/path/to/audio.mp3"
+)
+```
+
+### URL-based Transcription
+```python
+# URL transcription
+result = audio_transcription(
+    audio_path_or_url="https://example.com/audio.wav"
+)
+```
+
+### Meeting Recording Transcription
+```python
+result = audio_transcription(
+    audio_path_or_url="meeting_recording.m4a"
+)
+```
+
+### Podcast Transcription
+```python
+result = audio_transcription(
+    audio_path_or_url="podcast_episode.mp3"
+)
+```
+
+---
+
+## Technical Implementation
+
+### Audio Processing Pipeline
+
+1. **Input Validation**: Checks whether the input is a local file or a URL
+2. **Format Detection**: Determines the audio format from the file extension or content type
+3. **File Handling**: Downloads URL files to temporary storage with proper extensions
+4. **API Request**: Sends the audio file to the Whisper model for transcription
+5. **Cleanup**: Removes temporary files after processing
+6. **Response Processing**: Returns the transcription text
+
+### Error Handling
+
+- **File Access Errors**: Graceful handling of inaccessible local files
+- **Network Errors**: Robust URL fetching with retry logic (up to 3 attempts)
+- **Format Errors**: Automatic format detection and validation
+- **API Errors**: Clear error reporting for service issues
+- **Sandbox Restrictions**: Prevents access to sandbox files with clear error messages
+
+### Retry Logic
+
+- **Maximum Retries**: 3 attempts for failed requests
+- **Exponential Backoff**: Doubling delays between retries (10 seconds, then 20 seconds)
+- **Network Resilience**: Handles temporary network issues and service unavailability
+
+---
+
+!!! info "Documentation Info"
+    **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
diff --git a/docs/mkdocs/docs/tool_reasoning_os.md b/docs/mkdocs/docs/tool_reasoning_os.md
new file mode 100644
index 0000000..6d34aca
--- /dev/null
+++ b/docs/mkdocs/docs/tool_reasoning_os.md
@@ -0,0 +1,135 @@
+# Reasoning Tools - Open Source (`reasoning_mcp_server_os.py`)
+
+The Reasoning MCP Server (Open Source) provides a **pure text-based reasoning engine** using open-source models. It supports logical analysis, problem solving, and planning, with robust retry mechanisms and exponential backoff for reliability.
+
+!!! info "Available Functions"
+    This MCP server provides the following functions that agents can call:
+
+    - **Pure Text Reasoning**: Logical analysis and problem solving using open-source LLM backends
+    - **Step-by-Step Analysis**: Structured reasoning with detailed explanations
+    - **Open-Source Model Support**: Qwen3-235B-A22B-Thinking-2507 with automatic fallback
+    - **Robust Error Handling**: Exponential backoff retry logic (up to 10 attempts)
+
+---
+
+## Environment Variables
+
+!!! warning "Configuration Location"
+    The `reasoning_mcp_server_os.py` server reads environment variables that are passed through the `tool-reasoning-os.yaml` configuration file, not directly from the `.env` file.
+
+**Open-Source Model Configuration:**
+
+- `REASONING_API_KEY`: Required API key for the open-source reasoning service
+- `REASONING_BASE_URL`: Base URL for the reasoning service API endpoint
+- `REASONING_MODEL_NAME`: Model name (default: `Qwen/Qwen3-235B-A22B-Thinking-2507`)
+
+**Example Configuration:**
+```bash
+# API for Open-Source Reasoning Tool (for benchmark testing)
+REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507"
+REASONING_API_KEY=your_reasoning_key
+REASONING_BASE_URL="https://your_reasoning_base_url/v1/chat/completions"
+```
+
+---
+
+## Local Deployment
+
+### Using SGLang Server
+
+For optimal performance with the Qwen3-235B-A22B-Thinking model, deploy using SGLang:
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path /path/to/Qwen3-235B-A22B-Thinking-2507 \
+    --tp 8 --host 0.0.0.0 --port 1234 \
+    --trust-remote-code --enable-metrics \
+    --log-level debug --log-level-http debug \
+    --log-requests --log-requests-level 2 \
+    --show-time-cost --context-length 131072
+```
+
+### Configuration for Local Deployment
+
+When using local deployment, configure your environment variables:
+
+```bash
+REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507"
+REASONING_API_KEY="dummy_key"  # Not required for local deployment
+REASONING_BASE_URL="http://localhost:1234/v1/chat/completions"
+```
+
+---
+
+## Function Reference
+
+The following function is provided by the `reasoning_mcp_server_os.py` MCP tool and can be called by agents:
+
+### `reasoning(question: str)`
+
+Perform step-by-step reasoning, analysis, and planning over a **text-only input**. This tool is specialized for **complex thinking tasks** that require deep analytical reasoning.
+
+!!! note "Text-Only Processing"
+    This tool processes only the provided text input and will not fetch external data or context. Ensure all necessary information is included in the question.
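+
+For example, a self-contained call bundles every fact the model needs into the question itself (a minimal sketch; the arithmetic is illustrative):
+
+```python
+result = reasoning(
+    question=(
+        "Two trains start 300 km apart and drive toward each other "
+        "at 60 km/h and 90 km/h. After how many hours do they meet? "
+        "Show your reasoning step by step."
+    )
+)
+# expected conclusion: 300 / (60 + 90) = 2 hours
+```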
+
+**Parameters:**
+
+- `question`: A detailed, complex question or problem statement that includes all necessary information
+
+**Returns:**
+
+- `str`: A structured, step-by-step reasoned answer
+
+**Features:**
+
+- **Open-Source Model**: Uses Qwen3-235B-A22B-Thinking-2507 for advanced reasoning
+- **Robust Retry Logic**: Exponential backoff retry mechanism (up to 10 attempts)
+- **Thinking Mode Support**: Automatically strips thinking blocks and returns the final answer
+- **Error Handling**: Graceful fallback with informative error messages
+- **Timeout Protection**: 600-second timeout for long-running reasoning tasks
+- **Jittered Backoff**: Prevents thundering-herd problems with randomized retry delays
+
+**Retry Configuration:**
+
+- Maximum retries: 10 attempts
+- Initial backoff: 1.0 seconds
+- Maximum backoff: 30.0 seconds
+- Exponential backoff with jitter (0.8-1.2x multiplier)
+
+---
+
+## Usage Examples
+
+### Complex Mathematical Problems
+```python
+question = """
+Solve this complex optimization problem:
+A company wants to minimize costs while maximizing production.
+Given constraints: 2x + 3y ≤ 100, x + y ≤ 50, x ≥ 0, y ≥ 0
+Cost function: C = 5x + 8y
+Production function: P = 3x + 4y
+Find the optimal values of x and y.
+"""
+result = reasoning(question=question)
+```
+
+### Logical Puzzles
+```python
+question = """
+Three people are in a room: Alice, Bob, and Charlie.
+- Alice says: "Bob is lying"
+- Bob says: "Charlie is lying"
+- Charlie says: "Alice is lying"
+If exactly one person is telling the truth, who is it?
+"""
+result = reasoning(question=question)
+```
+
+### Strategic Planning
+```python
+question = """
+Design a strategy for a startup to enter a competitive market
+with limited resources. Consider market analysis, competitive
+positioning, resource allocation, and risk mitigation.
+"""
+result = reasoning(question=question)
+```
+
+!!! info "Documentation Info"
+    **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
diff --git a/docs/mkdocs/docs/tool_vqa_os.md b/docs/mkdocs/docs/tool_vqa_os.md
new file mode 100644
index 0000000..d308bbc
--- /dev/null
+++ b/docs/mkdocs/docs/tool_vqa_os.md
@@ -0,0 +1,149 @@
+# Vision Tools - Open Source (`vision_mcp_server_os.py`)
+
+The Vision MCP Server (Open Source) enables Visual Question Answering (VQA) over images using open-source vision-language models. It provides comprehensive image analysis with support for local files and URLs.
+
+!!! info "Available Functions"
+    This MCP server provides the following functions that agents can call:
+
+    - **Visual Question Answering**: Comprehensive image analysis and question answering
+    - **Multi-Format Support**: JPEG, PNG, and GIF image formats
+    - **Flexible Input**: Local file paths and web URLs
+    - **Open-Source Model Support**: Qwen2.5-VL-72B-Instruct with automatic encoding
+
+---
+
+## Environment Variables
+
+!!! warning "Configuration Location"
+    The `vision_mcp_server_os.py` server reads environment variables that are passed through the `tool-image-video-os.yaml` configuration file, not directly from the `.env` file.
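+
+Conceptually, the tool launcher spawns the MCP server as a subprocess and injects these variables into its environment, roughly like the following (an illustrative sketch, not the actual launcher code; values are placeholders):
+
+```python
+import os
+import subprocess
+
+subprocess.run(
+    ["python", "-m", "src.tool.mcp_servers.vision_mcp_server_os"],
+    env={**os.environ, "VISION_API_KEY": "your_vision_key"},  # placeholder key
+)
+```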
+
+**Open-Source Model Configuration:**
+
+- `VISION_API_KEY`: Required API key for the open-source vision service
+- `VISION_BASE_URL`: Base URL for the vision service API endpoint
+- `VISION_MODEL_NAME`: Model name (default: `Qwen/Qwen2.5-VL-72B-Instruct`)
+
+**Example Configuration:**
+```bash
+# API for Open-Source VQA Tool (for benchmark testing)
+VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct"
+VISION_API_KEY=your_vision_key
+VISION_BASE_URL="https://your_vision_base_url/v1/chat/completions"
+```
+
+---
+
+## Local Deployment
+
+### Using SGLang Server
+
+For optimal performance with the Qwen2.5-VL-72B-Instruct model, deploy using SGLang (version `0.5.2` is suggested, as lower versions have known issues with this model):
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path /path/to/Qwen2.5-VL-72B-Instruct \
+    --tp 8 --host 0.0.0.0 --port 1234 \
+    --trust-remote-code --enable-metrics \
+    --log-level debug --log-level-http debug \
+    --log-requests --log-requests-level 2 --show-time-cost
+```
+
+### Configuration for Local Deployment
+
+When using local deployment, configure your environment variables:
+
+```bash
+VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct"
+VISION_API_KEY="dummy_key"  # Not required for local deployment
+VISION_BASE_URL="http://localhost:1234/v1/chat/completions"
+```
+
+---
+
+## Function Reference
+
+The following function is provided by the `vision_mcp_server_os.py` MCP tool and can be called by agents:
+
+### `visual_question_answering(image_path_or_url: str, question: str)`
+
+Ask questions about images using open-source vision-language models. Supports both local files and web URLs with automatic format detection and encoding.
+
+**Parameters:**
+
+- `image_path_or_url`: Local file path (accessible to the server) or web URL
+- `question`: The user's question about the image
+
+**Returns:**
+
+- `str`: The model's answer to the image-related question
+
+**Supported Image Formats:**
+
+- JPEG (.jpg, .jpeg)
+- PNG (.png)
+- GIF (.gif)
+- Default fallback to JPEG for unknown formats
+
+## Usage Examples
+
+### Image Analysis
+```python
+# Local file analysis
+result = visual_question_answering(
+    image_path_or_url="/path/to/image.jpg",
+    question="What objects can you see in this image?"
+)
+
+# URL analysis
+result = visual_question_answering(
+    image_path_or_url="https://example.com/image.png",
+    question="Describe the scene in detail."
+)
+```
+
+### OCR and Text Extraction
+```python
+result = visual_question_answering(
+    image_path_or_url="document.jpg",
+    question="Extract all the text from this document."
+)
+```
+
+### Object Detection and Counting
+```python
+result = visual_question_answering(
+    image_path_or_url="scene.jpg",
+    question="Count how many people are in this image and describe their activities."
+)
+```
+
+### Technical Diagram Analysis
+```python
+result = visual_question_answering(
+    image_path_or_url="diagram.png",
+    question="Explain this technical diagram and identify the key components."
+)
+```
+
+---
+
+## Technical Implementation
+
+### Image Processing Pipeline
+
+1. **Input Validation**: Checks whether the input is a local file or a URL
+2. **Format Detection**: Determines the MIME type from the file extension or headers
+3. **Encoding**: Converts images to Base64 for API transmission
+4. **API Request**: Sends a structured request to the vision model
+5. **Response Processing**: Extracts and returns the model response
+
+### Error Handling
+
+- **File Access Errors**: Graceful handling of inaccessible local files
+- **Network Errors**: Robust URL fetching with proper error messages
+- **Format Errors**: Fallback MIME type detection for unknown formats
+- **API Errors**: Clear error reporting for service issues
+
+---
+
+!!! info "Documentation Info"
+    **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
index 376bf4b..a6db094 100644
--- a/docs/mkdocs/mkdocs.yml
+++ b/docs/mkdocs/mkdocs.yml
@@ -65,7 +65,10 @@ nav:
   - Overview: tool_overview.md
   - Tools:
     - tool-reasoning: tool_reasoning.md
+    - tool-reasoning-os: tool_reasoning_os.md
     - tool-image-video: tool_vqa.md
+    - tool-image-video-os: tool_vqa_os.md
+    - tool-audio-os: tool_audio_os.md
    - tool-searching: tool_searching.md
     - tool-python: tool_python.md
   - Advanced Features:
diff --git a/src/tool/mcp_servers/audio_mcp_server_os.py b/src/tool/mcp_servers/audio_mcp_server_os.py
new file mode 100644
index 0000000..1b59d98
--- /dev/null
+++ b/src/tool/mcp_servers/audio_mcp_server_os.py
@@ -0,0 +1,213 @@
+# Copyright 2025 Miromind.ai
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import base64
+import contextlib
+import mimetypes
+import os
+import tempfile
+import wave
+from urllib.parse import urlparse
+
+import requests
+from fastmcp import FastMCP
+from mutagen import File as MutagenFile
+from openai import OpenAI
+
+WHISPER_API_KEY = os.environ.get("WHISPER_API_KEY")
+WHISPER_BASE_URL = os.environ.get("WHISPER_BASE_URL")
+WHISPER_MODEL_NAME = os.environ.get("WHISPER_MODEL_NAME")
+
+# Initialize FastMCP server
+mcp = FastMCP("audio-mcp-server-os")
+
+
+def _get_audio_extension(url: str, content_type: str = None) -> str:
+    """
+    Determine the appropriate audio file extension from URL or content type.
+
+    Args:
+        url: The URL of the audio file
+        content_type: The content type from HTTP headers
+
+    Returns:
+        File extension (with dot) to use for temporary file
+    """
+    # First try to get extension from URL
+    parsed_url = urlparse(url)
+    path = parsed_url.path.lower()
+
+    # Common audio extensions
+    audio_extensions = [".mp3", ".wav", ".m4a", ".aac", ".ogg", ".flac", ".wma"]
+    for ext in audio_extensions:
+        if path.endswith(ext):
+            return ext
+
+    # If no extension found in URL, try content type
+    if content_type:
+        content_type = content_type.lower()
+        if "mp3" in content_type or "mpeg" in content_type:
+            return ".mp3"
+        elif "wav" in content_type:
+            return ".wav"
+        elif "m4a" in content_type:
+            return ".m4a"
+        elif "aac" in content_type:
+            return ".aac"
+        elif "ogg" in content_type:
+            return ".ogg"
+        elif "flac" in content_type:
+            return ".flac"
+
+    # Default fallback to mp3
+    return ".mp3"
+
+
+def _get_audio_duration(audio_path: str) -> float:
+    """
+    Get audio duration in seconds.
+
+    Tries to use wave (for .wav), then falls back to mutagen (for mp3, etc).
+    """
+    # Try using wave for .wav files
+    try:
+        with contextlib.closing(wave.open(audio_path, "rb")) as f:
+            frames = f.getnframes()
+            rate = f.getframerate()
+            duration = frames / float(rate)
+            if duration > 0:
+                return duration
+    except Exception:
+        pass  # Not a wav file or failed
+
+    # Try using mutagen for other audio formats (mp3, etc)
+    try:
+        audio = MutagenFile(audio_path)
+        if (
+            audio is not None
+            and hasattr(audio, "info")
+            and hasattr(audio.info, "length")
+        ):
+            duration = float(audio.info.length)
+            if duration > 0:
+                return duration
+    except Exception:
+        pass  # Unsupported format or unreadable file
+
+    # Neither backend could determine a positive duration
+    return 0.0
+
+
+def _encode_audio_file(audio_path: str) -> tuple[str, str]:
+    """Encode audio file to base64 and determine format."""
+    with open(audio_path, "rb") as audio_file:
+        audio_data = audio_file.read()
+        encoded_string = base64.b64encode(audio_data).decode("utf-8")
+
+    # Determine file format from file extension
+    mime_type, _ = mimetypes.guess_type(audio_path)
+    if mime_type and mime_type.startswith("audio/"):
+        mime_format = mime_type.split("/")[-1]
+        # Map MIME type formats to OpenAI supported formats
+        format_mapping = {
+            "mpeg": "mp3",  # audio/mpeg -> mp3
+            "wav": "wav",  # audio/wav -> wav
+            "wave": "wav",  # audio/wave -> wav
+        }
+        file_format = format_mapping.get(mime_format, "mp3")
+    else:
+        # Default to mp3 if we can't determine
+        file_format = "mp3"
+
+    return encoded_string, file_format
+
+
+@mcp.tool()
+async def audio_transcription(audio_path_or_url: str) -> str:
+    """
+    Transcribe an audio file to text and return the transcription.
+
+    Args:
+        audio_path_or_url: The local path of the audio file or its URL. Paths from the sandbox are not supported. YouTube URLs are not supported.
+
+    Returns:
+        The transcription of the audio file.
+    """
+    max_retries = 3
+    retry = 0
+    transcription = None
+
+    while retry < max_retries:
+        try:
+            client = OpenAI(base_url=WHISPER_BASE_URL, api_key=WHISPER_API_KEY)
+            if os.path.exists(audio_path_or_url):  # Check if the file exists locally
+                with open(audio_path_or_url, "rb") as audio_file:
+                    transcription = client.audio.transcriptions.create(
+                        model=WHISPER_MODEL_NAME, file=audio_file
+                    )
+            elif "home/user" in audio_path_or_url:
+                return "[ERROR]: The audio_transcription tool cannot access sandbox files; please use the local path provided by the original instruction"
+            else:
+                # Download the audio file from the URL
+                response = requests.get(audio_path_or_url, timeout=60)
+                response.raise_for_status()  # Raise an exception for bad status codes
+
+                # Basic content validation - check if response has content
+                if not response.content:
+                    return (
+                        "[ERROR]: Audio transcription failed: Downloaded file is empty"
+                    )
+
+                # Check content type if available
+                content_type = response.headers.get("content-type", "").lower()
+                if content_type and not any(
+                    media_type in content_type
+                    for media_type in ["audio", "video", "application/octet-stream"]
+                ):
+                    return f"[ERROR]: Audio transcription failed: Invalid content type '{content_type}'. Expected audio file."
+
+                # Get proper extension for the temporary file
+                file_extension = _get_audio_extension(audio_path_or_url, content_type)
+
+                # Use proper temporary file handling with correct extension
+                with tempfile.NamedTemporaryFile(
+                    delete=False, suffix=file_extension
+                ) as temp_file:
+                    temp_file.write(response.content)
+                    temp_audio_path = temp_file.name
+
+                try:
+                    with open(temp_audio_path, "rb") as audio_file:
+                        transcription = client.audio.transcriptions.create(
+                            model=WHISPER_MODEL_NAME, file=audio_file
+                        )
+                finally:
+                    # Clean up the temp file
+                    if os.path.exists(temp_audio_path):
+                        os.remove(temp_audio_path)
+            break
+
+        except requests.RequestException as e:
+            retry += 1
+            if retry >= max_retries:
+                return f"[ERROR]: Audio transcription failed: Failed to download audio file - {e}.\nNote: Files from the sandbox are not available. You should use the local path given in the instruction.\nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URLs are not supported."
+            await asyncio.sleep(5 * (2**retry))
+        except Exception as e:
+            retry += 1
+            if retry >= max_retries:
+                return f"[ERROR]: Audio transcription failed: {e}\nNote: Files from the sandbox are not available. You should use the local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URLs are not supported."
+            await asyncio.sleep(5 * (2**retry))
+
+    return transcription.text
+
+
+if __name__ == "__main__":
+    mcp.run(transport="stdio")
\ No newline at end of file
diff --git a/src/tool/mcp_servers/reasoning_mcp_server_os.py b/src/tool/mcp_servers/reasoning_mcp_server_os.py
new file mode 100644
index 0000000..3e886a7
--- /dev/null
+++ b/src/tool/mcp_servers/reasoning_mcp_server_os.py
@@ -0,0 +1,103 @@
+# Copyright 2025 Miromind.ai
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import random
+import time
+
+import requests
+from fastmcp import FastMCP
+
+logger = logging.getLogger("miroflow")
+
+REASONING_API_KEY = os.environ.get("REASONING_API_KEY")
+REASONING_BASE_URL = os.environ.get("REASONING_BASE_URL")
+REASONING_MODEL_NAME = os.environ.get("REASONING_MODEL_NAME")
+
+# Initialize FastMCP server
+mcp = FastMCP("reasoning-mcp-server-os")
+
+# Retry configuration
+MAX_RETRIES = 10
+BACKOFF_BASE = 1.0  # initial backoff in seconds
+BACKOFF_MAX = 30.0  # maximum backoff in seconds
+
+
+def post_with_retry(url, json, headers):
+    """Send POST request with retry and exponential backoff.
+    Returns the response object on success, otherwise None."""
+    for attempt in range(1, MAX_RETRIES + 1):
+        try:
+            resp = requests.post(url, json=json, headers=headers, timeout=600)
+            if resp.status_code == 200:
+                return resp
+            else:
+                logger.warning(
+                    f"HTTP {resp.status_code} on attempt {attempt}: {resp.text[:200]}"
+                )
+        except requests.exceptions.RequestException as e:
+            logger.warning(f"Request failed on attempt {attempt}: {e}")
+
+        # Backoff before next retry
+        if attempt < MAX_RETRIES:
+            sleep_time = min(BACKOFF_BASE * (2 ** (attempt - 1)), BACKOFF_MAX)
+            # Add jitter to avoid thundering herd
+            sleep_time *= 0.8 + 0.4 * random.random()
+            logger.info(f"Retrying in {sleep_time:.1f}s...")
+            time.sleep(sleep_time)
+
+    logger.warning(f"All {MAX_RETRIES} retries failed for {url}")
+    return None
+
+
+@mcp.tool()
+async def reasoning(question: str) -> str:
+    """Use this tool to solve hard math problems, puzzles, riddles, and IQ-test questions that require extensive chain-of-thought effort.
+    DO NOT use this tool for simple and obvious questions.
+
+    Args:
+        question: The hard question.
+
+    Returns:
+        The answer to the question.
+    """
+    payload = {
+        "model": REASONING_MODEL_NAME,
+        "messages": [{"role": "user", "content": question}],
+        "temperature": 0.6,
+        "top_p": 0.95,
+    }
+    headers = {
+        "Authorization": f"Bearer {REASONING_API_KEY}",
+        "Content-Type": "application/json",
+    }
+
+    response = post_with_retry(REASONING_BASE_URL, json=payload, headers=headers)
+    if response is None:
+        return "Reasoning service unavailable. Please try again later."
+
+    json_response = response.json()
+    try:
+        content = json_response["choices"][0]["message"]["content"]
+        # Strip the thinking block emitted by thinking-mode models
+        if "</think>" in content:
+            content = content.split("</think>", 1)[1].strip()
+        return content
+    except Exception:
+        # Some backends return only the thinking content
+        logger.info("Reasoning Error: only thinking content is returned")
+        return json_response["choices"][0]["message"]["reasoning_content"]
+
+
+if __name__ == "__main__":
+    mcp.run(transport="stdio")
\ No newline at end of file
diff --git a/src/tool/mcp_servers/vision_mcp_server_os.py b/src/tool/mcp_servers/vision_mcp_server_os.py
new file mode 100644
index 0000000..786e5af
--- /dev/null
+++ b/src/tool/mcp_servers/vision_mcp_server_os.py
@@ -0,0 +1,112 @@
+# Copyright 2025 Miromind.ai
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import os
+
+import aiohttp
+import requests
+from fastmcp import FastMCP
+
+VISION_API_KEY = os.environ.get("VISION_API_KEY")
+VISION_BASE_URL = os.environ.get("VISION_BASE_URL")
+VISION_MODEL_NAME = os.environ.get("VISION_MODEL_NAME")
+
+# Initialize FastMCP server
+mcp = FastMCP("vision-mcp-server-os")
+
+
+def guess_mime_media_type_from_extension(file_path: str) -> str:
+    """Guess the MIME type based on the file extension."""
+    _, ext = os.path.splitext(file_path)
+    ext = ext.lower()
+    if ext in [".jpg", ".jpeg"]:
+        return "image/jpeg"
+    elif ext == ".png":
+        return "image/png"
+    elif ext == ".gif":
+        return "image/gif"
+    else:
+        return "image/jpeg"  # Default to JPEG if unknown
+
+
+@mcp.tool()
+async def visual_question_answering(image_path_or_url: str, question: str) -> str:
+    """Ask a question about an image and get the answer from a vision-language model.
+
+    Args:
+        image_path_or_url: The local path of the image file or its URL.
+        question: The question to ask about the image.
+
+    Returns:
+        The answer to the image-related question.
+    """
+    messages_for_llm = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": None}},
+                {
+                    "type": "text",
+                    "text": question,
+                },
+            ],
+        }
+    ]
+
+    headers = {
+        "Authorization": f"Bearer {VISION_API_KEY}",
+        "Content-Type": "application/json",
+    }
+
+    try:
+        if os.path.exists(image_path_or_url):  # Check if the file exists locally
+            with open(image_path_or_url, "rb") as image_file:
+                image_data = base64.b64encode(image_file.read()).decode("utf-8")
+            mime_type = guess_mime_media_type_from_extension(image_path_or_url)
+            messages_for_llm[0]["content"][0]["image_url"]["url"] = (
+                f"data:{mime_type};base64,{image_data}"
+            )
+        elif image_path_or_url.startswith(("http://", "https://")):
+            async with aiohttp.ClientSession() as session:
+                async with session.get(image_path_or_url) as resp:
+                    if resp.status == 200:
+                        image_bytes = await resp.read()
+                        mime_type = resp.headers.get(
+                            "Content-Type", "image/png"
+                        )  # fallback MIME type
+                        image_data = base64.b64encode(image_bytes).decode("utf-8")
+                        messages_for_llm[0]["content"][0]["image_url"]["url"] = (
+                            f"data:{mime_type};base64,{image_data}"
+                        )
+                    else:
+                        return f"Failed to fetch image from URL: {image_path_or_url}"
+        else:
+            # Pass anything else (e.g., a data: URI) through as-is
+            messages_for_llm[0]["content"][0]["image_url"]["url"] = image_path_or_url
+
+        payload = {"model": VISION_MODEL_NAME, "messages": messages_for_llm}
+
+        response = requests.post(
+            VISION_BASE_URL, json=payload, headers=headers, timeout=600
+        )
+
+    except Exception as e:
+        return f"Error: {e}"
+
+    try:
+        return response.json()["choices"][0]["message"]["content"]
+    except (KeyError, IndexError, TypeError):
+        # Surface the raw payload if it does not match the expected schema
+        return str(response.json())
+
+
+if __name__ == "__main__":
+    mcp.run(transport="stdio")
\ No newline at end of file