From d3eda6addbe8a2c538c2ec735ee42444d92a0ca3 Mon Sep 17 00:00:00 2001
From: Yue Deng
Date: Mon, 13 Oct 2025 15:42:13 +0800
Subject: [PATCH 1/2] add miroapi support

---
 docs/mkdocs/docs/miro_api.md                  |  24 +++
 docs/mkdocs/mkdocs.yml                        |   1 +
 .../mcp_servers/miroapi_serper_mcp_server.py  | 174 ++++++++++++++++++
 src/tool/mcp_servers/searching_mcp_server.py  |  46 ++++-
 src/tool/mcp_servers/utils/smart_request.py   |  30 ++--
 src/tool/mcp_servers/utils/url_unquote.py     | 117 ++++++++++++
 6 files changed, 374 insertions(+), 18 deletions(-)
 create mode 100644 docs/mkdocs/docs/miro_api.md
 create mode 100644 src/tool/mcp_servers/miroapi_serper_mcp_server.py
 create mode 100644 src/tool/mcp_servers/utils/url_unquote.py

diff --git a/docs/mkdocs/docs/miro_api.md b/docs/mkdocs/docs/miro_api.md
new file mode 100644
index 0000000..d01e477
--- /dev/null
+++ b/docs/mkdocs/docs/miro_api.md
@@ -0,0 +1,24 @@
+# MiroAPI
+
+!!! warning "Preview Documentation"
+    This service is currently in preview and limited to internal access. Public release will follow once it is production-ready.
+
+## Overview
+MiroAPI provides an internal caching layer for Serper Search and Jina Scrape to reduce costs, speed up development, and enable reproducible "go-back-in-time" sandbox runs by serving recorded results when available.
+
+### Step 1: Apply for a MiroAPI key
+Request a MiroAPI key through the internal portal.
+
+### Step 2: Configure .env
+```
+# API for Google Search (recommended)
+SERPER_API_KEY="svc-miro-api01-replace-with-your-key"
+SERPER_BASE_URL="https://miro-api.miromind.site/serper"
+
+# API for Web Scraping (recommended)
+JINA_API_KEY="svc-miro-api01-replace-with-your-key"
+JINA_BASE_URL="https://miro-api.miromind.site/jina"
+```
+
+
+
\ No newline at end of file
diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
index 7f9f784..60e0ff3 100644
--- a/docs/mkdocs/mkdocs.yml
+++ b/docs/mkdocs/mkdocs.yml
@@ -74,6 +74,7 @@ nav:
     - tool-python: tool_python.md
   - Advanced Features:
     - E2B Advanced Features: e2b_advanced_features.md
+    - MiroAPI: miro_api.md
   - Add New Tools: contribute_tools.md
   - LLM Clients:
diff --git a/src/tool/mcp_servers/miroapi_serper_mcp_server.py b/src/tool/mcp_servers/miroapi_serper_mcp_server.py
new file mode 100644
index 0000000..95fd25b
--- /dev/null
+++ b/src/tool/mcp_servers/miroapi_serper_mcp_server.py
@@ -0,0 +1,174 @@
+# Copyright 2025 Miromind.ai
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
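+#
+# This module is intended as a drop-in replacement for the `google_search`
+# tool of the upstream `serper-search-scrape-mcp-server` (see
+# searching_mcp_server.py), reading SERPER_BASE_URL from the environment so
+# that requests can be routed through the MiroAPI caching proxy instead of
+# going directly to https://google.serper.dev.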
+
+"""
+adapted from
+https://github.com/MiroMindAI/MiroRL/blob/5073693549ffe05a157a1886e87650ef3be6606e/mirorl/tools/serper_search.py#L1
+"""
+
+import os
+from typing import Any, Dict
+
+import requests
+from mcp.server.fastmcp import FastMCP
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+
+from .utils.url_unquote import decode_http_urls_in_dict
+
+SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev")
+SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")
+
+
+# Initialize FastMCP server
+mcp = FastMCP("serper-mcp-server")
+
+
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    retry=retry_if_exception_type(
+        (requests.ConnectionError, requests.Timeout, requests.HTTPError)
+    ),
+)
+def make_serper_request(
+    payload: Dict[str, Any], headers: Dict[str, str]
+) -> requests.Response:
+    """Make an HTTP request to the Serper API with retry logic."""
+    response = requests.post(
+        f"{SERPER_BASE_URL}/search", json=payload, headers=headers
+    )
+    response.raise_for_status()
+    return response
+
+
+def _is_huggingface_dataset_or_space_url(url: str) -> bool:
+    """
+    Check if the URL is a HuggingFace dataset or space URL.
+
+    :param url: The URL to check
+    :return: True if it's a HuggingFace dataset or space URL, False otherwise
+    """
+    if not url:
+        return False
+    return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url
+
+
+@mcp.tool()
+def google_search(
+    q: str,
+    gl: str = "us",
+    hl: str = "en",
+    location: str | None = None,
+    num: int | None = None,
+    tbs: str | None = None,
+    page: int | None = None,
+    autocorrect: bool | None = None,
+) -> Dict[str, Any]:
+    """
+    Tool to perform web searches via the Serper API and retrieve rich results.
+
+    It can retrieve organic search results, "people also ask" questions,
+    related searches, and knowledge graph data.
+
+    Args:
+        q: Search query string
+        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')
+        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')
+        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
+        num: Number of results to return (default: 10)
+        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week,
+            'qdr:m' for past month, 'qdr:y' for past year)
+        page: Page number of results to return (default: 1)
+        autocorrect: Whether to autocorrect spelling in the query
+
+    Returns:
+        Dictionary containing search results and metadata.
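+
+    Example (illustrative query only):
+        google_search("agent benchmarks", gl="us", hl="en", num=10, tbs="qdr:y")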
+ """ + # Check for API key + if not SERPER_API_KEY: + return { + "success": False, + "error": "SERPER_API_KEY environment variable not set", + "results": [], + } + + # Validate required parameter + if not q or not q.strip(): + return { + "success": False, + "error": "Search query 'q' is required and cannot be empty", + "results": [], + } + + try: + # Build payload with all supported parameters + payload: dict[str, Any] = { + "q": q.strip(), + "gl": gl, + "hl": hl, + } + + # Add optional parameters if provided + if location: + payload["location"] = location + if num is not None: + payload["num"] = num + else: + payload["num"] = 10 # Default + if tbs: + payload["tbs"] = tbs + if page is not None: + payload["page"] = page + if autocorrect is not None: + payload["autocorrect"] = autocorrect + + # Set up headers + headers = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"} + + # Make the API request + response = make_serper_request(payload, headers) + data = response.json() + + # filter out HuggingFace dataset or space urls + organic_results = [] + if "organic" in data: + for item in data["organic"]: + if _is_huggingface_dataset_or_space_url(item.get("link", "")): + continue + organic_results.append(item) + + # Keep all original fields, but overwrite "organic" + response_data = dict(data) + response_data["organic"] = organic_results + response_data = decode_http_urls_in_dict(response_data) + + return response_data + + except Exception as e: + return {"success": False, "error": f"Unexpected error: {str(e)}", "results": []} + + +if __name__ == "__main__": + mcp.run() diff --git a/src/tool/mcp_servers/searching_mcp_server.py b/src/tool/mcp_servers/searching_mcp_server.py index c8566d5..4187e4a 100644 --- a/src/tool/mcp_servers/searching_mcp_server.py +++ b/src/tool/mcp_servers/searching_mcp_server.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import sys import os import json import requests @@ -17,7 +18,11 @@ SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "") +SERPER_BASE_URL = os.environ.get("SERPER_BASE_URL", "https://google.serper.dev") JINA_API_KEY = os.environ.get("JINA_API_KEY", "") +JINA_BASE_URL = os.environ.get("JINA_BASE_URL", "https://r.jina.ai") + +IS_MIRO_API = True if "miro" in SERPER_BASE_URL or "miro" in JINA_BASE_URL else False # Google search result filtering environment variables REMOVE_SNIPPETS = os.environ.get("REMOVE_SNIPPETS", "").lower() in ("true", "1", "yes") @@ -122,11 +127,18 @@ async def google_search( arguments["location"] = location if tbs: arguments["tbs"] = tbs - server_params = StdioServerParameters( - command="npx", - args=["-y", "serper-search-scrape-mcp-server"], - env={"SERPER_API_KEY": SERPER_API_KEY}, - ) + if IS_MIRO_API: + server_params = StdioServerParameters( + command=sys.executable, + args=["-m", "src.tool.mcp_servers.miroapi_serper_mcp_server"], + env={"SERPER_API_KEY": SERPER_API_KEY, "SERPER_BASE_URL": SERPER_BASE_URL}, + ) + else: + server_params = StdioServerParameters( + command="npx", + args=["-y", "serper-search-scrape-mcp-server"], + env={"SERPER_API_KEY": SERPER_API_KEY}, + ) result_content = "" retry_count = 0 max_retries = 5 @@ -348,7 +360,12 @@ async def search_wiki_revision( content = await smart_request( url=base_url, params=params, - env={"SERPER_API_KEY": SERPER_API_KEY, "JINA_API_KEY": JINA_API_KEY}, + env={ + "SERPER_API_KEY": SERPER_API_KEY, + "JINA_API_KEY": JINA_API_KEY, + "SERPER_BASE_URL": SERPER_BASE_URL, + "JINA_BASE_URL": JINA_BASE_URL, + }, ) data = request_to_json(content) @@ -527,6 +544,8 
         env={
             "SERPER_API_KEY": SERPER_API_KEY,
             "JINA_API_KEY": JINA_API_KEY,
+            "SERPER_BASE_URL": SERPER_BASE_URL,
+            "JINA_BASE_URL": JINA_BASE_URL,
         },
     )
     data = request_to_json(content)
@@ -585,7 +604,12 @@ async def search_archived_webpage(url: str, year: int, month: int, day: int) ->
     content = await smart_request(
         url=base_url,
         params={"url": url},
-        env={"SERPER_API_KEY": SERPER_API_KEY, "JINA_API_KEY": JINA_API_KEY},
+        env={
+            "SERPER_API_KEY": SERPER_API_KEY,
+            "JINA_API_KEY": JINA_API_KEY,
+            "SERPER_BASE_URL": SERPER_BASE_URL,
+            "JINA_BASE_URL": JINA_BASE_URL,
+        },
     )
     data = request_to_json(content)
     if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
@@ -664,7 +688,13 @@ async def scrape_website(url: str) -> str:
     """
     # TODO: Long Content Handling
     return await smart_request(
-        url, env={"SERPER_API_KEY": SERPER_API_KEY, "JINA_API_KEY": JINA_API_KEY}
+        url,
+        env={
+            "SERPER_API_KEY": SERPER_API_KEY,
+            "JINA_API_KEY": JINA_API_KEY,
+            "SERPER_BASE_URL": SERPER_BASE_URL,
+            "JINA_BASE_URL": JINA_BASE_URL,
+        },
     )
 
diff --git a/src/tool/mcp_servers/utils/smart_request.py b/src/tool/mcp_servers/utils/smart_request.py
index 0696e29..927e533 100644
--- a/src/tool/mcp_servers/utils/smart_request.py
+++ b/src/tool/mcp_servers/utils/smart_request.py
@@ -13,6 +13,8 @@
 import urllib.parse
 from markitdown import MarkItDown
 import io
+from typing import Optional
+import os
 
 
 def request_to_json(content: str) -> dict:
@@ -30,6 +32,7 @@ async def smart_request(url: str, params: dict = None, env: dict = None) -> str
     if env:
         JINA_API_KEY = env.get("JINA_API_KEY", "")
         SERPER_API_KEY = env.get("SERPER_API_KEY", "")
+        JINA_BASE_URL = env.get("JINA_BASE_URL", "https://r.jina.ai")
     else:
         JINA_API_KEY = ""
         SERPER_API_KEY = ""
@@ -37,6 +40,8 @@
     if JINA_API_KEY == "" and SERPER_API_KEY == "":
         return "[ERROR]: JINA_API_KEY and SERPER_API_KEY are not set, smart_request is not available."
 
+    IS_MIRO_API = "miro" in JINA_BASE_URL
+
     # Auto-add https:// if no protocol is specified
     protocol_hint = ""
     if not url.startswith(("http://", "https://")):
@@ -65,7 +70,7 @@
     ):
         youtube_hint = "[NOTE]: If you need to get information about its visual or audio content, please use tool 'visual_audio_youtube_analyzing' instead. This tool may not be able to provide visual and audio content of a YouTube Video.\n\n"
 
-    content, jina_err = await scrape_jina(url, JINA_API_KEY)
+    content, jina_err = await scrape_jina(url, JINA_API_KEY, JINA_BASE_URL)
     if jina_err:
         error_msg += f"Failed to get content from Jina.ai: {jina_err}\n"
     elif content is None or content.strip() == "":
@@ -73,13 +78,16 @@
     else:
         return protocol_hint + youtube_hint + content
 
-    content, serper_err = await scrape_serper(url, SERPER_API_KEY)
-    if serper_err:
-        error_msg += f"Failed to get content from SERPER: {serper_err}\n"
-    elif content is None or content.strip() == "":
-        error_msg += "No content got from SERPER.\n"
-    else:
-        return protocol_hint + youtube_hint + content
+    if not IS_MIRO_API:
+        # Try Serper API for scraping if not using Miro API
+        # (Miro API does not support caching Serper scraping results)
+        content, serper_err = await scrape_serper(url, SERPER_API_KEY)
+        if serper_err:
+            error_msg += f"Failed to get content from SERPER: {serper_err}\n"
+        elif content is None or content.strip() == "":
+            error_msg += "No content got from SERPER.\n"
+        else:
+            return protocol_hint + youtube_hint + content
 
     content, request_err = scrape_request(url)
     if request_err:
@@ -99,7 +107,9 @@
             await asyncio.sleep(4**retry_count)
 
 
-async def scrape_jina(url: str, jina_api_key: str) -> tuple[str, str]:
+async def scrape_jina(
+    url: str, jina_api_key: str, jina_base_url: str
+) -> tuple[str, str]:
     # Use Jina.ai reader API to convert URL to LLM-friendly text
     if jina_api_key == "":
         return (
@@ -116,7 +126,7 @@
         "X-With-Shadow-Dom": "true",
     }
 
-    jina_url = f"https://r.jina.ai/{url}"
+    jina_url = f"{jina_base_url}/{url}"
     try:
         response = requests.get(jina_url, headers=jina_headers, timeout=120)
         if response.status_code == 422:
diff --git a/src/tool/mcp_servers/utils/url_unquote.py b/src/tool/mcp_servers/utils/url_unquote.py
new file mode 100644
index 0000000..c6cdb31
--- /dev/null
+++ b/src/tool/mcp_servers/utils/url_unquote.py
@@ -0,0 +1,117 @@
+import re
+from urllib.parse import unquote
+
+from markdown_it import MarkdownIt
+
+# Reserved character encodings to be protected -> temporary placeholders
+PROTECT = {
+    "%2F": "__SLASH__",
+    "%2f": "__SLASH__",
+    "%3F": "__QMARK__",
+    "%3f": "__QMARK__",
+    "%23": "__HASH__",
+    "%26": "__AMP__",
+    "%3D": "__EQUAL__",
+    "%20": "__SPACE__",
+    "%2B": "__PLUS__",
+    "%25": "__PERCENT__",
+}
+
+# Reverse mapping: placeholder -> original %xx (use uppercase for uniform output)
+RESTORE = {v: k.upper() for k, v in PROTECT.items()}
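+
+# Illustrative example: safe_unquote("a%2Fb%20c%E4%B8%AD") -> "a%2Fb%20c中".
+# Reserved encodings such as %2F and %20 survive as-is, while other
+# percent-escapes (here, the UTF-8 bytes for "中") are decoded.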
+
+
+def safe_unquote(s: str, encoding="utf-8", errors="ignore") -> str:
+    # 1. Replace reserved encodings with temporary placeholders
+    for k, v in PROTECT.items():
+        s = s.replace(k, v)
+    # 2. Decode (only affects unprotected parts, e.g., Chinese characters)
+    s = unquote(s, encoding=encoding, errors=errors)
+    # 3. Replace placeholders back to the original %xx
+    for v, k in RESTORE.items():
+        s = s.replace(v, k)
+    return s
+
+
+def decode_http_urls_in_dict(data):
+    """
+    Traverse all values in the data structure:
+    - If it's a string containing percent-encoded characters (e.g., a URL), apply safe_unquote
+    - If it's a list, recursively process each element
+    - If it's a dict, recursively process each value
+    - Other types remain unchanged
+    """
+    if isinstance(data, str):
+        if "%" in data:
+            return safe_unquote(data)
+        else:
+            return data
+    elif isinstance(data, list):
+        return [decode_http_urls_in_dict(item) for item in data]
+    elif isinstance(data, dict):
+        return {key: decode_http_urls_in_dict(value) for key, value in data.items()}
+    else:
+        return data
+
+
+md = MarkdownIt("commonmark")
+
+
+def strip_markdown_links(markdown: str) -> str:
+    tokens = md.parse(markdown)
+
+    def render(ts):
+        out = []
+        for tok in ts:
+            t = tok.type
+
+            # 1) Links: drop the wrapper, keep inner text (children will be rendered)
+            if t == "link_open" or t == "link_close":
+                continue
+
+            # 2) Images: skip the entire image block
+            if t == "image":
+                continue
+
+            # 3) Line breaks and block closings
+            if t == "softbreak":  # inline single line break
+                out.append("\n")
+                continue
+            if t == "hardbreak":  # explicit line break (two spaces + newline in Markdown)
+                out.append("\n")
+                continue
+            if t in ("paragraph_close", "heading_close", "blockquote_close"):
+                out.append("\n\n")
+                continue
+            if t in ("list_item_close", "bullet_list_close", "ordered_list_close"):
+                out.append("\n")
+                continue
+            if t == "hr":
+                out.append("\n\n")
+                continue
+
+            # 4) Inline or nested tokens
+            if tok.children:
+                out.append(render(tok.children))
+                continue
+
+            # Preserve inline code style
+            if t == "code_inline":
+                out.append(f"`{tok.content}`")
+            else:
+                out.append(tok.content or "")
+
+        return "".join(out)
+
+    text = render(tokens)
+
+    # Normalize excessive blank lines (avoid more than 2 consecutive newlines)
+    text = re.sub(r"\n{3,}", "\n\n", text).rstrip() + "\n"
+
+    return text.strip()

From b3cefe532f4799223eaa3d8d16b612cd40d51db5 Mon Sep 17 00:00:00 2001
From: Yue Deng
Date: Mon, 13 Oct 2025 15:46:14 +0800
Subject: [PATCH 2/2] lint code

---
 src/tool/mcp_servers/utils/smart_request.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/tool/mcp_servers/utils/smart_request.py b/src/tool/mcp_servers/utils/smart_request.py
index 927e533..728856a 100644
--- a/src/tool/mcp_servers/utils/smart_request.py
+++ b/src/tool/mcp_servers/utils/smart_request.py
@@ -13,8 +13,6 @@
 import urllib.parse
 from markitdown import MarkItDown
 import io
-from typing import Optional
-import os
 
 
 def request_to_json(content: str) -> dict: