Skip to content

Commit a92f270

Browse files
authored
Merge pull request #13 from ScrapeGraphAI/fix/json-schema-required-field
Enhance output schema documentation and validation in server.py
2 parents 3579efb + 633f801 commit a92f270

File tree

2 files changed

+881
-857
lines changed

2 files changed

+881
-857
lines changed

src/scrapegraph_mcp/server.py

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -837,8 +837,9 @@ def parameter_reference_guide() -> str:
837837
- **Type**: `Optional[Union[str, Dict[str, Any]]]`
838838
- **Purpose**: Define expected output structure
839839
- **Formats**:
840-
- Dictionary: `{'type': 'object', 'properties': {'title': {'type': 'string'}}}`
841-
- JSON string: `'{"type": "object", "properties": {"name": {"type": "string"}}}'`
840+
- Dictionary: `{'type': 'object', 'properties': {'title': {'type': 'string'}}, 'required': []}`
841+
- JSON string: `'{"type": "object", "properties": {"name": {"type": "string"}}, "required": []}'`
842+
- **IMPORTANT**: Should include a `"required"` field (use an empty array `[]` if no fields are required); a missing top-level `"required"` is added automatically as `[]`
842843
- **Examples**:
843844
```json
844845
{
@@ -852,16 +853,20 @@ def parameter_reference_guide() -> str:
852853
"name": {"type": "string"},
853854
"price": {"type": "number"},
854855
"available": {"type": "boolean"}
855-
}
856+
},
857+
"required": []
856858
}
857859
}
858-
}
860+
},
861+
"required": []
859862
}
860863
```
861864
- **Best Practices**:
865+
- Always include the `"required"` field (use `[]` if no fields are required)
862866
- Use for complex, structured extractions
863867
- Define clear data types
864868
- Consider nested structures for complex data
869+
- Note: If `"required"` field is missing, it will be automatically added as `[]`
865870
866871
---
867872
@@ -1432,8 +1437,7 @@ def smartscraper(
14321437
Extract structured data from a webpage, HTML, or markdown using AI-powered extraction.
14331438
14341439
This tool uses advanced AI to understand your natural language prompt and extract specific
1435-
structured data from web content. Supports three input modes: URL scraping, local HTML processing,
1436-
or local markdown processing. Ideal for extracting product information, contact details,
1440+
structured data from web content. Supports three input modes: URL scraping, local HTML processing, or local markdown processing. Ideal for extracting product information, contact details,
14371441
article metadata, or any structured content. Costs 10 credits per page. Read-only operation.
14381442
14391443
Args:
@@ -1480,10 +1484,13 @@ def smartscraper(
14801484
- Can be provided as a dictionary or JSON string
14811485
- Helps ensure consistent, structured output format
14821486
- Optional but recommended for complex extractions
1487+
- IMPORTANT: Should include a "required" field (use an empty array [] if no fields are required); a missing top-level "required" is added automatically as []
14831488
- Examples:
1484-
* As dict: {'type': 'object', 'properties': {'title': {'type': 'string'}, 'price': {'type': 'number'}}}
1485-
* As JSON string: '{"type": "object", "properties": {"name": {"type": "string"}}}'
1486-
* For arrays: {'type': 'array', 'items': {'type': 'object', 'properties': {...}}}
1489+
* As dict: {'type': 'object', 'properties': {'title': {'type': 'string'}, 'price': {'type': 'number'}}, 'required': []}
1490+
* As JSON string: '{"type": "object", "properties": {"name": {"type": "string"}}, "required": []}'
1491+
* For arrays: {'type': 'array', 'items': {'type': 'object', 'properties': {...}, 'required': []}, 'required': []}
1492+
* With required fields: {'type': 'object', 'properties': {'name': {'type': 'string'}, 'email': {'type': 'string'}}, 'required': ['name', 'email']}
1493+
- Note: If "required" field is missing, it will be automatically added as an empty array []
14871494
- Default: None (AI will infer structure from prompt)
14881495
14891496
number_of_scrolls (Optional[int]): Number of infinite scrolls to perform before scraping.
@@ -1564,6 +1571,11 @@ def smartscraper(
15641571
except json.JSONDecodeError as e:
15651572
return {"error": f"Invalid JSON for output_schema: {str(e)}"}
15661573

1574+
# If an output_schema was provided, make sure it has a top-level 'required' key (default: empty list)
1575+
if normalized_schema is not None:
1576+
if "required" not in normalized_schema:
1577+
normalized_schema["required"] = []
1578+
15671579
return client.smartscraper(
15681580
user_prompt=user_prompt,
15691581
website_url=website_url,
@@ -2099,11 +2111,14 @@ def agentic_scrapper(
20992111
- Can be provided as a dictionary or JSON string
21002112
- Defines the format and structure of the final extracted data
21012113
- Helps ensure consistent, predictable output format
2114+
- IMPORTANT: Should include a "required" field (use an empty array [] if no fields are required); a missing top-level "required" is added automatically as []
21022115
- Examples:
2103-
* Simple object: {'type': 'object', 'properties': {'title': {'type': 'string'}, 'price': {'type': 'number'}}}
2104-
* Array of objects: {'type': 'array', 'items': {'type': 'object', 'properties': {'name': {'type': 'string'}, 'value': {'type': 'string'}}}}
2105-
* Complex nested: {'type': 'object', 'properties': {'products': {'type': 'array', 'items': {...}}, 'total_count': {'type': 'number'}}}
2106-
* As JSON string: '{"type": "object", "properties": {"results": {"type": "array"}}}'
2116+
* Simple object: {'type': 'object', 'properties': {'title': {'type': 'string'}, 'price': {'type': 'number'}}, 'required': []}
2117+
* Array of objects: {'type': 'array', 'items': {'type': 'object', 'properties': {'name': {'type': 'string'}, 'value': {'type': 'string'}}, 'required': []}, 'required': []}
2118+
* Complex nested: {'type': 'object', 'properties': {'products': {'type': 'array', 'items': {...}}, 'total_count': {'type': 'number'}}, 'required': []}
2119+
* As JSON string: '{"type": "object", "properties": {"results": {"type": "array"}}, "required": []}'
2120+
* With required fields: {'type': 'object', 'properties': {'id': {'type': 'string'}, 'name': {'type': 'string'}}, 'required': ['id']}
2121+
- Note: If "required" field is missing, it will be automatically added as an empty array []
21072122
- Default: None (agent will infer structure from prompt and steps)
21082123
21092124
steps (Optional[Union[str, List[str]]]): Step-by-step instructions for the agent.
@@ -2245,6 +2260,11 @@ def agentic_scrapper(
22452260
except json.JSONDecodeError as e:
22462261
return {"error": f"Invalid JSON for output_schema: {str(e)}"}
22472262

2263+
# If an output_schema was provided, make sure it has a top-level 'required' key (default: empty list)
2264+
if normalized_schema is not None:
2265+
if "required" not in normalized_schema:
2266+
normalized_schema["required"] = []
2267+
22482268
try:
22492269
api_key = get_api_key(ctx)
22502270
client = ScapeGraphClient(api_key)
@@ -2280,8 +2300,10 @@ def create_server() -> FastMCP:
22802300
def main() -> None:
22812301
"""Run the ScapeGraph MCP server."""
22822302
try:
2283-
logger.info("Starting ScapeGraph MCP server!")
2284-
print("Starting ScapeGraph MCP server!")
2303+
# Resolve this module's absolute path so the startup log shows which copy of the code is running (nothing is verified here)
2304+
server_path = os.path.abspath(__file__)
2305+
logger.info(f"Starting ScapeGraph MCP server from local codebase: {server_path}")
2306+
print(f"Starting ScapeGraph MCP server (local codebase)")
22852307
mcp.run(transport="stdio")
22862308
except Exception as e:
22872309
logger.error(f"Failed to start MCP server: {e}")

0 commit comments

Comments
 (0)