diff --git a/README.md b/README.md index 057d8ed..8035078 100644 --- a/README.md +++ b/README.md @@ -7,12 +7,11 @@
-[![DOCS](https://img.shields.io/badge/Documentation-4285F4?style=for-the-badge&logo=gitbook&logoColor=white)](https://miromindai.github.io/MiroFlow/) [![DEMO](https://img.shields.io/badge/Demo-FFB300?style=for-the-badge&logo=airplayvideo&logoColor=white)](https://dr.miromind.ai/) [![MODELS](https://img.shields.io/badge/Models-5EDDD2?style=for-the-badge&logo=huggingface&logoColor=ffffff&labelColor)](https://huggingface.co/collections/miromind-ai/mirothinker-v02-68af084a18035f57b17cd902) [![DATA](https://img.shields.io/badge/Data-0040A1?style=for-the-badge&logo=huggingface&logoColor=ffffff&labelColor)](https://huggingface.co/datasets/miromind-ai/MiroVerse-v0.1) - [![BLOG](https://img.shields.io/badge/Blog-4285F4?style=for-the-badge&logo=google-chrome&logoColor=white)](https://miromind.ai/blog/miroflow) + [![GITHUB](https://img.shields.io/badge/Github-24292F?style=for-the-badge&logo=github&logoColor=white)](https://github.com/MiroMindAI) [![DISCORD](https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.com/invite/GPqEnkzQZd) [![WeChat](https://img.shields.io/badge/WeChat-07C160?style=for-the-badge&logo=wechat&logoColor=white)](https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/wechat.png) @@ -22,7 +21,9 @@
-### πŸš€ [Try our Demo!](https://dr.miromind.ai/)|[δΈ­ζ–‡](README_zh.md)|[ζ—₯本θͺž](README_ja.md) +## πŸ“š **[READ THE DOCUMENTATION](https://miromindai.github.io/MiroFlow/)** + +### πŸš€ [Try Demo](https://dr.miromind.ai/) | [δΈ­ζ–‡](README_zh.md) | [ζ—₯本θͺž](README_ja.md)
diff --git a/config/agent_browsecomp-en_claude37sonnet.yaml b/config/agent_browsecomp-en_claude37sonnet.yaml new file mode 100644 index 0000000..baa3e15 --- /dev/null +++ b/config/agent_browsecomp-en_claude37sonnet.yaml @@ -0,0 +1,78 @@ +defaults: + - benchmark: browsecomp-en + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reasoning + + max_turns: 50 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: true + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + final_answer_extraction: true + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: + agent-worker: + prompt_class: SubAgentWorkerPrompt + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + 
keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + + max_turns: 50 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + diff --git a/config/agent_mirothinker.yaml b/config/agent_browsecomp-en_mirothinker.yaml similarity index 75% rename from config/agent_mirothinker.yaml rename to config/agent_browsecomp-en_mirothinker.yaml index 709eeed..63bca34 100644 --- a/config/agent_mirothinker.yaml +++ b/config/agent_browsecomp-en_mirothinker.yaml @@ -1,35 +1,37 @@ defaults: - - benchmark: gaia-validation + - benchmark: browsecomp-en - override hydra/job_logging: none - _self_ # Allow defining variables at the top of this file main_agent: - prompt_class: MainAgentPromptBoxedAnswer + prompt_class: MainAgentPrompt_GAIA llm: provider_class: "MiroThinkerSGLangClient" - model_name: "MODEL_NAME" + model_name: "DUMMY_MODEL_NAME" async_client: true - temperature: 0.6 + temperature: 0.3 top_p: 0.95 min_p: 0.0 top_k: -1 - max_tokens: 8192 + max_tokens: 4096 oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}" oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}" keep_tool_result: -1 oai_tool_thinking: false - tool_config: [] + tool_config: + - tool-reasoning - max_turns: -1 # Maximum number of turns for main agent execution + max_turns: 50 # Maximum number of turns for main agent execution max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn input_process: hint_generation: false hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: - final_answer_extraction: false + final_answer_extraction: true final_answer_llm_base_url: 
"${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction @@ -43,25 +45,31 @@ sub_agents: prompt_class: SubAgentWorkerPrompt llm: provider_class: "MiroThinkerSGLangClient" - model_name: "MODEL_NAME" + model_name: "DUMMY_MODEL_NAME" async_client: true - temperature: 0.6 - top_p: 0.95 + temperature: 0.3 + top_p: 1.0 min_p: 0.0 top_k: -1 - max_tokens: 8192 + max_tokens: 4096 oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}" oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}" keep_tool_result: -1 oai_tool_thinking: false tool_config: + - tool-searching + - tool-image-video - tool-reading + - tool-code + - tool-audio - max_turns: -1 # Maximum number of turns for main agent execution + max_turns: 50 # Maximum number of turns for main agent execution max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn # Can define some top-level or default parameters here output_dir: logs/ -data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored \ No newline at end of file +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + diff --git a/config/agent_finsearchcomp_claude37sonnet.yaml b/config/agent_finsearchcomp_claude37sonnet.yaml new file mode 100644 index 0000000..70cc22b --- /dev/null +++ b/config/agent_finsearchcomp_claude37sonnet.yaml @@ -0,0 +1,77 @@ +defaults: + - benchmark: finsearchcomp + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.6 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + 
openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reasoning + + max_turns: 20 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: true + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + final_answer_extraction: true + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: + agent-worker: + prompt_class: SubAgentWorkerPrompt + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.6 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + + max_turns: 20 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + diff --git a/config/agent_finsearchcomp.yaml b/config/agent_finsearchcomp_mirothinker.yaml similarity index 96% rename from config/agent_finsearchcomp.yaml rename to config/agent_finsearchcomp_mirothinker.yaml index 16225e6..df0b56c 100644 --- a/config/agent_finsearchcomp.yaml +++ b/config/agent_finsearchcomp_mirothinker.yaml 
@@ -8,7 +8,7 @@ main_agent: prompt_class: MainAgentPrompt_GAIA llm: provider_class: "MiroThinkerSGLangClient" - model_name: "MODEL_NAME" + model_name: "DUMMY_MODEL_NAME" async_client: true temperature: 0.6 top_p: 0.95 @@ -44,7 +44,7 @@ sub_agents: prompt_class: SubAgentWorkerPrompt llm: provider_class: "MiroThinkerSGLangClient" - model_name: "MODEL_NAME" + model_name: "DUMMY_MODEL_NAME" async_client: true temperature: 0.6 top_p: 0.95 @@ -69,3 +69,5 @@ sub_agents: # Can define some top-level or default parameters here output_dir: logs/ data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + diff --git a/config/agent_gaia-validation-text-only_mirothinker.yaml b/config/agent_gaia-validation-text-only_mirothinker.yaml index 4faec6a..a813115 100644 --- a/config/agent_gaia-validation-text-only_mirothinker.yaml +++ b/config/agent_gaia-validation-text-only_mirothinker.yaml @@ -8,7 +8,7 @@ main_agent: prompt_class: MainAgentPrompt_GAIA llm: provider_class: "MiroThinkerSGLangClient" - model_name: "MODEL_NAME" + model_name: "DUMMY_MODEL_NAME" async_client: true temperature: 0.3 top_p: 0.95 @@ -45,7 +45,7 @@ sub_agents: prompt_class: SubAgentWorkerPrompt llm: provider_class: "MiroThinkerSGLangClient" - model_name: "anthropic/claude-3.7-sonnet" + model_name: "DUMMY_MODEL_NAME" async_client: true temperature: 0.3 top_p: 1.0 diff --git a/config/agent_gaia-validation-text-only_mirothinker_single_agent.yaml b/config/agent_gaia-validation-text-only_mirothinker_single_agent.yaml new file mode 100644 index 0000000..b9e92e6 --- /dev/null +++ b/config/agent_gaia-validation-text-only_mirothinker_single_agent.yaml @@ -0,0 +1,53 @@ +defaults: + - benchmark: gaia-validation-text-only + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "MiroThinkerSGLangClient" + model_name: "MODEL_NAME" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 
0.0 + top_k: -1 + max_tokens: 4096 + oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}" + oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}" + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reasoning + - tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + + max_turns: 50 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: false + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + + output_process: + final_answer_extraction: true + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: null + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + diff --git a/config/agent_llm_claude37sonnet.yaml b/config/agent_llm_claude37sonnet.yaml new file mode 100644 index 0000000..59261b7 --- /dev/null +++ b/config/agent_llm_claude37sonnet.yaml @@ -0,0 +1,53 @@ +defaults: + - benchmark: example_dataset + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPromptBoxedAnswer + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + 
oai_tool_thinking: false + + tool_config: + - tool-reading + - tool-searching + + max_turns: 20 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: false + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + final_answer_extraction: false + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: null + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + + diff --git a/config/agent_llm_claude37sonnet_anthropic.yaml b/config/agent_llm_claude37sonnet_anthropic.yaml new file mode 100644 index 0000000..a57851e --- /dev/null +++ b/config/agent_llm_claude37sonnet_anthropic.yaml @@ -0,0 +1,51 @@ +defaults: + - benchmark: example_dataset + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPromptBoxedAnswer + llm: + provider_class: "ClaudeAnthropicClient" + model_name: "claude-3-7-sonnet-20250219" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 16000 + anthropic_api_key: "${oc.env:ANTHROPIC_API_KEY,???}" + anthropic_base_url: "${oc.env:ANTHROPIC_BASE_URL,https://api.anthropic.com}" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reading + - tool-searching + + max_turns: 20 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: false + hint_llm_base_url: 
"${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + final_answer_extraction: false + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: null + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + diff --git a/config/agent_llm_gpt4o.yaml b/config/agent_llm_gpt4o.yaml new file mode 100644 index 0000000..a94656a --- /dev/null +++ b/config/agent_llm_gpt4o.yaml @@ -0,0 +1,50 @@ +defaults: + - benchmark: example_dataset + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPromptBoxedAnswer + llm: + provider_class: "GPTOpenAIClient" + model_name: "gpt-4o" + async_client: true + temperature: 0.7 + top_p: 1.0 + min_p: 0.0 + top_k: -1 + max_tokens: 4096 + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}" + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reading + - tool-searching + + max_turns: 20 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: false + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + final_answer_extraction: false + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: null + + +# Can define 
some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + diff --git a/config/agent_llm_gpt5.yaml b/config/agent_llm_gpt5.yaml new file mode 100644 index 0000000..5223b77 --- /dev/null +++ b/config/agent_llm_gpt5.yaml @@ -0,0 +1,51 @@ +defaults: + - benchmark: example_dataset + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPromptBoxedAnswer + llm: + provider_class: "GPT5OpenAIClient" + model_name: "gpt-5" + async_client: true + temperature: 1.0 + top_p: 1.0 + min_p: 0.0 + top_k: -1 + max_tokens: 4096 + reasoning_effort: "high" # Use high in the main agent, and use the default medium in the sub-agent. + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}" + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reading + - tool-searching + + max_turns: 20 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: false + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + final_answer_extraction: false + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: null + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + diff --git a/config/agent_llm_mirothinker.yaml b/config/agent_llm_mirothinker.yaml new file mode 100644 index 0000000..afc855b --- /dev/null +++ b/config/agent_llm_mirothinker.yaml @@ -0,0 +1,49 @@ +defaults: + - 
benchmark: example_dataset + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPromptBoxedAnswer + llm: + provider_class: "MiroThinkerSGLangClient" + model_name: "DUMMY_MODEL_NAME" + async_client: true + temperature: 0.6 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 8192 + oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}" + oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}" + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reading + - tool-searching + + max_turns: 20 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: false + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + final_answer_extraction: false + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: null + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + diff --git a/config/agent_quickstart_reading.yaml b/config/agent_quickstart_reading.yaml index e016162..ed3b69f 100644 --- a/config/agent_quickstart_reading.yaml +++ b/config/agent_quickstart_reading.yaml @@ -1,5 +1,5 @@ defaults: - - benchmark: gaia-validation + - benchmark: example_dataset - override hydra/job_logging: none - _self_ # Allow defining variables at the top of this file diff --git a/config/agent_quickstart_search.yaml b/config/agent_quickstart_search.yaml index 96df9e5..a027d2b 100644 --- a/config/agent_quickstart_search.yaml +++ 
b/config/agent_quickstart_search.yaml @@ -1,5 +1,5 @@ defaults: - - benchmark: gaia-validation + - benchmark: example_dataset - override hydra/job_logging: none - _self_ # Allow defining variables at the top of this file diff --git a/config/agent_quickstart_single_agent.yaml b/config/agent_quickstart_single_agent.yaml index 068da69..39d7068 100644 --- a/config/agent_quickstart_single_agent.yaml +++ b/config/agent_quickstart_single_agent.yaml @@ -1,5 +1,5 @@ defaults: - - benchmark: gaia-validation + - benchmark: example_dataset - override hydra/job_logging: none - _self_ # Allow defining variables at the top of this file diff --git a/config/agent_xbench-ds.yaml b/config/agent_xbench-ds_claude37sonnet.yaml similarity index 99% rename from config/agent_xbench-ds.yaml rename to config/agent_xbench-ds_claude37sonnet.yaml index 6b5213f..aeaac08 100644 --- a/config/agent_xbench-ds.yaml +++ b/config/agent_xbench-ds_claude37sonnet.yaml @@ -75,3 +75,4 @@ sub_agents: output_dir: logs/ data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + diff --git a/config/benchmark/browsecomp-en.yaml b/config/benchmark/browsecomp-en.yaml new file mode 100644 index 0000000..98c32bb --- /dev/null +++ b/config/benchmark/browsecomp-en.yaml @@ -0,0 +1,20 @@ +# config/benchmark/browsecomp-en.yaml +defaults: + - default + - _self_ + +name: "browsecomp-en" + +data: + data_dir: "${data_dir}/browsecomp-test" # Path to browsecomp-test (English) dataset + metadata_file: "standardized_data.jsonl" # Metadata filename + whitelist: [] # Optional: List of specific task_ids to run + +execution: + max_tasks: null # null = no limit, or specify a number + max_concurrent: 5 # Number of parallel tasks + pass_at_k: 1 # Number of attempts per task + +# OpenAI API key for evaluation (required for browsecomp since it has ground truth) +openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + diff --git a/config/benchmark/example_dataset.yaml b/config/benchmark/example_dataset.yaml new file mode 100644 index 
0000000..36eee7e --- /dev/null +++ b/config/benchmark/example_dataset.yaml @@ -0,0 +1,21 @@ +# config/benchmark/example_dataset.yaml +defaults: + - default + - _self_ + +name: "example_dataset" + +data: + data_dir: "${data_dir}/example_dataset" # Path to example_dataset + metadata_file: "standardized_data.jsonl" # Metadata filename + whitelist: [] # Optional: List of specific task_ids to run + +execution: + max_tasks: null # null = no limit, or specify a number + max_concurrent: 5 # Number of parallel tasks + pass_at_k: 1 # Number of attempts per task + +# OpenAI API key for evaluation (required for example_dataset since it has ground truth) +openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + + diff --git a/docs/mkdocs/docs/browsecomp_en.md b/docs/mkdocs/docs/browsecomp_en.md new file mode 100644 index 0000000..cdde3c6 --- /dev/null +++ b/docs/mkdocs/docs/browsecomp_en.md @@ -0,0 +1,91 @@ +# BrowseComp-EN (English) + +MiroFlow's evaluation on the BrowseComp-EN benchmark demonstrates advanced web browsing and information retrieval capabilities. + +More details: [BrowseComp: A Simple Yet Challenging Benchmark for Browsing Agents](https://arxiv.org/abs/2504.12516) + +--- + +## Dataset Overview + +!!! abstract "Key Dataset Characteristics" + + - **Total Tasks**: 1,266 tasks in the test split + - **Language**: English + - **Task Types**: Web browsing, search, and information retrieval + - **Evaluation**: Automated comparison with ground truth answers + +--- + +## Quick Start Guide + +### Step 1: Prepare the BrowseComp-EN Dataset + +```bash title="Download BrowseComp-EN Dataset" +uv run main.py prepare-benchmark get browsecomp-test +``` + +This will create the standardized dataset at `data/browsecomp-test/standardized_data.jsonl`. + +!!! 
warning "Requires HuggingFace Token" + Add your HuggingFace token to `.env`: `HF_TOKEN="your_token_here"` + +### Step 2: Configure API Keys + +```env title=".env Configuration" +# Search and web scraping +SERPER_API_KEY="xxx" +JINA_API_KEY="xxx" + +# Code execution +E2B_API_KEY="xxx" + +# LLM (Claude 3.7 Sonnet via OpenRouter) +OPENROUTER_API_KEY="xxx" +OPENROUTER_BASE_URL="https://openrouter.ai/api/v1" + +# Evaluation and hint generation +OPENAI_API_KEY="xxx" + +# Vision capabilities +ANTHROPIC_API_KEY="xxx" +GEMINI_API_KEY="xxx" +``` + +### Step 3: Run the Evaluation + +```bash title="Run BrowseComp-EN Evaluation" +uv run main.py common-benchmark --config_file_name=agent_browsecomp-en_claude37sonnet benchmark=browsecomp-en output_dir="logs/browsecomp-en/$(date +"%Y%m%d_%H%M")" +``` + +Results are automatically generated in the output directory: +- `benchmark_results.jsonl` - Detailed results for each task +- `benchmark_results_pass_at_1_accuracy.txt` - Summary accuracy statistics + +--- + +## Usage Examples + +```bash title="Limited Task Testing" +# Test with 10 tasks only +uv run main.py common-benchmark --config_file_name=agent_browsecomp-en_claude37sonnet benchmark=browsecomp-en benchmark.execution.max_tasks=10 output_dir="logs/browsecomp-en/$(date +"%Y%m%d_%H%M")" +``` + +```bash title="Using MiroThinker Model" +uv run main.py common-benchmark --config_file_name=agent_browsecomp-en_mirothinker benchmark=browsecomp-en output_dir="logs/browsecomp-en/$(date +"%Y%m%d_%H%M")" +``` + +--- + +## Available Agent Configurations + +| Agent Configuration | Model | Use Case | +|-------------------|-------|----------| +| `agent_browsecomp-en_claude37sonnet` | Claude 3.7 Sonnet | Recommended for better performance | +| `agent_browsecomp-en_mirothinker` | MiroThinker | For local deployment | + +--- + +!!! 
info "Documentation Info" + **Last Updated:** October 2025 Β· **Doc Contributor:** Team @ MiroMind AI + diff --git a/docs/mkdocs/docs/claude-3.7-sonnet.md b/docs/mkdocs/docs/claude-3.7-sonnet.md index 4d71679..d875683 100644 --- a/docs/mkdocs/docs/claude-3.7-sonnet.md +++ b/docs/mkdocs/docs/claude-3.7-sonnet.md @@ -20,18 +20,33 @@ main_agent: llm: provider_class: "ClaudeAnthropicClient" model_name: "claude-3-7-sonnet-20250219" # Use actual model name from Anthropic API + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 anthropic_api_key: "${oc.env:ANTHROPIC_API_KEY,???}" anthropic_base_url: "${oc.env:ANTHROPIC_BASE_URL,https://api.anthropic.com}" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false ``` +!!! tip "Sampling Parameters" + - `min_p` and `top_k` are required in the configuration + - Anthropic API natively supports `top_k`, but `min_p` is not used by the API + - Set `min_p: 0.0` (disabled) and `top_k: -1` (disabled) or a specific value like `top_k: 40` + ## Usage ```bash title="Example Command" -# Use existing config -uv run main.py trace --config_file_name=your_config_file \ - --task="Your task" --task_file_name="data/file.txt" +# Run with Claude 3.7 Sonnet (Anthropic SDK) on example dataset +uv run main.py common-benchmark --config_file_name=agent_llm_claude37sonnet_anthropic output_dir="logs/test" ``` +The `agent_llm_claude37sonnet_anthropic.yaml` configuration file provides a ready-to-use setup with the example dataset benchmark. + --- !!! 
info "Documentation Info" diff --git a/docs/mkdocs/docs/finsearchcomp.md b/docs/mkdocs/docs/finsearchcomp.md index 925909e..8d2556e 100644 --- a/docs/mkdocs/docs/finsearchcomp.md +++ b/docs/mkdocs/docs/finsearchcomp.md @@ -2,7 +2,7 @@ MiroFlow's evaluation on the FinSearchComp benchmark demonstrates capabilities in financial information search and analysis tasks, showcasing advanced reasoning abilities in complex financial research scenarios. -More details: [FinSearchComp Dataset](https://huggingface.co/datasets/ByteSeedXpert/FinSearchComp) +More details: [FinSearchComp: Towards a Realistic, Expert-Level Evaluation of Financial Search and Reasoning](https://arxiv.org/abs/2509.13160) --- @@ -59,9 +59,9 @@ JINA_API_KEY="xxx" # For Linux sandbox (code execution environment) E2B_API_KEY="xxx" -# We use MiroThinker model for financial analysis -OAI_MIROTHINKER_API_KEY="xxx" -OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1" +# We use Claude 3.7 Sonnet for financial analysis via OpenRouter +OPENROUTER_API_KEY="xxx" +OPENROUTER_BASE_URL="https://openrouter.ai/api/v1" # Used for hint generation and final answer extraction OPENAI_API_KEY="xxx" @@ -80,7 +80,7 @@ GEMINI_API_KEY="xxx" Execute the following command to run evaluation on the FinSearchComp dataset: ```bash title="Run FinSearchComp Evaluation" -uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" +uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" ``` !!! tip "Progress Monitoring and Resume" @@ -93,7 +93,7 @@ uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark If you need to resume an interrupted evaluation, specify the same output directory to continue from where you left off. ```bash title="Resume Evaluation, e.g." 
- uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir=${PATH_TO_LOG} + uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir=${PATH_TO_LOG} ``` ### Step 4: Extract Results @@ -129,7 +129,7 @@ uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark After running evaluations, you'll find the following structure: ``` -logs/finsearchcomp/agent_finsearchcomp_YYYYMMDD_HHMM/ +logs/finsearchcomp/agent_finsearchcomp_claude37sonnet_YYYYMMDD_HHMM/ β”œβ”€β”€ benchmark_results.jsonl # Task results summary β”œβ”€β”€ benchmark_results_pass_at_1_accuracy.txt # Accuracy statistics β”œβ”€β”€ task_(T1)Time_Sensitive_Data_Fetching_*.json # T1 task traces @@ -154,12 +154,12 @@ The progress checker provides detailed statistics: ### Single Run Evaluation ```bash title="Basic Evaluation" -uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" +uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" ``` ### Limited Task Testing ```bash title="Test with Limited Tasks" -uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp benchmark.execution.max_tasks=5 output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" +uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp benchmark.execution.max_tasks=5 output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" ``` ### Custom Agent Configuration diff --git a/docs/mkdocs/docs/futurex.md b/docs/mkdocs/docs/futurex.md index eec7860..97a3da0 100644 --- a/docs/mkdocs/docs/futurex.md +++ b/docs/mkdocs/docs/futurex.md @@ -2,6 +2,9 @@ MiroFlow's evaluation on the Futurex-Online benchmark demonstrates capabilities in future 
event prediction tasks. +More details: [FutureX: An Advanced Live Benchmark for LLM Agents in Future Prediction](https://arxiv.org/abs/2508.11987) + + --- ## Dataset Overview diff --git a/docs/mkdocs/docs/index.md b/docs/mkdocs/docs/index.md index 42ea809..bdbdb4e 100644 --- a/docs/mkdocs/docs/index.md +++ b/docs/mkdocs/docs/index.md @@ -9,7 +9,29 @@ ## πŸš€ What is MiroFlow? -**MiroFlow** is a comprehensive agentic foundation platform for building intelligent AI agents that achieve state-of-the-art performance on complex tasks. It provides enhanced conversation management, flexible tool integration, and extensive benchmark evaluations across multiple datasets. +**MiroFlow** is an agentic AI platform for building intelligent agents with flexible tool integration and comprehensive benchmark evaluations. + + +## πŸ“ Recent Updates + +!!! success "Latest Changes & Improvements" + + **Oct 2025** - + + - Added support for Index + - Added support for BrowseComp-EN evaluation + - Added support for MiroAPI [#76](https://github.com/MiroMindAI/MiroFlow/pull/76) + + + - πŸ“Š Added support for FinSearchComp evaluation benchmark [#51](https://github.com/MiroMindAI/MiroFlow/pull/51) + - πŸ” Added support for XBench-DS (Deep Search) evaluation [#47](https://github.com/MiroMindAI/MiroFlow/pull/47) + - 🧠 Updated o3 hints and summary to more models [#58](https://github.com/MiroMindAI/MiroFlow/pull/58) + - ✨ Added support for GPT-5 integration [#52](https://github.com/MiroMindAI/MiroFlow/pull/52) + - πŸ”§ Improved tool logs and per-task log storage [#69](https://github.com/MiroMindAI/MiroFlow/pull/69) + - πŸ€– Added support for single agent mode [#67](https://github.com/MiroMindAI/MiroFlow/pull/67) + - πŸ“š Added comprehensive collection of agentic AI research papers [#65](https://github.com/MiroMindAI/MiroFlow/pull/65) + + @@ -53,21 +75,6 @@ Explore the complete MiroMind AI ecosystem: | **MiroTrain** | Complete training recipes and tools | [GitHub](https://github.com/MiroMindAI/MiroTrain)
:material-arrow-right: | -## πŸ“ Recent Updates - -!!! success "Latest Changes & Improvements" - - **Oct 2025** - - - - πŸ“Š Added support for FinSearchComp evaluation benchmark [#51](https://github.com/MiroMindAI/MiroFlow/pull/51) - - πŸ” Added support for XBench-DS (Deep Search) evaluation [#47](https://github.com/MiroMindAI/MiroFlow/pull/47) - - 🧠 Updated o3 hints and summary to more models [#58](https://github.com/MiroMindAI/MiroFlow/pull/58) - - ✨ Added support for GPT-5 integration [#52](https://github.com/MiroMindAI/MiroFlow/pull/52) - - πŸ”§ Improved tool logs and per-task log storage [#69](https://github.com/MiroMindAI/MiroFlow/pull/69) - - πŸ€– Added support for single agent mode [#67](https://github.com/MiroMindAI/MiroFlow/pull/67) - - πŸ“š Added comprehensive collection of agentic AI research papers [#65](https://github.com/MiroMindAI/MiroFlow/pull/65) - - diff --git a/docs/mkdocs/docs/mirothinker.md b/docs/mkdocs/docs/mirothinker.md index ea7b242..93213d8 100644 --- a/docs/mkdocs/docs/mirothinker.md +++ b/docs/mkdocs/docs/mirothinker.md @@ -56,19 +56,17 @@ OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1" Test your setup with the following command: ```bash title="Test Command" -uv run main.py trace --config_file_name=agent_mirothinker \ - --task="What is the first country listed in the XLSX file that have names starting with Co?" 
\ - --task_file_name="data/FSI-2023-DOWNLOAD.xlsx" +uv run main.py common-benchmark --config_file_name=agent_llm_mirothinker output_dir="logs/test" ``` This command will: -- Use the `agent_mirothinker` configuration with the dedicated MiroThinkerSGLangClient -- Process the specified Excel file -- Query the model to find countries starting with "Co" +- Use the `agent_llm_mirothinker` configuration with the dedicated MiroThinkerSGLangClient +- Run the example dataset benchmark (configured in the YAML file) +- Test the model's question-answering capabilities ### Configuration Details -The `./config/agent_mirothinker.yaml` configuration file uses: +The `./config/agent_llm_mirothinker.yaml` configuration file uses: - `provider_class: "MiroThinkerSGLangClient"` - A dedicated client for MiroThinker models deployed with SGLang - Model path and generation parameters (temperature, top_p, max_tokens, etc.) diff --git a/docs/mkdocs/docs/openai-gpt.md b/docs/mkdocs/docs/openai-gpt.md deleted file mode 100644 index e111449..0000000 --- a/docs/mkdocs/docs/openai-gpt.md +++ /dev/null @@ -1,78 +0,0 @@ -# OpenAI GPT Models - -OpenAI's latest models including GPT-5, GPT-4o and advanced reasoning models with strong coding, vision, and reasoning capabilities. - -## Client Used for GPT-5 - -`GPT5OpenAIClient` - -### Environment Setup - -```bash title="Environment Variables" -export OPENAI_API_KEY="your-openai-key" -export OPENAI_BASE_URL="https://api.openai.com/v1" # optional -``` - -### Configuration - -```yaml title="Agent Configuration" -main_agent: - llm: - provider_class: "GPT5OpenAIClient" - model_name: "gpt-5" - async_client: true - temperature: 1.0 - top_p: 1.0 - min_p: 0.0 - top_k: -1 - max_tokens: 128000 - reasoning_effort: "high" # Use high in the main agent, and use the default medium in the sub-agent. 
- openai_api_key: "${oc.env:OPENAI_API_KEY,???}" - openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}" -``` - -### Usage - -```bash title="Example Command" -# Create custom OpenAI config -uv run main.py trace --config_file_name=your_config_file \ - --task="Your task" --task_file_name="data/file.txt" -``` - -## Client Used for GPT-4o - -`GPTOpenAIClient` - -### Environment Setup - -```bash title="Environment Variables" -export OPENAI_API_KEY="your-openai-key" -export OPENAI_BASE_URL="https://api.openai.com/v1" # optional -``` - -### Configuration - -```yaml title="Agent Configuration" -main_agent: - llm: - provider_class: "GPTOpenAIClient" - model_name: "gpt-4o" # or gpt-4o-mini, etc. - openai_api_key: "${oc.env:OPENAI_API_KEY,???}" - openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}" -``` - -### Usage - -```bash title="Example Command" -# Create custom OpenAI config -uv run main.py trace --config_file_name=your_config_file \ - --task="Your task" --task_file_name="data/file.txt" -``` - -!!! note "Configuration Notes" - - `GPTOpenAIClient` also supports GPT-5, but it has not been fully validated on MiroFlow yet. We recommend using `GPT5OpenAIClient`. - ---- - -!!! info "Documentation Info" - **Last Updated:** October 2025 Β· **Doc Contributor:** Team @ MiroMind AI \ No newline at end of file diff --git a/docs/mkdocs/docs/openai-gpt4o.md b/docs/mkdocs/docs/openai-gpt4o.md new file mode 100644 index 0000000..4bc59bb --- /dev/null +++ b/docs/mkdocs/docs/openai-gpt4o.md @@ -0,0 +1,54 @@ +# OpenAI GPT-4o + +OpenAI's GPT-4o model with multimodal capabilities, strong reasoning, and efficient performance. 
+ +## Client Configuration + +**Client Class**: `GPTOpenAIClient` + +### Environment Setup + +```bash title="Environment Variables" +export OPENAI_API_KEY="your-openai-key" +export OPENAI_BASE_URL="https://api.openai.com/v1" # optional +``` + +### Agent Configuration + +```yaml title="Agent Configuration" +main_agent: + llm: + provider_class: "GPTOpenAIClient" + model_name: "gpt-4o" # or gpt-4o-mini + async_client: true + temperature: 0.7 + top_p: 1.0 + min_p: 0.0 + top_k: -1 + max_tokens: 16000 + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}" +``` + +### Usage + +```bash title="Example Command" +# Run with GPT-4o on example dataset +uv run main.py common-benchmark --config_file_name=agent_llm_gpt4o output_dir="logs/test" +``` + +The `agent_llm_gpt4o.yaml` configuration file provides a ready-to-use setup with the example dataset benchmark. + +!!! note "Available Models" + The `GPTOpenAIClient` supports multiple GPT-4o variants: + - `gpt-4o` - Full GPT-4o model + - `gpt-4o-mini` - Smaller, faster variant + +!!! warning "GPT-5 Support" + `GPTOpenAIClient` also supports GPT-5, but it has not been fully validated on MiroFlow yet. We recommend using `GPT5OpenAIClient` for GPT-5. + +--- + +!!! info "Documentation Info" + **Last Updated:** October 2025 Β· **Doc Contributor:** Team @ MiroMind AI + diff --git a/docs/mkdocs/docs/openai-gpt5.md b/docs/mkdocs/docs/openai-gpt5.md new file mode 100644 index 0000000..4409a7e --- /dev/null +++ b/docs/mkdocs/docs/openai-gpt5.md @@ -0,0 +1,53 @@ +# OpenAI GPT-5 + +OpenAI's GPT-5 model with advanced reasoning capabilities and strong coding, vision, and problem-solving abilities. 
+ +## Client Configuration + +**Client Class**: `GPT5OpenAIClient` + +### Environment Setup + +```bash title="Environment Variables" +export OPENAI_API_KEY="your-openai-key" +export OPENAI_BASE_URL="https://api.openai.com/v1" # optional +``` + +### Agent Configuration + +```yaml title="Agent Configuration" +main_agent: + llm: + provider_class: "GPT5OpenAIClient" + model_name: "gpt-5" + async_client: true + temperature: 1.0 + top_p: 1.0 + min_p: 0.0 + top_k: -1 + max_tokens: 16000 + reasoning_effort: "high" # Use high in the main agent, and use the default medium in the sub-agent. + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}" +``` + +### Usage + +```bash title="Example Command" +# Run with GPT-5 on example dataset +uv run main.py common-benchmark --config_file_name=agent_llm_gpt5 output_dir="logs/test" +``` + +The `agent_llm_gpt5.yaml` configuration file provides a ready-to-use setup with the example dataset benchmark. + +!!! tip "Reasoning Effort" + GPT-5 supports the `reasoning_effort` parameter. The configuration uses `"high"` for better reasoning performance. + +!!! tip "Sampling Parameters" + While `min_p` and `top_k` are required in the configuration, OpenAI's API does not use them. Set them to `min_p: 0.0` and `top_k: -1` (disabled). + +--- + +!!! info "Documentation Info" + **Last Updated:** October 2025 Β· **Doc Contributor:** Team @ MiroMind AI + diff --git a/docs/mkdocs/docs/openrouter-claude-3.7-sonnet.md b/docs/mkdocs/docs/openrouter-claude-3.7-sonnet.md index d932f0b..83c3ddf 100644 --- a/docs/mkdocs/docs/openrouter-claude-3.7-sonnet.md +++ b/docs/mkdocs/docs/openrouter-claude-3.7-sonnet.md @@ -20,27 +20,30 @@ main_agent: llm: provider_class: "ClaudeOpenRouterClient" model_name: "anthropic/claude-3.7-sonnet" # or openai/gpt-4, etc. 
+ async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" openrouter_provider: "anthropic" # Force provider, or "" for auto + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false ``` -## Other Supported Models - -- `openai/gpt-4` -- `openai/gpt-3.5-turbo` -- `anthropic/claude-3-opus` -- `google/gemini-pro` -- Many others via unified OpenAI format ## Usage ```bash title="Example Command" -# Use existing OpenRouter config -uv run main.py trace --config_file_name=your_config_file \ - --task="Your task" --task_file_name="data/file.txt" +# Run with Claude 3.7 Sonnet on example dataset +uv run main.py common-benchmark --config_file_name=agent_llm_claude37sonnet output_dir="logs/test" ``` +The `agent_llm_claude37sonnet.yaml` configuration file provides a ready-to-use setup with the example dataset benchmark. + ## Benefits vs Direct API - Unified chat format diff --git a/docs/mkdocs/docs/xbench_ds.md b/docs/mkdocs/docs/xbench_ds.md index 2ab9dc0..c1f9bf7 100644 --- a/docs/mkdocs/docs/xbench_ds.md +++ b/docs/mkdocs/docs/xbench_ds.md @@ -52,7 +52,7 @@ OPENAI_BASE_URL="https://api.openai.com/v1" ```bash uv run main.py common-benchmark \ - --config_file_name=agent_xbench-ds \ + --config_file_name=agent_xbench-ds_claude37sonnet \ output_dir="logs/xbench-ds/$(date +"%Y%m%d_%H%M")" ``` @@ -72,7 +72,7 @@ Replace `$PATH_TO_LOG` with your actual output directory path. 
```bash title="Resume Interrupted Evaluation" uv run main.py common-benchmark \ - --config_file_name=agent_xbench-ds \ + --config_file_name=agent_xbench-ds_claude37sonnet \ output_dir="logs/xbench-ds/20250922_1430" ``` diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index 60e0ff3..f45f9d7 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -37,62 +37,86 @@ repo_url: https://github.com/MiroMindAI/MiroFlow nav: - - Introduction: + - 🏠 Introduction: - News & Updates: index.md - License: license.md - - Quick Start: + - πŸš€ Quick Start: - Quickstart: quickstart.md - Core Concepts: core_concepts.md - YAML Configuration: yaml_config.md - - Evaluation: + - "πŸ“Š Evaluation": - Overview: evaluation_overview.md - - Benchmarks: - - GAIA-Validation: - - Prerequisites: gaia_validation_prerequisites.md - - Claude-3.7-Sonnet: gaia_validation_claude37sonnet.md - - GPT-5: gaia_validation_gpt5.md - - MiroThinker: gaia_validation_mirothinker.md - - GAIA-Validation-Text-Only: gaia_validation_text_only.md - - GAIA-Test: gaia_test.md - - FutureX: futurex.md - - xBench-DeepSearch: xbench_ds.md - - FinSearchComp: finsearchcomp.md - - Download Datasets: download_datasets.md - - Add New Benchmarks: contribute_benchmarks.md - - - Tools: + - How to add new benchmarks: contribute_benchmarks.md + - "": "" + - "": "" + - "": "" + - "": "" + - GAIA-Val: + - Prepare Dataset: gaia_validation_prerequisites.md + - Claude-3.7-Sonnet: gaia_validation_claude37sonnet.md + - GPT-5: gaia_validation_gpt5.md + - MiroThinker: gaia_validation_mirothinker.md + - GAIA-Val-Text: gaia_validation_text_only.md + - GAIA-Test: gaia_test.md + - BrowseComp-EN: browsecomp_en.md + - FutureX: futurex.md + - xBench-DeepSearch: xbench_ds.md + - FinSearchComp: finsearchcomp.md + + # - Benchmarks: + # - GAIA-Validation-Text-Only: gaia_validation_text_only.md + # - GAIA-Test: gaia_test.md + # - BrowseComp-EN: browsecomp_en.md + # - FutureX: futurex.md + # - xBench-DeepSearch: xbench_ds.md + # - 
FinSearchComp: finsearchcomp.md + # - Download Datasets: download_datasets.md + + + + - πŸ”§ Tools: - Overview: tool_overview.md - - Tools: - - tool-reasoning: tool_reasoning.md - - tool-reasoning-os: tool_reasoning_os.md - - tool-image-video: tool_vqa.md - - tool-image-video-os: tool_vqa_os.md - - tool-audio-os: tool_audio_os.md - - tool-searching: tool_searching.md - - tool-python: tool_python.md + - How to add new tools: contribute_tools.md + - "": "" + - "": "" + - "": "" + - "": "" + - tool-reasoning: tool_reasoning.md + - tool-reasoning-os: tool_reasoning_os.md + - tool-image-video: tool_vqa.md + - tool-image-video-os: tool_vqa_os.md + - tool-audio-os: tool_audio_os.md + - tool-searching: tool_searching.md + - tool-python: tool_python.md + - "": "" + - "": "" + - "": "" + - "": "" - Advanced Features: - E2B Advanced Features: e2b_advanced_features.md - MiroAPI: miro_api.md - - Add New Tools: contribute_tools.md - - LLM Clients: + - πŸ€– LLM Clients: - Overview: llm_clients_overview.md - - Models: - - MiroThinker: mirothinker.md - - Claude-3.7-Sonnet: - - Official SDK: claude-3.7-sonnet.md - - OpenRouter: openrouter-claude-3.7-sonnet.md - - OpenAI-GPT: openai-gpt.md - - Add New LLM Clients: contribute_llm_clients.md - - - Resources: - - πŸ“š All About Agents: all_about_agents.md - - πŸ“Š Open Source Data: data.md - - πŸ“± Applications: applications.md - - πŸ› FAQs: faqs.md - - πŸ“ Contributors: contributors.md + - How to add new LLM clients: contribute_llm_clients.md + - "": "" + - "": "" + - "": "" + - "": "" + - MiroThinker: mirothinker.md + - Claude 3.7 Sonnet (Official SDK): claude-3.7-sonnet.md + - Claude 3.7 Sonnet (OpenRouter): openrouter-claude-3.7-sonnet.md + - GPT-5: openai-gpt5.md + - GPT-4o: openai-gpt4o.md + + - πŸ“š Resources: + - All About Agents: all_about_agents.md + - Open Source Data: data.md + - Applications: applications.md + - FAQs: faqs.md + - Contributors: contributors.md extra: diff --git 
a/scripts/run_evaluate_multiple_runs_finsearchcomp.sh b/scripts/run_evaluate_multiple_runs_finsearchcomp.sh index e7c90fe..b6d65aa 100755 --- a/scripts/run_evaluate_multiple_runs_finsearchcomp.sh +++ b/scripts/run_evaluate_multiple_runs_finsearchcomp.sh @@ -5,14 +5,14 @@ # SPDX-License-Identifier: Apache-2.0 # Multiple runs FinSearchComp evaluation script -# Based on the working command: uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir=logs/finsearchcomp/$(date +"%Y%m%d_%H%M") +# Based on the working command: uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir=logs/finsearchcomp/$(date +"%Y%m%d_%H%M") # Configuration parameters NUM_RUNS=${NUM_RUNS:-3} MAX_TASKS=${MAX_TASKS:-1} MAX_CONCURRENT=${MAX_CONCURRENT:-5} BENCHMARK_NAME="finsearchcomp" -AGENT_SET=${AGENT_SET:-"agent_finsearchcomp"} +AGENT_SET=${AGENT_SET:-"agent_finsearchcomp_claude37sonnet"} # Set results directory with timestamp TIMESTAMP=$(date +%Y%m%d_%H%M) diff --git a/scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation-text-only.sh b/scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation-text-only.sh index c65f0dc..0cc11fb 100644 --- a/scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation-text-only.sh +++ b/scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation-text-only.sh @@ -5,9 +5,9 @@ # SPDX-License-Identifier: Apache-2.0 # Configuration parameters -NUM_RUNS=3 -AGENT_SET="agent_gaia-validation-text-only_mirothinker" -MAX_CONCURRENT=15 +NUM_RUNS=8 +AGENT_SET="agent_gaia-validation-text-only_mirothinker_single_agent" +MAX_CONCURRENT=8 # Set results directory with timestamp TIMESTAMP=$(date +%Y%m%d_%H%M) diff --git a/scripts/run_evaluate_multiple_runs_xbench-ds.sh b/scripts/run_evaluate_multiple_runs_xbench-ds.sh index a0026b1..bd166f7 100644 --- a/scripts/run_evaluate_multiple_runs_xbench-ds.sh +++ 
b/scripts/run_evaluate_multiple_runs_xbench-ds.sh @@ -6,7 +6,7 @@ # Configuration parameters NUM_RUNS=3 -AGENT_SET="agent_xbench-ds" +AGENT_SET="agent_xbench-ds_claude37sonnet" BENCHMARK_NAME="xbench-ds" MAX_CONCURRENT=5 export CHINESE_CONTEXT="true" diff --git a/src/llm/providers/claude_anthropic_client.py b/src/llm/providers/claude_anthropic_client.py index d701d49..3e92537 100644 --- a/src/llm/providers/claude_anthropic_client.py +++ b/src/llm/providers/claude_anthropic_client.py @@ -29,17 +29,19 @@ def __post_init__(self): def _create_client(self, config: DictConfig): """Create Anthropic client""" - api_key = config.env.anthropic_api_key + api_key = self.cfg.llm.anthropic_api_key if self.async_client: return AsyncAnthropic( api_key=api_key, base_url=self.cfg.llm.anthropic_base_url, + timeout=600.0, # 10 minutes timeout for long requests ) else: return Anthropic( api_key=api_key, base_url=self.cfg.llm.anthropic_base_url, + timeout=600.0, # 10 minutes timeout for long requests ) @retry(wait=wait_fixed(10), stop=stop_after_attempt(5)) diff --git a/utils/progress_check/check_finsearchcomp_progress.py b/utils/progress_check/check_finsearchcomp_progress.py index 52035fe..1104582 100755 --- a/utils/progress_check/check_finsearchcomp_progress.py +++ b/utils/progress_check/check_finsearchcomp_progress.py @@ -348,7 +348,7 @@ def main(): print(f"Error: {e}") print(f"\nUsage: python {sys.argv[0]} [LOG_FOLDER_PATH]") print( - f"Example: python {sys.argv[0]} logs/finsearchcomp/agent_finsearchcomp_20250924_1555" + f"Example: python {sys.argv[0]} logs/finsearchcomp/agent_finsearchcomp_claude37sonnet_20250924_1555" ) return 1