
Commit 7881069

Added support for Gemini and Ollama, updated the README to reflect these changes, and added a new command for debug mode that should fix the Chrome extension issue on Windows.
1 parent bb281da commit 7881069

File tree

8 files changed: +657 −15 lines changed


Chrome_extension/src/inject/inject.js

Lines changed: 1 addition & 0 deletions
@@ -464,6 +464,7 @@
     method: "POST",
     headers: {
       "Content-Type": "application/json",
+      "Origin": "http://127.0.0.1"
     },
     body: JSON.stringify({ task: taskWithContext }),
   });
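For quick manual testing outside the extension, the same kind of request can be reproduced from Python. A minimal sketch, assuming the Python server from this commit is running on 127.0.0.1:8888, the `requests` package is installed, and using a placeholder task string:

```python
import requests

# Reproduce the extension's request: POST the task to the local A5 server,
# including the Origin header this commit adds to the fetch call.
resp = requests.post(
    "http://127.0.0.1:8888/run",
    json={"task": "Open example.com and read the page title"},  # placeholder task
    headers={"Origin": "http://127.0.0.1"},
)
print(resp.json())  # -> {"result": "Task is being processed."}
```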

Python_server/.env.example

Lines changed: 2 additions & 1 deletion
@@ -1 +1,2 @@
-OPENAI_API_KEY=yourAPIKeyHere
+OPENAI_API_KEY=yourAPIKeyHere
+GEMINI_API_KEY=yourAPIKeyHere
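A quick way to confirm the keys in `.env` are actually picked up before starting a server (a minimal sketch using python-dotenv, mirroring how `mainGemini.py` loads `GEMINI_API_KEY`):

```python
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

# mainGemini.py raises a ValueError when GEMINI_API_KEY is missing,
# so check both keys up front.
print("OPENAI_API_KEY set:", bool(os.getenv("OPENAI_API_KEY")))
print("GEMINI_API_KEY set:", bool(os.getenv("GEMINI_API_KEY")))
```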

Python_server/README.md

Lines changed: 4 additions & 2 deletions
@@ -6,7 +6,9 @@ This directory contains the Python server component of the A5 Browser Automation
 
 The server consists of the following key components:
 
-- `main.py`: Main server application
+- `main.py`: Main server application with OpenAI
+- `mainGemini.py`: Main server application using Gemini
+- `mainOllama.py`: Main server application using Ollama (you must run `ollama pull Qwen2.5-Coder:32B-Instruct-q4_K_M` for this to work, and it requires about 20 GB of hard drive space)
 - `utils/`: Future: Utility functions and helpers
 - `models/`: Future: Data models and database schemas
 - `config/`: Future: Configuration files and environment variables
@@ -24,7 +26,7 @@ To set up the development environment:
 
 
 [GET] `/lastResponses` returns the browser-use responses from the end of sessions
-[GET] `/run` : Parameter: `task`. The `task` parameter is the string being passed to the initial command for browser-use.
+[GET] or [POST] `/run` : Parameter: `task`. The `task` parameter is the string being passed to the initial command for browser-use.
 
 ## Example Request
 ```
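The README's example request itself is truncated in this diff. As a rough illustration of the GET form documented above (a sketch, assuming the server runs locally on port 8888, `requests` is installed, and the task text is only a placeholder):

```python
import requests

# GET /run enqueues the task and returns immediately;
# the browser-use agent runs as a background task on the server.
resp = requests.get(
    "http://127.0.0.1:8888/run",
    params={"task": "Open news.ycombinator.com and summarize the top story"},  # placeholder
)
print(resp.json())  # {"result": "Task is being processed."}
```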

Python_server/agent_history.gif

-24.9 KB

Python_server/mainGemini.py

Lines changed: 301 additions & 0 deletions
@@ -0,0 +1,301 @@
# mainGemini.py

# Important Instructions:
# 1. Close any existing Chrome instances.
# 2. Start Chrome with remote debugging enabled:
#    /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
# 3. Run the FastAPI server:
#    uvicorn mainGemini:app --host 127.0.0.1 --port 8888 --reload --workers 1
# Make sure you set GEMINI_API_KEY=yourAPIKeyHere in the .env file.

import os
os.environ["PYDANTIC_V1_COMPAT_MODE"] = "true"

from langchain_google_genai import ChatGoogleGenerativeAI
from browser_use import Agent
from dotenv import load_dotenv
import platform
import asyncio
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from pydantic import BaseModel
from pydantic import SecretStr
from browser_use.browser.browser import Browser, BrowserConfig
import logging
import traceback
from datetime import datetime
from typing import List, Optional
from enum import Enum
from fastapi.middleware.cors import CORSMiddleware



# ----------------------------
# 1. Configure Logging
# ----------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ----------------------------
# 2. Load Environment Variables
# ----------------------------
load_dotenv()

# Verify the Gemini API key is loaded
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError(
        "GEMINI_API_KEY not found in .env file. Make sure your .env file is set up correctly."
    )

# ----------------------------
# 3. Initialize FastAPI App
# ----------------------------
app = FastAPI(title="AI Agent API with BrowserUse", version="1.0")


# Configure CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # For development: allow all origins. In production, specify exact origins.
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ----------------------------
# 4. Define Pydantic Models
# ----------------------------

class TaskRequest(BaseModel):
    task: str

class TaskResponse(BaseModel):
    result: str

class TaskStatus(str, Enum):
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"

class TaskRecord(BaseModel):
    id: int
    task: str
    status: TaskStatus
    start_time: datetime
    end_time: Optional[datetime] = None
    duration: Optional[float] = None  # Duration in seconds
    result: Optional[str] = None
    error: Optional[str] = None

# ----------------------------
# 5. Initialize Task Registry
# ----------------------------
task_records: List[TaskRecord] = []
task_id_counter: int = 0
task_lock = asyncio.Lock()  # To manage concurrent access to task_records

# ----------------------------
# 6. Define Background Task Function
# ----------------------------


def get_chrome_path() -> str:
    """
    Returns the most common Chrome executable path based on the operating system.
    Raises:
        FileNotFoundError: If Chrome is not found in the expected path.
    """
    system = platform.system()

    if system == "Windows":
        # Common installation path for Windows
        chrome_path = os.path.join(
            os.environ.get("PROGRAMFILES", "C:\\Program Files"),
            "Google\\Chrome\\Application\\chrome.exe"
        )
    elif system == "Darwin":
        # Common installation path for macOS
        chrome_path = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
    elif system == "Linux":
        # Common installation path for Linux
        chrome_path = "/usr/bin/google-chrome"
    else:
        raise FileNotFoundError(f"Unsupported operating system: {system}")

    # Verify that the Chrome executable exists at the determined path
    if not os.path.exists(chrome_path):
        raise FileNotFoundError(f"Google Chrome executable not found at: {chrome_path}")

    return chrome_path



async def execute_task(task_id: int, task: str):
    """
    Background task to execute the AI agent.
    Initializes a new browser instance for each task to ensure isolation.
    """
    global task_records
    browser = None  # Initialize browser instance for this task
    try:
        logger.info(f"Starting background task ID {task_id}: {task}")

        # Create and add the task record with status 'running'
        async with task_lock:
            task_record = TaskRecord(
                id=task_id,
                task=task,
                status=TaskStatus.RUNNING,
                start_time=datetime.utcnow()
            )
            task_records.append(task_record)

        # Initialize a new browser instance for this task
        logger.info(f"Task ID {task_id}: Initializing new browser instance.")
        browser = Browser(
            config=BrowserConfig(
                chrome_instance_path=get_chrome_path(),  # Update if different
                disable_security=True,
                headless=False,  # Set to True for headless mode
                # Removed 'remote_debugging_port' as it caused issues
            )
        )
        logger.info(f"Task ID {task_id}: Browser initialized successfully.")

        # Initialize and run the Agent with the new browser instance
        agent = Agent(
            task=task,
            llm=ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key)),
            browser=browser
        )
        logger.info(f"Task ID {task_id}: Agent initialized. Running task.")
        result = await agent.run()
        logger.info(f"Task ID {task_id}: Agent.run() completed successfully.")

        # Update the task record with status 'completed'
        async with task_lock:
            for record in task_records:
                if record.id == task_id:
                    record.status = TaskStatus.COMPLETED
                    record.end_time = datetime.utcnow()
                    record.duration = (record.end_time - record.start_time).total_seconds()
                    record.result = result
                    break

    except Exception as e:
        logger.error(f"Error in background task ID {task_id}: {e}")
        logger.error(traceback.format_exc())

        # Update the task record with status 'failed'
        async with task_lock:
            for record in task_records:
                if record.id == task_id:
                    record.status = TaskStatus.FAILED
                    record.end_time = datetime.utcnow()
                    record.duration = (record.end_time - record.start_time).total_seconds()
                    record.error = str(e)
                    break
    finally:
        # Ensure that the browser is closed in case of failure or success
        if browser:
            try:
                logger.info(f"Task ID {task_id}: Closing browser instance.")
                await browser.close()
                logger.info(f"Task ID {task_id}: Browser instance closed successfully.")
            except Exception as close_e:
                logger.error(f"Task ID {task_id}: Error closing browser: {close_e}")
                logger.error(traceback.format_exc())

# ----------------------------
# 7. Define POST /run Endpoint
# ----------------------------
@app.post("/run", response_model=TaskResponse)
async def run_task_post(request: TaskRequest, background_tasks: BackgroundTasks):
    """
    POST Endpoint to run the AI agent with a specified task.

    - **task**: The task description for the AI agent.
    """
    global task_id_counter
    task = request.task
    logger.info(f"Received task via POST: {task}")

    # Increment task ID
    async with task_lock:
        task_id_counter += 1
        current_task_id = task_id_counter

    # Enqueue the background task
    background_tasks.add_task(execute_task, current_task_id, task)

    # Respond immediately
    return TaskResponse(result="Task is being processed.")

# ----------------------------
# 8. Define GET /run Endpoint
# ----------------------------
@app.get("/run", response_model=TaskResponse)
async def run_task_get(
    task: str = Query(..., description="The task description for the AI agent."),
    background_tasks: BackgroundTasks = None
):
    """
    GET Endpoint to run the AI agent with a specified task.

    - **task**: The task description for the AI agent.
    """
    global task_id_counter
    logger.info(f"Received task via GET: {task}")

    # Increment task ID
    async with task_lock:
        task_id_counter += 1
        current_task_id = task_id_counter

    # Enqueue the background task
    background_tasks.add_task(execute_task, current_task_id, task)

    # Respond immediately
    return TaskResponse(result="Task is being processed.")

# ----------------------------
# 9. Define GET /lastResponses Endpoint
# ----------------------------
@app.get("/lastResponses", response_model=List[TaskRecord])
async def get_last_responses(
    limit: Optional[int] = Query(100, description="Maximum number of task records to return"),
    status: Optional[TaskStatus] = Query(None, description="Filter by task status")
):
    """
    GET Endpoint to retrieve the last task responses.

    - **limit**: The maximum number of task records to return (default: 100).
    - **status**: (Optional) Filter tasks by status ('running', 'completed', 'failed').

    Returns a list of task records in descending order of task ID.
    """
    async with task_lock:
        filtered_tasks = task_records.copy()
        if status:
            filtered_tasks = [task for task in filtered_tasks if task.status == status]
        # Sort and limit
        sorted_tasks = sorted(filtered_tasks, key=lambda x: x.id, reverse=True)[:limit]
        return sorted_tasks

# ----------------------------
# 10. Define Root Endpoint
# ----------------------------
@app.get("/")
def read_root():
    return {
        "message": "AI Agent API with BrowserUse is running. Use the /run endpoint with a 'task' field in the POST request body or as a query parameter in a GET request to execute tasks."
    }

# For executable.
# ----------------------------
# 11. Entry Point
# ----------------------------
if __name__ == "__main__":
    import uvicorn

    uvicorn.run("mainGemini:app", host="127.0.0.1", port=8888, reload=True, workers=1)
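Both `/run` endpoints only enqueue the task and immediately reply with "Task is being processed.", so results have to be collected afterwards from `/lastResponses`. A minimal polling sketch (assuming the server above is running on 127.0.0.1:8888 and `requests` is installed):

```python
import time

import requests

BASE = "http://127.0.0.1:8888"

# Poll /lastResponses until the most recent task has finished (completed or failed).
while True:
    records = requests.get(f"{BASE}/lastResponses", params={"limit": 1}).json()
    if records and records[0]["status"] != "running":
        break
    time.sleep(5)

latest = records[0]
print(latest["status"], latest.get("duration"), latest.get("result") or latest.get("error"))
```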

0 commit comments
