Skip to content

Commit f7047fd

Browse files
committed
system prompt addition
1 parent 63bc1e7 commit f7047fd

File tree

1 file changed

+32
-16
lines changed

1 file changed

+32
-16
lines changed

hud_eval.py

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,23 @@
1818
from hud.task import Task
1919
from hud.taskset import load_taskset
2020
from hud.job import create_job, Job
21-
from hud.adapters.common.types import ResponseAction
21+
from hud.adapters.common.types import ResponseAction, CustomAction
2222

23+
OSWORLD_SYSTEM_PROMPT="""
24+
Please do the following task.
25+
computer.clipboard, computer.sync file, computer.sync shared folder, computer.computer output citation are disabled.
26+
If you worry that you might make typo, prefer copying and pasting the text instead of reading
27+
and typing.
28+
If you are presented with an open website to solve the task, try to stick to that specific one
29+
instead of going to a new one.
30+
You have full authority to execute any action without my permission. I won't be watching so
31+
please don't ask for confirmation.
32+
My gmail account is [email protected], and the password is "iloveosworld500", if prompted for OTP, use the authenticator chrome extension to see the OTP for 2 factor authentication.
33+
If you deem the task is infeasible, you can terminate and explicitly state in the response that
34+
'the task is infeasible'. Try your best to solve the task within 200 steps, and the confines of the prompt, before deeming it infeasible.
35+
"""
2336

24-
def run_task(task: Task, model_name: str, job: Job) -> float:
37+
def run_task(task: Task, model_name: str, job: Job, system_prompt: str) -> float:
2538
"""Run a single task and return reward"""
2639
hud_computer = None
2740
try:
@@ -31,20 +44,25 @@ def run_task(task: Task, model_name: str, job: Job) -> float:
3144
with hud_computer as browser_computer:
3245
agent = BrowserAgent(
3346
browser_computer=browser_computer,
34-
query=task.prompt,
47+
query=(system_prompt + "\n\n" + task.prompt).strip(),
3548
model_name=model_name,
3649
verbose=False,
3750
)
3851
try:
3952
agent.agent_loop()
4053

4154
if agent.final_reasoning:
42-
response_action = ResponseAction(
43-
text=agent.final_reasoning
44-
)
55+
if "the task is infeasible" in agent.final_reasoning.lower():
56+
final_action = CustomAction(
57+
action="FAIL"
58+
)
59+
else:
60+
final_action = ResponseAction(
61+
text=agent.final_reasoning
62+
)
4563
# Inject the response into HUD environment
4664
hud_computer._loop.run_until_complete(
47-
hud_computer._env.step([response_action])
65+
hud_computer._env.step([final_action])
4866
)
4967

5068
except Exception as e:
@@ -78,27 +96,31 @@ def run_taskset(
7896
name: str,
7997
parallel: bool = False,
8098
max_concurrent: int = 20,
81-
api_key: str = None
8299
) -> list[float]:
83100
"""Load and run a HUD taskset by ID, return list of rewards"""
84101

85102
# Load the taskset
86103
taskset = asyncio.run(load_taskset(taskset_id, metadata={"partial": True}))
87104

88105
job = asyncio.run(create_job(name, evalset_id=taskset.id))
106+
107+
if taskset_id == "OSWorld-Verified":
108+
system_prompt = OSWORLD_SYSTEM_PROMPT
109+
else:
110+
system_prompt = ""
89111

90112
if parallel:
91113
# Run tasks in parallel using threads to avoid event loop conflicts
92114
with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
93115
rewards = list(executor.map(
94-
lambda task: run_task(task, model_name, job),
116+
lambda task: run_task(task, model_name, job, system_prompt),
95117
taskset.tasks
96118
))
97119
else:
98120
# Run tasks sequentially
99121
rewards = []
100122
for task in taskset.tasks:
101-
reward = run_task(task, model_name, job)
123+
reward = run_task(task, model_name, job, system_prompt)
102124
rewards.append(reward)
103125

104126
return rewards
@@ -128,11 +150,6 @@ def main() -> int:
128150
default="computer-use-exp-07-16",
129151
help="Set which model to use.",
130152
)
131-
parser.add_argument(
132-
"--api_key",
133-
type=str,
134-
help="HUD API key (defaults to environment variable).",
135-
)
136153
parser.add_argument(
137154
"--max_concurrent",
138155
type=int,
@@ -148,7 +165,6 @@ def main() -> int:
148165
name=args.name,
149166
parallel=args.parallel,
150167
max_concurrent=args.max_concurrent,
151-
api_key=args.api_key or os.environ.get("HUD_API_KEY")
152168
)
153169

154170
# Print minimal results

0 commit comments

Comments
 (0)