
Commit 55a27bd

fix(lint): to pass lint for some files (#45)
to pass lint for some files
1 parent 28366a7 commit 55a27bd

File tree

4 files changed (+93, -54 lines):

utils/extract_futurex_results.py
utils/prepare_benchmark/gen_futurex.py
utils/prepare_benchmark/gen_xbench_ds.py
utils/progress_check/check_futurex_progress.py


utils/extract_futurex_results.py

Lines changed: 39 additions & 33 deletions
@@ -15,10 +15,10 @@
 Usage:
     # Extract from single run
     python extract_futurex_results.py logs/futurex-online-test
-
+
     # Aggregate multiple runs (if run_* subdirectories exist)
    python extract_futurex_results.py logs/futurex-online-multi-runs
-
+
     # Specify output file
     python extract_futurex_results.py logs/futurex-online-test -o my_submission.jsonl
 """
@@ -82,25 +82,27 @@ def discover_runs(results_dir: str) -> List[str]:
 def extract_predictions_from_file(file_path: str) -> Dict[str, str]:
     """
     Extract predictions from a single benchmark_results.jsonl file.
-
+
     Args:
         file_path: Path to benchmark_results.jsonl file
-
+
     Returns:
         Dictionary mapping task_id to prediction
     """
     predictions = {}
-
+
     with open(file_path, "r", encoding="utf-8") as fin:
         for line_num, line in enumerate(fin, 1):
             line = line.strip()
             if not line:
                 continue
-
+
             try:
                 rec = json.loads(line)
             except json.JSONDecodeError as e:
-                print(f"Warning: Skipping malformed JSON at line {line_num} in {file_path}: {e}")
+                print(
+                    f"Warning: Skipping malformed JSON at line {line_num} in {file_path}: {e}"
+                )
                 continue

             task_id = rec.get("task_id")
@@ -110,17 +112,19 @@ def extract_predictions_from_file(file_path: str) -> Dict[str, str]:
             if task_id and pred is not None and str(pred).strip():
                 pred_str = str(pred).strip()
                 predictions[task_id] = pred_str
-
+
     return predictions


-def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dict[str, int]]:
+def aggregate_multiple_runs(
+    results_dir: str,
+) -> Tuple[Dict[str, List[str]], Dict[str, int]]:
     """
     Aggregate predictions from multiple runs in subdirectories.
-
+
     Args:
         results_dir: Directory containing run_* subdirectories
-
+
     Returns:
         Tuple of (predictions_by_task, first_seen_order)
     """
@@ -145,7 +149,7 @@ def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dic
     for run_dir in runs:
         fpath = os.path.join(run_dir, "benchmark_results.jsonl")
         print(f"Reading: {fpath}")
-
+
         with open(fpath, "r", encoding="utf-8") as fin:
             for line in fin:
                 total_lines += 1
@@ -172,41 +176,41 @@ def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dic

     print(f"Collected from {len(runs)} run(s).")
     print(f"Read {total_lines} line(s), accepted {used_lines} record(s).")
-
+
     return preds_by_task, first_seen_order


 def process_single_run(results_dir: str) -> Dict[str, str]:
     """
     Process a single run (direct benchmark_results.jsonl file).
-
+
     Args:
         results_dir: Directory containing benchmark_results.jsonl
-
+
     Returns:
         Dictionary mapping task_id to prediction
     """
     file_path = os.path.join(results_dir, "benchmark_results.jsonl")
-
+
     if not os.path.isfile(file_path):
         raise FileNotFoundError(f"benchmark_results.jsonl not found in: {results_dir}")
-
+
     print(f"Reading single run: {file_path}")
     predictions = extract_predictions_from_file(file_path)
     print(f"Extracted {len(predictions)} predictions from single run.")
-
+
     return predictions


 def write_submission_file(
-    predictions: Dict[str, str],
-    output_file: str,
+    predictions: Dict[str, str],
+    output_file: str,
     is_aggregated: bool = False,
-    vote_counts: Dict[str, Dict[str, int]] = None
+    vote_counts: Dict[str, Dict[str, int]] = None,
 ) -> None:
     """
     Write predictions to FutureX submission format.
-
+
     Args:
         predictions: Dictionary mapping task_id to prediction
         output_file: Output file path
@@ -217,14 +221,14 @@ def write_submission_file(
     with open(output_file, "w", encoding="utf-8") as out:
         for task_id in sorted(predictions.keys()):
             prediction = predictions[task_id]
-
+
             # Create submission record
             record = {"id": task_id, "prediction": prediction}
-
+
             # Add vote information for aggregated runs
             if is_aggregated and vote_counts and task_id in vote_counts:
                 record["vote_counts"] = vote_counts[task_id]
-
+
             out.write(json.dumps(record, ensure_ascii=False) + "\n")
             num_tasks += 1

@@ -238,7 +242,7 @@ def write_submission_file(
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Extract predictions from MiroFlow benchmark results and create FutureX submission files. "
-        "Supports both single runs and multi-run aggregation with majority voting."
+        "Supports both single runs and multi-run aggregation with majority voting."
     )
     parser.add_argument(
         "results_dir",
@@ -257,7 +261,7 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--single",
-        action="store_true",
+        action="store_true",
         help="Force single run mode (look for direct benchmark_results.jsonl)",
     )
     return parser.parse_args()
@@ -279,7 +283,7 @@ def main() -> None:
     # Determine processing mode
     runs = discover_runs(results_dir)
     single_file = os.path.join(results_dir, "benchmark_results.jsonl")
-
+
     if args.aggregate:
         if not runs:
             raise FileNotFoundError(
@@ -312,18 +316,20 @@ def main() -> None:
     if mode == "aggregate":
         # Multi-run aggregation with majority voting
         preds_by_task, first_seen_order = aggregate_multiple_runs(results_dir)
-
+
         # Apply majority voting
         final_predictions = {}
         vote_counts = {}
-
+
         for task_id in preds_by_task:
             voted_pred, counts = majority_vote(preds_by_task[task_id], first_seen_order)
             final_predictions[task_id] = voted_pred
             vote_counts[task_id] = counts
-
-        write_submission_file(final_predictions, output_file, is_aggregated=True, vote_counts=vote_counts)
-
+
+        write_submission_file(
+            final_predictions, output_file, is_aggregated=True, vote_counts=vote_counts
+        )
+
     else:
         # Single run extraction
         predictions = process_single_run(results_dir)
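
Note: the aggregation path above calls a majority_vote helper that is not part of this diff. As a rough illustration only, here is a minimal sketch of what such a helper could look like, assuming it returns the most frequent prediction for a task plus the raw counts; the tie-break via first_seen_order is a guess at the real behavior, not taken from the repository.

from collections import Counter
from typing import Dict, List, Tuple


def majority_vote(
    predictions: List[str], first_seen_order: Dict[str, int]
) -> Tuple[str, Dict[str, int]]:
    # Count how often each distinct prediction appears across runs.
    counts = Counter(predictions)
    best = max(counts.values())
    # Break ties by whichever candidate was seen first (assumed semantics of
    # first_seen_order; the actual helper may differ).
    tied = [p for p, c in counts.items() if c == best]
    winner = min(tied, key=lambda p: first_seen_order.get(p, len(first_seen_order)))
    return winner, dict(counts)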

utils/prepare_benchmark/gen_futurex.py

Lines changed: 8 additions & 8 deletions
@@ -12,16 +12,16 @@
 def gen_futurex(hf_token: str) -> Generator[Task, None, None]:
     """
     Generate Futurex-Online dataset tasks in MiroFlow format
-
+
     Args:
         hf_token: Hugging Face token for dataset access
-
+
     Yields:
         Task: Standardized task objects
     """
     # Load the Futurex-Online dataset
     dataset = load_dataset("futurex-ai/Futurex-Online")
-
+
     # Process each split in the dataset
     for split_name, split_data in dataset.items():
         for idx, sample in enumerate(split_data):
@@ -30,26 +30,26 @@ def gen_futurex(hf_token: str) -> Generator[Task, None, None]:
             task_question = sample.get("prompt", "")
             end_time = sample.get("end_time", "")
             level = sample.get("level", "")
-
+
             # Create metadata dictionary
             metadata: MutableMapping = {
                 "level": level,
                 "end_time": end_time,
                 "source": "futurex-ai/Futurex-Online",
                 "split": split_name,
                 "original_id": sample.get("id", ""),
-                "dataset_name": "Futurex-Online"
+                "dataset_name": "Futurex-Online",
             }
-
+
             # Create standardized Task object
             task = Task(
                 task_id=task_id,
                 task_question=task_question,
                 ground_truth="",  # Futurex-Online doesn't have ground truth
-                file_path=None, # No file attachments
+                file_path=None,  # No file attachments
                 metadata=metadata,
             )
-
+
             yield task

     return
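
For context, a hypothetical driver for the generator above might look like the following. The import path simply mirrors the file location, attribute access on Task assumes a dataclass-like object, and HF_TOKEN is an assumed environment variable; none of this is prescribed by the diff.

import os

from utils.prepare_benchmark.gen_futurex import gen_futurex

# Iterate lazily and inspect the first generated task (illustrative only).
for task in gen_futurex(hf_token=os.environ.get("HF_TOKEN", "")):
    print(task.task_id, task.metadata["split"], task.metadata["end_time"])
    break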

utils/prepare_benchmark/gen_xbench_ds.py

Lines changed: 11 additions & 4 deletions
@@ -14,10 +14,11 @@ def xor_decrypt(data, key):
     """
     XOR decrypt data with a key
     """
-    key_bytes = key.encode('utf-8')
+    key_bytes = key.encode("utf-8")
     key_length = len(key_bytes)
     return bytes([data[i] ^ key_bytes[i % key_length] for i in range(len(data))])

+
 def gen_xbench_ds(hf_token: str) -> Generator[Task, None, None]:
     dataset = load_dataset(
         "xbench/DeepSearch",
@@ -28,9 +29,15 @@ def gen_xbench_ds(hf_token: str) -> Generator[Task, None, None]:
         task_id = metadata.pop("id")

         key = metadata.pop("canary")
-        prompt = xor_decrypt(base64.b64decode(metadata.pop("prompt")), key).decode('utf-8')
-        answer = xor_decrypt(base64.b64decode(metadata.pop("answer")), key).decode('utf-8')
-        reference_steps = xor_decrypt(base64.b64decode(metadata.pop("reference_steps")), key).decode('utf-8')
+        prompt = xor_decrypt(base64.b64decode(metadata.pop("prompt")), key).decode(
+            "utf-8"
+        )
+        answer = xor_decrypt(base64.b64decode(metadata.pop("answer")), key).decode(
+            "utf-8"
+        )
+        reference_steps = xor_decrypt(
+            base64.b64decode(metadata.pop("reference_steps")), key
+        ).decode("utf-8")
         task = Task(
             task_id=task_id,
             task_question=prompt,
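
Since XOR is symmetric, the same helper both encrypts and decrypts. A quick round-trip sketch (with a made-up key and prompt; real canary keys come from the dataset records) shows how the base64-encoded fields above are recovered:

import base64


def xor_decrypt(data, key):
    # Same helper as in the diff above, repeated so the snippet is self-contained.
    key_bytes = key.encode("utf-8")
    key_length = len(key_bytes)
    return bytes([data[i] ^ key_bytes[i % key_length] for i in range(len(data))])


plaintext = "What is the capital of France?"
key = "example-canary"  # hypothetical; the real key is the record's "canary" field

encrypted = xor_decrypt(plaintext.encode("utf-8"), key)  # XOR "encrypt"
stored = base64.b64encode(encrypted).decode("ascii")     # as it would be stored
restored = xor_decrypt(base64.b64decode(stored), key).decode("utf-8")
assert restored == plaintext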

utils/progress_check/check_futurex_progress.py

Lines changed: 35 additions & 9 deletions
@@ -4,7 +4,7 @@

 This script analyzes Futurex-Online benchmark results in a log folder to count:
 - Total files processed
-- Files with status "completed"
+- Files with status "completed"
 - Files with predictions (final_boxed_answer)
 - Files with errors

@@ -68,7 +68,6 @@ def analyze_futurex_results(log_folder: str) -> Dict[str, int]:
             status = data.get("status", "").lower()
             final_answer = data.get("final_boxed_answer", "")
             error_msg = data.get("error", "")
-            judge_result = data.get("judge_result", "")

             # Count by status
             if status == "completed":
@@ -88,7 +87,14 @@ def analyze_futurex_results(log_folder: str) -> Dict[str, int]:
             # Count by prediction availability
             if final_answer and final_answer.strip():
                 results["with_predictions"] += 1
-                prediction_files.append((json_file.name, final_answer[:100] + "..." if len(final_answer) > 100 else final_answer))
+                prediction_files.append(
+                    (
+                        json_file.name,
+                        final_answer[:100] + "..."
+                        if len(final_answer) > 100
+                        else final_answer,
+                    )
+                )
             else:
                 results["without_predictions"] += 1

@@ -136,11 +142,17 @@ def display_results(
     with_errors = results["with_errors"]

     print(f"Total files processed: {total:3d}")
-    print(f"Files with status 'completed': {completed:3d} ({completed/total*100:.1f}%)")
+    print(
+        f"Files with status 'completed': {completed:3d} ({completed/total*100:.1f}%)"
+    )
     print(f"Files with status 'running': {running:3d} ({running/total*100:.1f}%)")
     print(f"Files with status 'failed': {failed:3d} ({failed/total*100:.1f}%)")
-    print(f"Files with predictions: {with_predictions:3d} ({with_predictions/total*100:.1f}%)")
-    print(f"Files with errors: {with_errors:3d} ({with_errors/total*100:.1f}%)")
+    print(
+        f"Files with predictions: {with_predictions:3d} ({with_predictions/total*100:.1f}%)"
+    )
+    print(
+        f"Files with errors: {with_errors:3d} ({with_errors/total*100:.1f}%)"
+    )
     print(f"Files with parse errors: {results['parse_errors']:3d}")

     if completed > 0:
@@ -200,10 +212,24 @@ def main():

     try:
         print(f"Analyzing Futurex-Online benchmark results in: {log_folder}")
-        results, completed_files, running_files, failed_files, prediction_files, error_files, parse_error_files = analyze_futurex_results(
-            log_folder
+        (
+            results,
+            completed_files,
+            running_files,
+            failed_files,
+            prediction_files,
+            error_files,
+            parse_error_files,
+        ) = analyze_futurex_results(log_folder)
+        display_results(
+            results,
+            completed_files,
+            running_files,
+            failed_files,
+            prediction_files,
+            error_files,
+            parse_error_files,
         )
-        display_results(results, completed_files, running_files, failed_files, prediction_files, error_files, parse_error_files)

     except Exception as e:
         print(f"Error: {e}")
