⚡️ Speed up function get_download_dir by 205%
#25
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
📄 205% (2.05x) speedup for
get_download_dirinskyvern/forge/sdk/api/files.py⏱️ Runtime :
5.07 milliseconds→1.66 milliseconds(best of250runs)📝 Explanation and details
The optimization achieves a 204% speedup by eliminating redundant filesystem operations through two key changes:
What was optimized:
Pre-computed base directory: The downloads base path is calculated once at module import time (
_DOWNLOADS_BASE_DIR) and the base directory is created upfront, rather than reconstructing the path and checking/creating directories on every function call.Conditional directory creation: Added
os.path.exists()check before callingos.makedirs()to avoid unnecessary syscalls when the directory already exists.Why this is faster:
REPO_ROOT_DIR/downloadsstring building (9% time reduction in string formatting).os.makedirs()unconditionally on every invocation, which internally performs filesystem checks even withexist_ok=True. The optimized version skips this entirely when the directory exists (88.8% → 0% time spent in makedirs for existing directories).Performance characteristics:
os.path.exists()check is neededThis optimization is particularly effective for workloads where
get_download_dir()is called repeatedly with the same or previously-used run_ids, which appears common based on the test patterns showing significant performance gains in repeated access scenarios.✅ Correctness verification report:
🌀 Generated Regression Tests and Runtime
import os
import shutil
import uuid
from typing import Optional
imports
import pytest # used for our unit tests
from skyvern.forge.sdk.api.files import get_download_dir
Simulate REPO_ROOT_DIR for testing purposes
REPO_ROOT_DIR = os.path.abspath("test_repo_root")
from skyvern.forge.sdk.api.files import get_download_dir
========== BASIC TEST CASES ==========
def test_basic_valid_run_id():
"""Test with a simple valid run_id string."""
run_id = "test123"
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.21μs -> 2.68μs (206% faster)
def test_basic_run_id_with_special_chars():
"""Test with run_id containing special characters."""
run_id = "run_!@#$%^&*()"
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 7.71μs -> 2.47μs (212% faster)
def test_basic_run_id_is_numeric():
"""Test with a numeric run_id."""
run_id = "123456"
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 7.64μs -> 2.68μs (185% faster)
def test_basic_run_id_is_none():
"""Test with run_id as None."""
run_id = None
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.23μs -> 2.83μs (191% faster)
def test_basic_run_id_is_empty_string():
"""Test with run_id as empty string."""
run_id = ""
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.81μs -> 2.50μs (252% faster)
========== EDGE TEST CASES ==========
def test_edge_run_id_is_long_string():
"""Test with a very long run_id string."""
run_id = "a" * 255 # Typical filesystem limit
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.70μs -> 3.08μs (182% faster)
def test_edge_run_id_with_path_traversal():
"""Test with run_id containing path traversal characters."""
run_id = "../outside"
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.45μs -> 2.96μs (186% faster)
def test_edge_run_id_with_unicode():
"""Test with run_id containing unicode characters."""
run_id = "测试_🚀"
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 9.34μs -> 3.14μs (198% faster)
def test_edge_run_id_is_dot():
"""Test with run_id as '.' (current directory)."""
run_id = "."
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.36μs -> 2.75μs (204% faster)
def test_edge_run_id_is_dot_dot():
"""Test with run_id as '..' (parent directory)."""
run_id = ".."
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.22μs -> 2.68μs (207% faster)
def test_edge_run_id_is_slash():
"""Test with run_id as a slash (should create nested dirs)."""
run_id = "nested/dir"
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.60μs -> 2.79μs (208% faster)
def test_edge_existing_directory():
"""Test when the download directory already exists."""
run_id = "already_exists"
codeflash_output = get_download_dir(run_id); dir_path = codeflash_output # 8.64μs -> 2.85μs (204% faster)
# Create a file inside the directory
file_path = os.path.join(dir_path, "dummy.txt")
with open(file_path, "w") as f:
f.write("hello")
# Call again, should not remove the file
codeflash_output = get_download_dir(run_id); result = codeflash_output # 5.97μs -> 1.89μs (217% faster)
def test_edge_run_id_is_space():
"""Test with run_id as a space."""
run_id = " "
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.68μs -> 2.70μs (221% faster)
def test_edge_run_id_is_tab():
"""Test with run_id as a tab character."""
run_id = "\t"
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.18μs -> 2.62μs (212% faster)
def test_edge_run_id_is_newline():
"""Test with run_id as a newline character."""
run_id = "\n"
expected_dir = f"{REPO_ROOT_DIR}/downloads/{run_id}"
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.27μs -> 2.76μs (200% faster)
========== LARGE SCALE TEST CASES ==========
def test_large_scale_many_download_dirs():
"""Test creating many download directories to check scalability."""
num_dirs = 500 # Reasonable limit for test environment
run_ids = [f"run_{i}" for i in range(num_dirs)]
paths = []
for run_id in run_ids:
codeflash_output = get_download_dir(run_id); path = codeflash_output # 1.81ms -> 597μs (203% faster)
paths.append(path)
for path in paths:
pass
def test_large_scale_long_unique_run_ids():
"""Test with many long, unique run_ids (simulate UUIDs)."""
num_dirs = 100
run_ids = [str(uuid.uuid4()) for _ in range(num_dirs)]
paths = []
for run_id in run_ids:
codeflash_output = get_download_dir(run_id); path = codeflash_output # 366μs -> 118μs (210% faster)
paths.append(path)
for path in paths:
pass
def test_large_scale_nested_run_ids():
"""Test with nested run_ids to create deep directory structures."""
num_dirs = 50
run_ids = [f"level1/level2/level3/run_{i}" for i in range(num_dirs)]
paths = []
for run_id in run_ids:
codeflash_output = get_download_dir(run_id); path = codeflash_output # 201μs -> 65.7μs (207% faster)
paths.append(path)
for path in paths:
pass
def test_large_scale_repeated_calls():
"""Test repeated calls to the same run_id."""
run_id = "repeat_test"
for _ in range(100):
codeflash_output = get_download_dir(run_id); path = codeflash_output # 359μs -> 116μs (210% faster)
#------------------------------------------------
import os
import shutil
import tempfile
imports
import pytest
from skyvern.forge.sdk.api.files import get_download_dir
--- Function to test ---
Simulate the REPO_ROOT_DIR constant as it would be imported in the actual code
REPO_ROOT_DIR = tempfile.mkdtemp(prefix="repo_root_dir_") # Use a temp dir for isolation
from skyvern.forge.sdk.api.files import get_download_dir
---------------------------
1. Basic Test Cases
---------------------------
def test_basic_valid_run_id_str():
"""Test with a standard run_id string."""
run_id = "abc123"
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 10.2μs -> 2.60μs (291% faster)
def test_basic_numeric_run_id():
"""Test with a numeric run_id."""
run_id = "456789"
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 9.12μs -> 2.59μs (253% faster)
def test_basic_alphanumeric_run_id():
"""Test with an alphanumeric run_id."""
run_id = "run42test"
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.77μs -> 2.67μs (229% faster)
def test_basic_run_id_with_dash_underscore():
"""Test with run_id containing dashes and underscores."""
run_id = "run-42_test"
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.69μs -> 2.76μs (215% faster)
---------------------------
2. Edge Test Cases
---------------------------
def test_edge_run_id_none():
"""Test with run_id as None (should create .../downloads/None)."""
run_id = None
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", "None")
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.81μs -> 2.92μs (202% faster)
def test_edge_run_id_empty_string():
"""Test with run_id as an empty string (should create .../downloads/)."""
run_id = ""
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", "")
codeflash_output = get_download_dir(run_id); result = codeflash_output # 9.42μs -> 2.73μs (246% faster)
def test_edge_run_id_special_characters():
"""Test with run_id containing special filesystem-safe characters."""
run_id = "run!@#$%^&()[]{}"
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.65μs -> 2.75μs (215% faster)
def test_edge_run_id_unicode():
"""Test with run_id containing unicode characters."""
run_id = "测试下载"
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 9.49μs -> 3.20μs (197% faster)
def test_edge_run_id_dot_dot_slash():
"""Test with run_id containing path traversal (should not escape downloads/)."""
run_id = "../escape"
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.76μs -> 2.89μs (203% faster)
def test_edge_run_id_long_string():
"""Test with a very long run_id string (up to 255 chars, typical FS limit)."""
run_id = "a" * 255
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 9.05μs -> 3.11μs (191% faster)
def test_edge_existing_directory():
"""Test when the directory already exists (should not error)."""
run_id = "existing_dir"
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
# Pre-create the directory
os.makedirs(expected_dir, exist_ok=True)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 5.16μs -> 1.78μs (190% faster)
def test_edge_run_id_with_slash():
"""Test with run_id containing a slash (should create nested directories)."""
run_id = "nested/dir"
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", "nested", "dir")
codeflash_output = get_download_dir(run_id); result = codeflash_output # 8.19μs -> 2.62μs (212% faster)
---------------------------
3. Large Scale Test Cases
---------------------------
def test_large_scale_many_run_ids():
"""Test creating many download directories in a loop."""
run_ids = [f"run_{i}" for i in range(500)] # 500 is large but under 1000
created_dirs = []
for run_id in run_ids:
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 1.82ms -> 599μs (203% faster)
created_dirs.append(result)
for d in created_dirs:
pass
def test_large_scale_long_run_ids():
"""Test creating directories with long run_ids in bulk."""
run_ids = [str(i) * 50 for i in range(20)] # Each run_id is 50 chars, 20 dirs
for run_id in run_ids:
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", run_id)
codeflash_output = get_download_dir(run_id); result = codeflash_output # 82.5μs -> 27.2μs (204% faster)
def test_large_scale_nested_run_ids():
"""Test creating nested directories using slashes in run_id."""
run_ids = [f"group{i}/subgroup{j}" for i in range(10) for j in range(5)] # 50 nested dirs
for run_id in run_ids:
expected_dir = os.path.join(REPO_ROOT_DIR, "downloads", *run_id.split("/"))
codeflash_output = get_download_dir(run_id); result = codeflash_output # 194μs -> 63.4μs (207% faster)
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
To edit these changes
git checkout codeflash/optimize-get_download_dir-mhtybgvtand push.