Skip to content

Commit 2284f7b

Browse files
committed
improve get_smallest_cron_interval
1 parent bb4a8bb commit 2284f7b

File tree

4 files changed

+886
-150
lines changed

4 files changed

+886
-150
lines changed

python_modules/dagster/dagster/_utils/schedules.py

Lines changed: 162 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -866,13 +866,166 @@ def get_smallest_cron_interval(
866866
cron_string: str,
867867
execution_timezone: Optional[str] = None,
868868
) -> datetime.timedelta:
869-
"""Find the smallest interval between cron ticks for a given cron schedule.
869+
"""Find the smallest interval between cron ticks for a given cron schedule using deterministic
870+
analysis of the cron pattern.
870871
871-
Uses a sampling-based approach to find the minimum interval by generating
872+
This function parses the cron string and algebraically determines the minimum interval without
873+
sampling. This is more efficient and deterministic than the sampling-based approach for most
874+
common patterns.
875+
876+
For complex patterns that cannot be analyzed deterministically (e.g., patterns with both
877+
day-of-month AND day-of-week constraints, or irregular intervals), this falls back to the
878+
sampling-based approach.
879+
880+
Args:
881+
cron_string: A cron string
882+
execution_timezone: Timezone for the sampling fallback; deterministic results ignore it
883+
884+
Returns:
885+
The smallest timedelta between any two consecutive cron ticks
886+
887+
Raises:
888+
CheckError: If the cron string is invalid or not recognized by Dagster
889+
"""
890+
check.invariant(
891+
is_valid_cron_string(cron_string), desc=f"{cron_string} must be a valid cron string"
892+
)
893+
894+
# Parse the cron string into its components: [minutes, hours, day_of_month, month, day_of_week]
895+
# Each component is a list of int or '*'
896+
cron_parts, nth_weekday_of_month, *_ = CroniterShim.expand(cron_string)
897+
898+
# If nth_weekday_of_month is used (e.g., "first Monday of the month"), fall back to sampling
899+
if nth_weekday_of_month:
900+
return _get_smallest_cron_interval_with_sampling(cron_string, execution_timezone)
901+
902+
minutes, hours, days_of_month, months, days_of_week = cron_parts
903+
904+
# Helper function to get the smallest gap between adjacent values of a list (sorted internally)
905+
def get_smallest_gap(values: list[int], wrap_at: Optional[int] = None) -> Optional[int]:
906+
"""Get the smallest gap between consecutive values in a sorted list.
907+
908+
Args:
909+
values: List of integer values
910+
wrap_at: If provided, also considers wrap-around gap (e.g., 60 for minutes)
911+
"""
912+
if len(values) < 2:
913+
return None
914+
sorted_values = sorted(values)
915+
916+
# Calculate gaps between consecutive values
917+
gaps = [sorted_values[i + 1] - sorted_values[i] for i in range(len(sorted_values) - 1)]
918+
919+
# If wrap_at is provided, also consider the wrap-around gap
920+
if wrap_at is not None:
921+
wrap_gap = (wrap_at - sorted_values[-1]) + sorted_values[0]
922+
gaps.append(wrap_gap)
923+
924+
return min(gaps)
925+
926+
# Determine if each field is constrained or wildcarded
927+
minutes_is_wildcard = len(minutes) == 1 and minutes[0] == "*"
928+
hours_is_wildcard = len(hours) == 1 and hours[0] == "*"
929+
days_of_month_is_wildcard = len(days_of_month) == 1 and days_of_month[0] == "*"
930+
months_is_wildcard = len(months) == 1 and months[0] == "*"
931+
days_of_week_is_wildcard = len(days_of_week) == 1 and days_of_week[0] == "*"
932+
933+
# If both day_of_month and day_of_week are constrained, they use OR logic which is complex
934+
# Fall back to sampling for these cases
935+
if not days_of_month_is_wildcard and not days_of_week_is_wildcard:
936+
return _get_smallest_cron_interval_with_sampling(cron_string, execution_timezone)
937+
938+
# Extract numeric values (filter out '*')
939+
minute_values = [m for m in minutes if m != "*"]
940+
hour_values = [h for h in hours if h != "*"]
941+
day_of_week_values = [d for d in days_of_week if d != "*"]
942+
943+
# Case 1: Minutes are wildcarded (* in minutes position)
944+
# This means the job runs every minute during the matching hours
945+
if minutes_is_wildcard:
946+
return datetime.timedelta(minutes=1)
947+
948+
# Case 2: Multiple minute values specified (e.g., "0,15,30,45")
949+
# The smallest interval is the minimum gap between minute values
950+
if len(minute_values) > 1:
951+
min_minute_gap = get_smallest_gap(minute_values, wrap_at=60)
952+
if min_minute_gap is not None:
953+
# If hours/days/months/weekdays are all wildcarded, this is the answer
954+
if (
955+
hours_is_wildcard
956+
and days_of_month_is_wildcard
957+
and months_is_wildcard
958+
and days_of_week_is_wildcard
959+
):
960+
return datetime.timedelta(minutes=min_minute_gap)
961+
# Otherwise, we need to consider if the time constraints might make consecutive ticks
962+
# happen at different hours/days. This is complex, so fall back to sampling.
963+
return _get_smallest_cron_interval_with_sampling(cron_string, execution_timezone)
964+
965+
# Case 3: Single minute value specified (e.g., "0" or "15")
966+
# Now we need to look at the hour constraints
967+
if len(minute_values) == 1:
968+
# If hours are wildcarded, runs every hour at that minute
969+
if hours_is_wildcard:
970+
# Check day/month/week constraints
971+
if days_of_month_is_wildcard and months_is_wildcard and days_of_week_is_wildcard:
972+
return datetime.timedelta(hours=1)
973+
# If days/months/weeks are constrained, fall back to sampling
974+
return _get_smallest_cron_interval_with_sampling(cron_string, execution_timezone)
975+
976+
# Multiple hour values specified
977+
if len(hour_values) > 1:
978+
min_hour_gap = get_smallest_gap(hour_values, wrap_at=24)
979+
if min_hour_gap is not None:
980+
# If days/months/weekdays are all wildcarded, the interval is based on hours
981+
if days_of_month_is_wildcard and months_is_wildcard and days_of_week_is_wildcard:
982+
return datetime.timedelta(hours=min_hour_gap)
983+
# Otherwise, constraints might make it more complex
984+
return _get_smallest_cron_interval_with_sampling(cron_string, execution_timezone)
985+
986+
# Single hour value specified (e.g., "0 0 * * *" - daily at midnight)
987+
if len(hour_values) == 1:
988+
# Daily pattern: specific minute and hour, all days
989+
if days_of_month_is_wildcard and months_is_wildcard and days_of_week_is_wildcard:
990+
return datetime.timedelta(days=1)
991+
992+
# Weekly pattern: specific minute, hour, and day of week
993+
if days_of_month_is_wildcard and months_is_wildcard and len(day_of_week_values) == 1:
994+
return datetime.timedelta(days=7)
995+
996+
# Multiple days of week (e.g., Mon, Wed, Fri)
997+
if days_of_month_is_wildcard and months_is_wildcard and len(day_of_week_values) > 1:
998+
min_dow_gap = get_smallest_gap(day_of_week_values, wrap_at=7)
999+
if min_dow_gap is not None:
1000+
return datetime.timedelta(days=min_dow_gap)
1001+
1002+
# Monthly pattern: specific day of month
1003+
if not days_of_month_is_wildcard and months_is_wildcard and days_of_week_is_wildcard:
1004+
# For monthly patterns, the interval varies (28-31 days depending on the month)
1005+
# Fall back to sampling for accuracy
1006+
return _get_smallest_cron_interval_with_sampling(cron_string, execution_timezone)
1007+
1008+
# Complex pattern with month constraints
1009+
if not months_is_wildcard:
1010+
return _get_smallest_cron_interval_with_sampling(cron_string, execution_timezone)
1011+
1012+
# If we haven't returned yet, fall back to sampling-based approach
1013+
return _get_smallest_cron_interval_with_sampling(cron_string, execution_timezone)
1014+
1015+
1016+
def _get_smallest_cron_interval_with_sampling(
1017+
cron_string: str,
1018+
execution_timezone: Optional[str] = None,
1019+
) -> datetime.timedelta:
1020+
"""Find the smallest interval between cron ticks for a given cron schedule,
1021+
using a sampling-based approach to find the minimum interval by generating
8721022
consecutive cron ticks and measuring the gaps between them. Sampling stops
8731023
early if either of these limits is reached:
8741024
- A maximum of 1000 generated ticks
875-
- A time horizon of 20 years past the sampling start
1025+
- A time horizon of 20 years past the sampling start.
1026+
1027+
This is a fallback for complex patterns that cannot be analyzed deterministically,
1028+
and shouldn't be used for common patterns.
8761029
8771030
Args:
8781031
cron_string: A cron string
@@ -937,6 +1090,12 @@ def get_smallest_cron_interval(
9371090
# We've encountered a genuine zero interval (which shouldn't happen)
9381091
raise Exception("Encountered a genuine zero interval")
9391092

1093+
if interval < datetime.timedelta(seconds=0):
1094+
# This happens when sampling hits a daylight saving time transition where the clocks roll back.
1095+
# Just skip this interval and continue sampling
1096+
prev_tick = current_tick
1097+
continue
1098+
9401099
# Update minimum interval
9411100
if min_interval is None or interval < min_interval:
9421101
min_interval = interval

python_modules/dagster/dagster_tests/scheduler_tests/test_cron_string_iterator.py

Lines changed: 0 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,9 @@
66
from dagster._utils.schedules import (
77
_croniter_string_iterator,
88
cron_string_iterator,
9-
get_smallest_cron_interval,
109
is_valid_cron_string,
1110
reverse_cron_string_iterator,
1211
)
13-
from dagster_shared.check import CheckError
1412

1513

1614
def test_cron_iterator_always_advances():
@@ -678,148 +676,3 @@ def test_invalid_cron_strings():
678676

679677
assert is_valid_cron_string("0 0 31 1 *")
680678
assert not is_valid_cron_string("0 0 32 1 *")
681-
682-
683-
def test_get_smallest_cron_interval_basic():
684-
"""Test basic cron intervals return expected minimums."""
685-
# Minute intervals
686-
assert get_smallest_cron_interval("*/5 * * * *") == datetime.timedelta(minutes=5)
687-
assert get_smallest_cron_interval("*/15 * * * *") == datetime.timedelta(minutes=15)
688-
assert get_smallest_cron_interval("*/30 * * * *") == datetime.timedelta(minutes=30)
689-
690-
# Hourly intervals
691-
assert get_smallest_cron_interval("0 * * * *") == datetime.timedelta(hours=1)
692-
assert get_smallest_cron_interval("0 */6 * * *") == datetime.timedelta(hours=6)
693-
assert get_smallest_cron_interval("0 */12 * * *") == datetime.timedelta(hours=12)
694-
695-
# Daily intervals
696-
assert get_smallest_cron_interval("0 0 * * *") == datetime.timedelta(days=1)
697-
assert get_smallest_cron_interval("30 14 * * *") == datetime.timedelta(days=1)
698-
699-
# Weekly intervals
700-
assert get_smallest_cron_interval("0 0 * * 0") == datetime.timedelta(days=7)
701-
assert get_smallest_cron_interval("0 9 * * 1") == datetime.timedelta(days=7)
702-
703-
704-
def test_get_smallest_cron_interval_irregular():
705-
"""Test irregular cron schedules return correct minimum intervals."""
706-
# Multiple times per hour
707-
interval = get_smallest_cron_interval("15,45 * * * *")
708-
assert interval == datetime.timedelta(minutes=30)
709-
710-
# Multiple times per day
711-
interval = get_smallest_cron_interval("0 9,17 * * *")
712-
assert interval == datetime.timedelta(hours=8)
713-
714-
# Weekdays only
715-
interval = get_smallest_cron_interval("0 9 * * 1-5")
716-
assert interval == datetime.timedelta(days=1) # Daily on weekdays
717-
718-
# Multiple days per week
719-
interval = get_smallest_cron_interval("0 9 * * 1,3,5")
720-
assert interval == datetime.timedelta(days=2) # Mon->Wed->Fri pattern
721-
722-
723-
def test_get_smallest_cron_interval_monthly():
724-
"""Test monthly cron schedules."""
725-
# Monthly on 1st
726-
interval = get_smallest_cron_interval("0 0 1 * *")
727-
# Shortest month interval is 28 days (February)
728-
assert interval == datetime.timedelta(days=28)
729-
730-
# Monthly on 15th
731-
interval = get_smallest_cron_interval("0 12 15 * *")
732-
assert interval == datetime.timedelta(days=28)
733-
734-
735-
def test_get_smallest_cron_interval_leap_year():
736-
"""Test leap year edge case with Feb 29th."""
737-
# Feb 29th only runs on leap years
738-
interval = get_smallest_cron_interval("0 0 29 2 *")
739-
# Should be 1 year for non-leap years, but our sampling should catch 4-year intervals
740-
# during leap year sequences
741-
assert interval.days >= 365 # At least 1 year
742-
743-
# The exact value depends on when we sample, but should be reasonable
744-
assert interval.days <= 4 * 365 + 1 # At most 4 years + leap day
745-
746-
747-
def test_get_smallest_cron_interval_dst_transitions():
748-
"""Test DST transition edge cases."""
749-
# Daily at 2am in a DST timezone - should catch the 23-hour interval during spring forward
750-
interval = get_smallest_cron_interval("0 2 * * *", "America/New_York")
751-
assert interval == datetime.timedelta(hours=23)
752-
753-
# Hourly schedule should not be affected by DST for minimum interval
754-
interval = get_smallest_cron_interval("0 * * * *", "America/New_York")
755-
assert interval == datetime.timedelta(hours=1)
756-
757-
# Different timezone with DST
758-
interval = get_smallest_cron_interval("0 2 * * *", "Europe/Berlin")
759-
assert interval == datetime.timedelta(hours=23)
760-
761-
762-
def test_get_smallest_cron_interval_timezones():
763-
"""Test various timezones work correctly."""
764-
# UTC should work
765-
interval = get_smallest_cron_interval("*/10 * * * *", "UTC")
766-
assert interval == datetime.timedelta(minutes=10)
767-
768-
# Other timezones should work
769-
interval = get_smallest_cron_interval("*/10 * * * *", "Asia/Tokyo")
770-
assert interval == datetime.timedelta(minutes=10)
771-
772-
# Default timezone (UTC) should work
773-
interval = get_smallest_cron_interval("*/10 * * * *")
774-
assert interval == datetime.timedelta(minutes=10)
775-
776-
777-
def test_get_smallest_cron_interval_complex_patterns():
778-
"""Test complex cron patterns."""
779-
# Every 5 minutes during business hours on weekdays
780-
interval = get_smallest_cron_interval("*/5 9-17 * * 1-5")
781-
assert interval == datetime.timedelta(minutes=5)
782-
783-
# Multiple specific times
784-
interval = get_smallest_cron_interval("0,30 8,12,16 * * 1-5")
785-
assert interval == datetime.timedelta(minutes=30)
786-
787-
# Specific day patterns
788-
interval = get_smallest_cron_interval("0 9 1,15 * *") # 1st and 15th of month
789-
# Minimum should be 14 days (15th to 1st of next month can be 14-17 days)
790-
assert interval.days >= 14
791-
assert interval.days <= 17
792-
793-
794-
def test_get_smallest_cron_interval_edge_cases():
795-
"""Test edge cases and error conditions."""
796-
# Invalid cron string should raise error
797-
with pytest.raises(CheckError):
798-
get_smallest_cron_interval("invalid cron")
799-
800-
with pytest.raises(CheckError):
801-
get_smallest_cron_interval("0 0 32 * *") # Invalid day
802-
803-
# Valid but unusual patterns
804-
interval = get_smallest_cron_interval("0 0 * * *") # Daily
805-
assert interval == datetime.timedelta(days=1)
806-
807-
# Very frequent pattern
808-
interval = get_smallest_cron_interval("* * * * *") # Every minute
809-
assert interval == datetime.timedelta(minutes=1)
810-
811-
812-
def test_get_smallest_cron_interval_consistency():
813-
"""Test that the method returns consistent results."""
814-
# Run the same cron string multiple times to ensure consistency
815-
cron_string = "*/15 * * * *"
816-
timezone = "America/Los_Angeles"
817-
818-
results = []
819-
for _ in range(3):
820-
interval = get_smallest_cron_interval(cron_string, timezone)
821-
results.append(interval)
822-
823-
# All results should be the same
824-
assert all(result == results[0] for result in results)
825-
assert results[0] == datetime.timedelta(minutes=15)

0 commit comments

Comments
 (0)