Skip to content

Commit f16920c

Browse files
committed
WIP "post_dry_run" concept for #406
1 parent 7726839 commit f16920c

File tree

3 files changed

+65
-19
lines changed

3 files changed

+65
-19
lines changed

openeo_driver/ProcessGraphDeserializer.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -469,16 +469,33 @@ def evaluate(
469469
if do_dry_run:
470470
dry_run_tracer = do_dry_run if isinstance(do_dry_run, DryRunDataTracer) else DryRunDataTracer()
471471
_log.info("Doing dry run")
472-
convert_node(top_level_node, env=env.push({
473-
ENV_DRY_RUN_TRACER: dry_run_tracer,
474-
ENV_SAVE_RESULT: [], # otherwise dry run and real run append to the same mutable result list
475-
"node_caching": False
476-
}))
472+
dry_run_result = convert_node(
473+
top_level_node,
474+
env=env.push(
475+
{
476+
ENV_DRY_RUN_TRACER: dry_run_tracer,
477+
ENV_SAVE_RESULT: [], # otherwise dry run and real run append to the same mutable result list
478+
"node_caching": False,
479+
}
480+
),
481+
)
477482
# TODO: work with a dedicated DryRunEvalEnv?
478483
source_constraints = dry_run_tracer.get_source_constraints()
479484
_log.info("Dry run extracted these source constraints: {s}".format(s=source_constraints))
485+
486+
# TODO: Given the post-dry-run hook concept: is it still necessary to push source_constraints into env?
480487
env = env.push({ENV_SOURCE_CONSTRAINTS: source_constraints})
481488

489+
post_dry_run_data = env.backend_implementation.post_dry_run(
490+
dry_run_result=dry_run_result,
491+
dry_run_tracer=dry_run_tracer,
492+
source_constraints=source_constraints,
493+
# TODO: use env before ENV_SOURCE_CONSTRAINTS have been pushed?
494+
env=env,
495+
)
496+
if post_dry_run_data:
497+
env = env.push(post_dry_run_data)
498+
482499
result = convert_node(top_level_node, env=env)
483500
if len(env[ENV_SAVE_RESULT]) > 0:
484501
if len(env[ENV_SAVE_RESULT]) == 1:
@@ -763,17 +780,17 @@ def _extract_load_parameters(env: EvalEnv, source_id: tuple) -> LoadParameters:
763780

764781
if extent is not None:
765782
collection_crs = _collection_crs(collection_id=collection_id, env=env)
766-
crs = constraint.get("resample", {}).get("target_crs", collection_crs) or collection_crs
783+
target_crs = constraint.get("resample", {}).get("target_crs", collection_crs) or collection_crs
767784
target_resolution = constraint.get("resample", {}).get("resolution", None) or _collection_resolution(
768785
collection_id=collection_id, env=env
769786
)
770787

771788
if "pixel_buffer" in constraint:
772789
buffer = constraint["pixel_buffer"]["buffer_size"]
773790

774-
if (crs is not None) and target_resolution:
791+
if (target_crs is not None) and target_resolution:
775792
bbox = BoundingBox.from_dict(extent, default_crs=4326)
776-
extent = bbox.reproject(crs).as_dict()
793+
extent = bbox.reproject(target_crs).as_dict()
777794

778795
extent = {
779796
"west": extent["west"] - target_resolution[0] * math.ceil(buffer[0]),
@@ -785,11 +802,11 @@ def _extract_load_parameters(env: EvalEnv, source_id: tuple) -> LoadParameters:
785802
else:
786803
_log.warning("Not applying buffer to extent because the target CRS is not known.")
787804

788-
load_collection_in_native_grid = "resample" not in constraint or crs == collection_crs
805+
load_collection_in_native_grid = "resample" not in constraint or target_crs == collection_crs
789806
if (not load_collection_in_native_grid) and collection_crs is not None and ("42001" in str(collection_crs)):
790807
#resampling auto utm to utm means we are loading in native grid
791808
try:
792-
load_collection_in_native_grid = "UTM zone" in CRS.from_user_input(crs).to_wkt()
809+
load_collection_in_native_grid = "UTM zone" in CRS.from_user_input(target_crs).to_wkt()
793810
except CRSError as e:
794811
pass
795812

@@ -2608,7 +2625,7 @@ def load_stac(args: ProcessArgs, env: EvalEnv) -> DriverDataCube:
26082625

26092626
dry_run_tracer: DryRunDataTracer = env.get(ENV_DRY_RUN_TRACER)
26102627
if dry_run_tracer:
2611-
return dry_run_tracer.load_stac(url, arguments, env)
2628+
return dry_run_tracer.load_stac(url=url, arguments=arguments, env=env)
26122629
else:
26132630
source_id = dry_run.DataSource.load_stac(
26142631
url, properties=arguments.get("properties", {}), bands=arguments.get("bands", []), env=env

openeo_driver/backend.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import sys
1717
from datetime import datetime, timedelta
1818
from pathlib import Path
19-
from typing import List, Union, NamedTuple, Dict, Optional, Callable, Iterable, Container
19+
from typing import List, Union, NamedTuple, Dict, Optional, Callable, Iterable, Container, Any
2020

2121
import flask
2222

@@ -28,6 +28,7 @@
2828
from openeo_driver.config import OpenEoBackendConfig, get_backend_config
2929
from openeo_driver.datacube import DriverDataCube, DriverMlModel, DriverVectorCube
3030
from openeo_driver.datastructs import SarBackscatterArgs
31+
from openeo_driver.dry_run import DryRunDataTracer, SourceConstraint, DryRunDataCube
3132
from openeo_driver.errors import CollectionNotFoundException, ServiceUnsupportedException, FeatureUnsupportedException
3233
from openeo_driver.constants import JOB_STATUS, DEFAULT_LOG_LEVEL_RETRIEVAL
3334
from openeo_driver.processes import ProcessRegistry
@@ -1004,6 +1005,32 @@ def request_costs(
10041005
"""
10051006
return None
10061007

1008+
def post_dry_run(
1009+
self,
1010+
*,
1011+
dry_run_result: Union[DryRunDataCube, Any],
1012+
dry_run_tracer: DryRunDataTracer,
1013+
source_constraints: List[SourceConstraint],
1014+
env: EvalEnv,
1015+
) -> Union[None, dict]:
1016+
"""
1017+
Hook to analyse the outcome of a full dry-run evaluation
1018+
and set some additional "global" EvalEnv state for the wet run evaluation.
1019+
For example, to set process-graph-wide, preferred processing hints/directives
1020+
about CRS, projection, resolution, alignment, partitioning, ...
1021+
1022+
This is an experimental API, the set of available arguments is still in flux.
1023+
1024+
:param dry_run_result: result of dry-run evaluation of process graph (typically a DryRunDataCube)
1025+
:param dry_run_tracer: tracer used in dry-run evaluation
1026+
:param env: EvalEnv as used in dry-run evaluation
1027+
:param source_constraints: source constraints extracted from the dry-run tracer (via `get_source_constraints()`)
1028+
1029+
:return: dict with extra state to push to the EvalEnv before triggering the wet run,
1030+
or None (to push nothing).
1031+
"""
1032+
return None
1033+
10071034

10081035
def function_has_argument(function: Callable, argument: str) -> bool:
10091036
"""Does function support given argument?"""

openeo_driver/dry_run.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,22 @@
1212
but pushing around dummy data cubes.
1313
1414
The architecture consists of these classes:
15-
- DataTrace: starts from a `load_collection` (or other source) process and records what happens to this
16-
single data source (filter_temporal, filter_bbox, ...)
15+
- DataTrace: starts from a `load_collection`, `load_stac` or other source process
16+
and records what happens to this single data source (filter_temporal, filter_bbox, ...)
1717
- DryRunDataTracer: observer that keeps track of all data traces during a dry run
18-
- DryRunDataCube: dummy data cube that is passed around in processed
18+
- DryRunDataCube: dummy data cube that is passed around during (dry-run) processing
1919
2020
Their relationship is as follows:
2121
- There is a single DryRunDataTracer for a dry-run, keeping track of all relevant operations on all sources
2222
- A DryRunDataCube links to one or more DataTraces, describing the operations that happened
2323
on the sources that lead to the state of the DryRunDataCube. Often there is just one DataTrace
2424
in a DryRunDataCube, but when the DryRunDataCube is result of mask or merge_cubes operations,
2525
there will be multiple DataTraces.
26-
A DryRunDataCube also has a reference to the DryRunDataTracer in play, so that it can be informed
27-
when processes are applied to the DryRunDataCube.
26+
A DryRunDataCube also has a reference to the DryRunDataTracer in play,
27+
so that this tracer can be informed when processes are applied to the DryRunDataCube.
2828
2929
When the dry-run phase is done, the DryRunDataTracer knows about all relevant operations
30-
on each data source. It provides methods for example to extract source constraints (bbox/bands/date ranges)
30+
on each data source. It provides methods to extract source constraints (bbox/bands/date ranges)
3131
which are used to bootstrap the EvalEnv that is used for the real process graph processing phase.
3232
These source constraints can then be fetched from the EvalEnv at `load_collection` time.
3333
@@ -205,7 +205,9 @@ def load_stac(cls, url: str, properties={}, bands=[], env=EvalEnv()) -> "DataSou
205205

206206
class DataTrace(DataTraceBase):
207207
"""
208-
Processed data: linked list of processes, ending at a data source node.
208+
Processed data: chain of processes/operations (with arguments),
209+
linked together through parent-child relations,
210+
originating from a source node `DataSource` (final parent in the chain).
209211
210212
Note: this is not the same as a data cube, as a data cube can be combination of multiple data
211213
traces (e.g. after mask or merge process).

0 commit comments

Comments
 (0)