
Commit be42cd7

Issue 793: Support partial updates in STACAPIJobDatabase.persist
related to PR #736

1 parent: 4dfdf77

3 files changed (+93, −36 lines)

openeo/extra/job_management/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -538,7 +538,7 @@ def _job_update_loop(
                         self._launch_job(start_job, df=not_started, i=i, backend_name=backend_name, stats=stats)
                         stats["job launch"] += 1
 
-                        job_db.persist(not_started.loc[i : i + 1])
+                        job_db.persist(not_started.loc[[i]])
                         stats["job_db persist"] += 1
                         total_added += 1

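A note on this one-liner: with the STAC job database now using string item ids as index labels (see the stac_job_db.py changes below), slice-based selection like `not_started.loc[i : i + 1]` breaks down, since `i + 1` assumes integer labels (and label-based `.loc` slices are end-inclusive anyway). Selecting with a list of labels returns exactly the one-row dataframe to persist. A minimal sketch, with an illustrative dataframe:

    import pandas as pd

    df = pd.DataFrame({"year": [2024, 2025]}, index=["item-2024", "item-2025"])

    # Label-list selection: a one-row DataFrame, regardless of index type.
    row = df.loc[["item-2024"]]
    assert row.shape == (1, 1)

    # Slice arithmetic like `df.loc[i : i + 1]` would raise a TypeError here,
    # as "item-2024" + 1 is not a valid label.
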
openeo/extra/job_management/stac_job_db.py

Lines changed: 38 additions & 17 deletions
@@ -57,12 +57,16 @@ def exists(self) -> bool:
     def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
         """
         Normalize the given dataframe to be compatible with :py:class:`MultiBackendJobManager`
-        by adding the default columns and setting the index.
+        by adding the default columns and using the STAC item ids as index values.
         """
         df = MultiBackendJobManager._normalize_df(df)
-        # If the user doesn't specify the item_id column, we will use the index.
-        if "item_id" not in df.columns:
-            df = df.reset_index(names=["item_id"])
+
+        if isinstance(df.index, pd.RangeIndex) and "item_id" in df.columns:
+            # Support legacy usage: default (autoincrement) index and an "item_id" column -> copy over as index
+            df.index = df["item_id"]
+
+        # Make sure the index (of item ids) are strings, to play well with (py)STAC schemas
+        df.index = df.index.astype(str)
         return df
 
     def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
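To make the reworked `_normalize_df` concrete, here is a small standalone sketch of the index handling above (plain pandas, outside the actual class):

    import pandas as pd

    # Legacy usage: default (autoincrement) RangeIndex plus an "item_id" column.
    df = pd.DataFrame({"item_id": ["item-2024", "item-2025"], "year": [2024, 2025]})
    if isinstance(df.index, pd.RangeIndex) and "item_id" in df.columns:
        # Copy the item ids over as index.
        df.index = df["item_id"]

    # Cast index values to str so they play well with (py)STAC id schemas.
    df.index = df.index.astype(str)
    assert list(df.index) == ["item-2024", "item-2025"]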
@@ -128,7 +132,7 @@ def item_from(self, series: pd.Series) -> pystac.Item:
         :return: pystac.Item
         """
         series_dict = series.to_dict()
-        item_id = series_dict.pop("item_id")
+        item_id = str(series.name)
         item_dict = {}
         item_dict.setdefault("stac_version", pystac.get_stac_version())
         item_dict.setdefault("type", "Feature")
@@ -168,6 +172,13 @@ def count_by_status(self, statuses: Iterable[str] = ()) -> dict:
         else:
             return items["status"].value_counts().to_dict()
 
+    def _search_result_to_df(self, search_result: pystac_client.ItemSearch) -> pd.DataFrame:
+        series = [self.series_from(item) for item in search_result.items()]
+        # Note: `series_from` sets the item id as the series "name",
+        # which ends up in the index of the dataframe
+        df = pd.DataFrame(series)
+        return df
+
     def get_by_status(self, statuses: Iterable[str], max: Optional[int] = None) -> pd.DataFrame:
         if isinstance(statuses, str):
             statuses = {statuses}
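The new `_search_result_to_df` helper leans on a pandas detail that is easy to miss: when a dataframe is built from a list of series, each series' `name` becomes an index label of the result. A quick sketch with made-up statuses:

    import pandas as pd

    s1 = pd.Series({"status": "finished"}, name="item-2024")
    s2 = pd.Series({"status": "running"}, name="item-2025")

    df = pd.DataFrame([s1, s2])
    # The series names ("item-2024", "item-2025") end up as the index.
    assert list(df.index) == ["item-2024", "item-2025"]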
@@ -180,35 +191,45 @@ def get_by_status(self, statuses: Iterable[str], max: Optional[int] = None) -> pd.DataFrame:
             filter=status_filter,
             max_items=max,
         )
+        df = self._search_result_to_df(search_results)
 
-        series = [self.series_from(item) for item in search_results.items()]
-
-        df = pd.DataFrame(series).reset_index(names=["item_id"])
-        if len(series) == 0:
+        if df.shape[0] == 0:
             # TODO: What if default columns are overwritten by the user?
             df = self._normalize_df(
                 df
             )  # Even for an empty dataframe the default columns are required
         return df
 
     def persist(self, df: pd.DataFrame):
+        if df.empty:
+            _log.warning("No data to persist in STAC API job database, skipping.")
+            return
+
         if not self.exists():
             spatial_extent = pystac.SpatialExtent([[-180, -90, 180, 90]])
             temporal_extent = pystac.TemporalExtent([[None, None]])
             extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)
             c = pystac.Collection(id=self.collection_id, description="STAC API job database collection.", extent=extent)
             self._create_collection(c)
 
-        all_items = []
-        if not df.empty:
-
-            def handle_row(series):
-                item = self.item_from(series)
-                all_items.append(item)
+        # Merge updates with existing items (if any)
+        existing_items = self.client.search(
+            method="GET",
+            collections=[self.collection_id],
+            ids=[str(i) for i in df.index.tolist()],
+        )
+        existing_df = self._search_result_to_df(existing_items)
 
-            df.apply(handle_row, axis=1)
+        if existing_df.empty:
+            df_to_persist = df
+        else:
+            # Merge data on item_id (in the index)
+            df_to_persist = existing_df
+            df_to_persist.update(df, overwrite=True)
 
-        self._upload_items_bulk(self.collection_id, all_items)
+        items_to_persist = [self.item_from(s) for _, s in df_to_persist.iterrows()]
+        _log.info(f"Bulk upload of {len(items_to_persist)} items to STAC API collection {self.collection_id!r}")
+        self._upload_items_bulk(self.collection_id, items_to_persist)
 
     def _prepare_item(self, item: pystac.Item, collection_id: str):
         item.collection_id = collection_id
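The partial-update semantics of the new `persist` hinge on `pandas.DataFrame.update`: it aligns on the shared (item id) index and overwrites matching cells with non-NA values from the update frame, leaving all other columns of the existing rows untouched. A minimal sketch of that merge step, with made-up rows:

    import pandas as pd

    existing = pd.DataFrame(
        {"status": ["created"], "year": [2024]},
        index=["item-2024"],
    )
    update = pd.DataFrame({"status": ["finished"]}, index=["item-2024"])

    # In-place merge: "status" is overwritten, "year" is preserved.
    existing.update(update, overwrite=True)
    assert existing.loc["item-2024", "status"] == "finished"
    assert existing.loc["item-2024", "year"] == 2024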

tests/extra/job_management/test_stac_job_db.py

Lines changed: 54 additions & 18 deletions
@@ -479,21 +479,25 @@ def _post_collections_bulk_items(self, request, context):
     def _get_search(self, request, context):
         """Handler of `GET /search` requests."""
         collections = request.qs["collections"][0].split(",")
-        filter = request.qs["filter"][0] if "filter" in request.qs else None
-
-        if filter:
-            # TODO: use a more robust CQL2-text parser?
-            assert re.match(r"^\s*\"properties\.status\"='\w+'(\s+or\s+\"properties\.status\"='\w+')*\s*$", filter)
-            statuses = re.findall(r"\"properties\.status\"='(\w+)'", filter)
-        else:
-            statuses = None
-
         items = [
             item
             for cid in collections
             for item in self.items.get(cid, {}).values()
-            if statuses is None or item.get("properties", {}).get("status") in statuses
         ]
+        if "ids" in request.qs:
+            [ids] = request.qs["ids"]
+            ids = set(ids.split(","))
+            items = [i for i in items if i.get("id") in ids]
+        if "filter" in request.qs:
+            [property_filter] = request.qs["filter"]
+            # TODO: use a more robust CQL2-text parser?
+            assert request.qs["filter-lang"] == ["cql2-text"]
+            assert re.match(
+                r"^\s*\"properties\.status\"='\w+'(\s+or\s+\"properties\.status\"='\w+')*\s*$", property_filter
+            )
+            statuses = set(re.findall(r"\"properties\.status\"='(\w+)'", property_filter))
+            items = [i for i in items if i.get("properties", {}).get("status") in statuses]
+
         return {
             "type": "FeatureCollection",
             "features": items,
@@ -502,27 +506,59 @@ def _get_search(self, request, context):
 
 
 def test_run_jobs_basic(tmp_path, dummy_backend_foo, requests_mock, sleep_mock):
-    job_manager = MultiBackendJobManager(root_dir=tmp_path, poll_sleep=2)
-    job_manager.add_backend("foo", connection=dummy_backend_foo.connection)
-
     stac_api_url = "http://stacapi.test"
     dummy_stac_api = DummyStacApi(root_url=stac_api_url, requests_mock=requests_mock)
 
+    # Initialize job db
     job_db = STACAPIJobDatabase(collection_id="collection-123", stac_root_url=stac_api_url)
     df = pd.DataFrame(
-        {
-            "item_id": ["item-2024", "item-2025"],
-            "year": [2024, 2025],
-        }
+        {"year": [2024, 2025]},
+        index=["item-2024", "item-2025"],
     )
     job_db.initialize_from_df(df=df)
+    assert dummy_stac_api.items == {
+        "collection-123": {
+            "item-2024": dirty_equals.IsPartialDict(
+                {
+                    "type": "Feature",
+                    "id": "item-2024",
+                    "properties": dirty_equals.IsPartialDict(
+                        {
+                            "year": 2024,
+                            "id": None,
+                            "status": "not_started",
+                            "backend_name": None,
+                        }
+                    ),
+                }
+            ),
+            "item-2025": dirty_equals.IsPartialDict(
+                {
+                    "type": "Feature",
+                    "id": "item-2025",
+                    "properties": dirty_equals.IsPartialDict(
+                        {
+                            "year": 2025,
+                            "id": None,
+                            "status": "not_started",
+                            "backend_name": None,
+                        }
+                    ),
+                }
+            ),
+        }
+    }
 
+    # Set up job manager
+    job_manager = MultiBackendJobManager(root_dir=tmp_path, poll_sleep=2)
+    job_manager.add_backend("foo", connection=dummy_backend_foo.connection)
+
+    # Run job manager loop
     def create_job(row, connection, **kwargs):
         year = int(row["year"])
         pg = {"dummy1": {"process_id": "dummy", "arguments": {"year": year}, "result": True}}
         job = connection.create_job(pg)
         return job
-
     run_stats = job_manager.run_jobs(job_db=job_db, start_job=create_job)
 
     assert run_stats == dirty_equals.IsPartialDict(
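Note on the assertions: `dirty_equals.IsPartialDict` matches any dict that contains at least the given key/value pairs, which keeps these checks robust against extra STAC fields added by the implementation. A tiny sketch:

    import dirty_equals

    assert {"id": "item-2024", "stac_version": "1.0.0"} == dirty_equals.IsPartialDict({"id": "item-2024"})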
