
Commit be3f09a

Move JobDatabaseInterface to dedicated submodule #714
1 parent 47b1e5b commit be3f09a

File tree

docs/cookbook/job_manager.rst
openeo/extra/job_management/__init__.py
openeo/extra/job_management/job_db.py
openeo/extra/job_management/stac_job_db.py

4 files changed: +81 −68 lines changed

docs/cookbook/job_manager.rst

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ API
 .. autoclass:: openeo.extra.job_management.MultiBackendJobManager
     :members:
 
-.. autoclass:: openeo.extra.job_management.JobDatabaseInterface
+.. autoclass:: openeo.extra.job_management.job_db.JobDatabaseInterface
     :members:
 
 .. autoclass:: openeo.extra.job_management.CsvJobDatabase

openeo/extra/job_management/__init__.py

Lines changed: 6 additions & 66 deletions
@@ -37,6 +37,7 @@
     _JobManagerWorkerThreadPool,
     _JobStartTask,
 )
+from openeo.extra.job_management.job_db import JobDatabaseInterface
 from openeo.internal.processes.parse import (
     Parameter,
     Process,
@@ -64,67 +65,6 @@ class _Backend(NamedTuple):
 _UNSET = object()
 
 
-class JobDatabaseInterface(metaclass=abc.ABCMeta):
-    """
-    Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`,
-    allowing to regularly persist the job metadata while polling the job statuses
-    and resume/restart the job tracking after it was interrupted.
-
-    .. versionadded:: 0.31.0
-    """
-
-    @abc.abstractmethod
-    def exists(self) -> bool:
-        """Does the job database already exist, to read job data from?"""
-        ...
-
-    @abc.abstractmethod
-    def persist(self, df: pd.DataFrame):
-        """
-        Store (new or updated) job data to the database.
-
-        The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database,
-        so it should be merged with the existing data (if any) instead of overwriting it completely.
-
-        :param df: job data to store.
-        """
-        ...
-
-    @abc.abstractmethod
-    def count_by_status(self, statuses: Iterable[str] = ()) -> dict:
-        """
-        Retrieve the number of jobs per status.
-
-        :param statuses: List/set of statuses to include. If empty, all statuses are included.
-
-        :return: dictionary with status as key and the count as value.
-        """
-        ...
-
-    @abc.abstractmethod
-    def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame:
-        """
-        Returns a dataframe with jobs, filtered by status.
-
-        :param statuses: List of statuses to include.
-        :param max: Maximum number of jobs to return.
-
-        :return: DataFrame with jobs filtered by status.
-        """
-        ...
-
-    @abc.abstractmethod
-    def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame:
-        """
-        Returns a dataframe with jobs based on their (dataframe) index.
-
-        :param indices: List of indices to include.
-
-        :return: DataFrame with jobs filtered by indices.
-        """
-        ...
-
-
 def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs):
     raise NotImplementedError("No 'start_job' callable provided")
 
@@ -367,7 +307,7 @@ def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface):
         :param job_db:
             Job database to load/store existing job status data and other metadata from/to.
             Can be specified as a path to CSV or Parquet file,
-            or as a custom database object following the :py:class:`JobDatabaseInterface` interface.
+            or as a custom database object following the :py:class:`job_db.JobDatabaseInterface` interface.
 
         .. note::
             Support for Parquet files depends on the ``pyarrow`` package
@@ -472,7 +412,7 @@ def run_jobs(
         :param job_db:
             Job database to load/store existing job status data and other metadata from/to.
             Can be specified as a path to CSV or Parquet file,
-            or as a custom database object following the :py:class:`JobDatabaseInterface` interface.
+            or as a custom database object following the :py:class:`job_db.JobDatabaseInterface` interface.
 
         .. note::
             Support for Parquet files depends on the ``pyarrow`` package
@@ -488,7 +428,7 @@ def run_jobs(
         .. versionchanged:: 0.31.0
            Replace ``output_file`` argument with ``job_db`` argument,
            which can be a path to a CSV or Parquet file,
-           or a user-defined :py:class:`JobDatabaseInterface` object.
+           or a user-defined :py:class:`job_db.JobDatabaseInterface` object.
            The deprecated ``output_file`` argument is still supported for now.
 
        .. versionchanged:: 0.33.0
@@ -998,7 +938,7 @@ class CsvJobDatabase(FullDataFrameJobDatabase):
     """
     Persist/load job metadata with a CSV file.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`job_db.JobDatabaseInterface`
     :param path: Path to local CSV file.
 
     .. note::
@@ -1053,7 +993,7 @@ class ParquetJobDatabase(FullDataFrameJobDatabase):
     """
     Persist/load job metadata with a Parquet file.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`job_db.JobDatabaseInterface`
     :param path: Path to the Parquet file.
 
     .. note::
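
For downstream code, the practical effect of these hunks is simply a new import path for the interface. A minimal usage sketch against the new layout (the `my_start_job` callable and the CSV path are hypothetical placeholders; the class names and the `job_db`/`start_job` parameters come from the diffs in this commit):

# Sketch: imports after this commit; the job manager and database classes
# still come from the package root, the interface from the new submodule.
from openeo.extra.job_management import CsvJobDatabase, MultiBackendJobManager
from openeo.extra.job_management.job_db import JobDatabaseInterface

manager = MultiBackendJobManager()
# After registering backends and defining a `my_start_job(row, connection, **kwargs)`
# callable (hypothetical), `job_db` accepts a file path or any JobDatabaseInterface:
# manager.run_jobs(start_job=my_start_job, job_db=CsvJobDatabase("jobs.csv"))
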
openeo/extra/job_management/job_db.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+import abc
+import logging
+from typing import (
+    Iterable,
+    List,
+    Union,
+)
+
+import pandas as pd
+
+_log = logging.getLogger(__name__)
+
+
+class JobDatabaseInterface(metaclass=abc.ABCMeta):
+    """
+    Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`,
+    allowing to regularly persist the job metadata while polling the job statuses
+    and resume/restart the job tracking after it was interrupted.
+
+    .. versionadded:: 0.31.0
+    """
+
+    @abc.abstractmethod
+    def exists(self) -> bool:
+        """Does the job database already exist, to read job data from?"""
+        ...
+
+    @abc.abstractmethod
+    def persist(self, df: pd.DataFrame):
+        """
+        Store (new or updated) job data to the database.
+
+        The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database,
+        so it should be merged with the existing data (if any) instead of overwriting it completely.
+
+        :param df: job data to store.
+        """
+        ...
+
+    @abc.abstractmethod
+    def count_by_status(self, statuses: Iterable[str] = ()) -> dict:
+        """
+        Retrieve the number of jobs per status.
+
+        :param statuses: List/set of statuses to include. If empty, all statuses are included.
+
+        :return: dictionary with status as key and the count as value.
+        """
+        ...
+
+    @abc.abstractmethod
+    def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame:
+        """
+        Returns a dataframe with jobs, filtered by status.
+
+        :param statuses: List of statuses to include.
+        :param max: Maximum number of jobs to return.
+
+        :return: DataFrame with jobs filtered by status.
+        """
+        ...
+
+    @abc.abstractmethod
+    def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame:
+        """
+        Returns a dataframe with jobs based on their (dataframe) index.
+
+        :param indices: List of indices to include.
+
+        :return: DataFrame with jobs filtered by indices.
+        """
+        ...
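
As an illustration of the contract defined in this new module, here is a minimal, hypothetical in-memory implementation (not part of this commit): the merge in `persist` is deliberately simplistic, and the `status` column name is an assumption borrowed from the job manager's dataframe conventions.

from typing import Iterable, List, Optional, Union

import pandas as pd

from openeo.extra.job_management.job_db import JobDatabaseInterface


class InMemoryJobDatabase(JobDatabaseInterface):
    """Hypothetical sketch: keep job metadata in a plain DataFrame; nothing hits disk."""

    def __init__(self):
        self._df: Optional[pd.DataFrame] = None

    def exists(self) -> bool:
        return self._df is not None

    def persist(self, df: pd.DataFrame):
        # Simplified merge: rows in `df` take precedence, other existing rows are kept.
        self._df = df.copy() if self._df is None else df.combine_first(self._df)

    def count_by_status(self, statuses: Iterable[str] = ()) -> dict:
        # Assumes a "status" column, as used by MultiBackendJobManager.
        counts = self._df["status"].value_counts().to_dict() if self.exists() else {}
        wanted = set(statuses)
        return {s: c for s, c in counts.items() if not wanted or s in wanted}

    def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame:
        jobs = self._df[self._df["status"].isin(statuses)]
        return jobs if max is None else jobs.head(max)

    def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame:
        # Select rows by their dataframe index; missing indices would raise a KeyError.
        return self._df.loc[list(indices)]

Since all five abstract methods are implemented, such an object can be instantiated and passed directly as `job_db` to `run_jobs` or `start_job_thread`.
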

openeo/extra/job_management/stac_job_db.py

Lines changed: 2 additions & 1 deletion
@@ -11,7 +11,8 @@
 import requests
 from shapely.geometry import mapping, shape
 
-from openeo.extra.job_management import JobDatabaseInterface, MultiBackendJobManager
+from openeo.extra.job_management import MultiBackendJobManager
+from openeo.extra.job_management.job_db import JobDatabaseInterface
 
 _log = logging.getLogger(__name__)
 