@@ -57,12 +57,16 @@ def exists(self) -> bool:
def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the given dataframe to be compatible with :py:class:`MultiBackendJobManager`
    by adding the default columns and using the STAC item ids as index values.

    :param df: dataframe with job metadata.
    :return: normalized dataframe, indexed by (string) STAC item id.
    """
    normalized = MultiBackendJobManager._normalize_df(df)

    has_default_index = isinstance(normalized.index, pd.RangeIndex)
    if has_default_index and "item_id" in normalized.columns:
        # Support legacy usage: a default (autoincrement) index combined with
        # an "item_id" column -> promote that column to be the index.
        normalized.index = normalized["item_id"]

    # Make sure the index (of item ids) consists of strings,
    # to play well with (py)STAC schemas.
    normalized.index = normalized.index.astype(str)
    return normalized
6771
6872 def initialize_from_df (self , df : pd .DataFrame , * , on_exists : str = "error" ):
@@ -128,7 +132,7 @@ def item_from(self, series: pd.Series) -> pystac.Item:
128132 :return: pystac.Item
129133 """
130134 series_dict = series .to_dict ()
131- item_id = series_dict . pop ( "item_id" )
135+ item_id = str ( series . name )
132136 item_dict = {}
133137 item_dict .setdefault ("stac_version" , pystac .get_stac_version ())
134138 item_dict .setdefault ("type" , "Feature" )
@@ -168,6 +172,13 @@ def count_by_status(self, statuses: Iterable[str] = ()) -> dict:
168172 else :
169173 return items ["status" ].value_counts ().to_dict ()
170174
def _search_result_to_df(self, search_result: pystac_client.ItemSearch) -> pd.DataFrame:
    """
    Build a dataframe from a pySTAC item search result.

    Note: `series_from` sets the item id as the series "name",
    which ends up in the index of the resulting dataframe.
    """
    rows = [self.series_from(stac_item) for stac_item in search_result.items()]
    return pd.DataFrame(rows)
181+
171182 def get_by_status (self , statuses : Iterable [str ], max : Optional [int ] = None ) -> pd .DataFrame :
172183 if isinstance (statuses , str ):
173184 statuses = {statuses }
@@ -180,35 +191,45 @@ def get_by_status(self, statuses: Iterable[str], max: Optional[int] = None) -> p
180191 filter = status_filter ,
181192 max_items = max ,
182193 )
194+ df = self ._search_result_to_df (search_results )
183195
184- series = [self .series_from (item ) for item in search_results .items ()]
185-
186- df = pd .DataFrame (series ).reset_index (names = ["item_id" ])
187- if len (series ) == 0 :
196+ if df .shape [0 ] == 0 :
188197 # TODO: What if default columns are overwritten by the user?
189198 df = self ._normalize_df (
190199 df
191200 ) # Even for an empty dataframe the default columns are required
192201 return df
193202
def persist(self, df: pd.DataFrame):
    """
    Persist the given dataframe of job metadata to the STAC API collection:
    update items that already exist (matched on item id in the dataframe index)
    and create items that do not exist yet.

    :param df: dataframe with job metadata, indexed by (STAC) item id.
    """
    if df.empty:
        _log.warning("No data to persist in STAC API job database, skipping.")
        return

    if not self.exists():
        # Lazily create the collection (with a catch-all extent) on first persist.
        spatial_extent = pystac.SpatialExtent([[-180, -90, 180, 90]])
        temporal_extent = pystac.TemporalExtent([[None, None]])
        extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)
        c = pystac.Collection(id=self.collection_id, description="STAC API job database collection.", extent=extent)
        self._create_collection(c)

    # Merge updates with existing items (if any)
    existing_items = self.client.search(
        method="GET",
        collections=[self.collection_id],
        ids=[str(i) for i in df.index.tolist()],
    )
    existing_df = self._search_result_to_df(existing_items)

    if existing_df.empty:
        df_to_persist = df
    else:
        # Merge data on item_id (in the index):
        # first update the rows that already exist in the collection ...
        df_to_persist = existing_df
        df_to_persist.update(df, overwrite=True)
        # ... then append the rows that are new.
        # (Fix: `DataFrame.update` alone would silently drop rows of `df`
        # whose ids are not in the collection yet, so a mixed batch of
        # new and existing items would lose the new ones.)
        new_ids = df.index.difference(existing_df.index)
        if not new_ids.empty:
            df_to_persist = pd.concat([df_to_persist, df.loc[new_ids]])

    items_to_persist = [self.item_from(s) for _, s in df_to_persist.iterrows()]
    _log.info(f"Bulk upload of {len(items_to_persist)} items to STAC API collection {self.collection_id!r}")
    self._upload_items_bulk(self.collection_id, items_to_persist)
212233
213234 def _prepare_item (self , item : pystac .Item , collection_id : str ):
214235 item .collection_id = collection_id
0 commit comments