@@ -49,128 +49,149 @@ def __init__(self):
         self.storage = storage
         self.model_manager = ModelManager()
 
+    def _handle_indexing_error(self, document_id: str, error: Exception) -> None:
+        """Handle indexing errors by updating document status."""
+        logger.exception("consume document failed")
+        document = db.session.get(DatasetDocument, document_id)
+        if document:
+            document.indexing_status = "error"
+            error_message = getattr(error, "description", str(error))
+            document.error = str(error_message)
+            document.stopped_at = naive_utc_now()
+            db.session.commit()
+
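Worth noting the message normalization in the new helper: `getattr(error, "description", str(error))` lets a single code path serve both `ProviderTokenNotInitError`, which carries a `description` attribute, and arbitrary exceptions. A minimal, self-contained sketch of that fallback (the exception class below is a stand-in, not Dify's real one):

```python
# Stand-in exception; Dify's ProviderTokenNotInitError also exposes `description`.
class FakeProviderError(Exception):
    description = "provider token not initialized"

for err in (FakeProviderError(), ValueError("boom")):
    # Same expression as in _handle_indexing_error: prefer the rich
    # description when present, fall back to str(error) otherwise.
    print(type(err).__name__, "->", getattr(err, "description", str(err)))
# FakeProviderError -> provider token not initialized
# ValueError -> boom
```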
     def run(self, dataset_documents: list[DatasetDocument]):
         """Run the indexing process."""
         for dataset_document in dataset_documents:
+            document_id = dataset_document.id
             try:
+                # Re-query the document to ensure it's bound to the current session
+                requeried_document = db.session.get(DatasetDocument, document_id)
+                if not requeried_document:
+                    logger.warning("Document not found, skipping document id: %s", document_id)
+                    continue
+
                 # get dataset
-                dataset = db.session.query(Dataset).filter_by(id=dataset_document.dataset_id).first()
+                dataset = db.session.query(Dataset).filter_by(id=requeried_document.dataset_id).first()
 
                 if not dataset:
                     raise ValueError("no dataset found")
                 # get the process rule
                 stmt = select(DatasetProcessRule).where(
-                    DatasetProcessRule.id == dataset_document.dataset_process_rule_id
+                    DatasetProcessRule.id == requeried_document.dataset_process_rule_id
                 )
                 processing_rule = db.session.scalar(stmt)
                 if not processing_rule:
                     raise ValueError("no process rule found")
-                index_type = dataset_document.doc_form
+                index_type = requeried_document.doc_form
                 index_processor = IndexProcessorFactory(index_type).init_index_processor()
                 # extract
-                text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict())
+                text_docs = self._extract(index_processor, requeried_document, processing_rule.to_dict())
 
                 # transform
                 documents = self._transform(
-                    index_processor, dataset, text_docs, dataset_document.doc_language, processing_rule.to_dict()
+                    index_processor, dataset, text_docs, requeried_document.doc_language, processing_rule.to_dict()
                 )
                 # save segment
-                self._load_segments(dataset, dataset_document, documents)
+                self._load_segments(dataset, requeried_document, documents)
 
                 # load
                 self._load(
                     index_processor=index_processor,
                     dataset=dataset,
-                    dataset_document=dataset_document,
+                    dataset_document=requeried_document,
                     documents=documents,
                 )
             except DocumentIsPausedError:
-                raise DocumentIsPausedError(f"Document paused, document id: {dataset_document.id}")
+                raise DocumentIsPausedError(f"Document paused, document id: {document_id}")
             except ProviderTokenNotInitError as e:
-                dataset_document.indexing_status = "error"
-                dataset_document.error = str(e.description)
-                dataset_document.stopped_at = naive_utc_now()
-                db.session.commit()
+                self._handle_indexing_error(document_id, e)
             except ObjectDeletedError:
-                logger.warning("Document deleted, document id: %s", dataset_document.id)
+                logger.warning("Document deleted, document id: %s", document_id)
             except Exception as e:
-                logger.exception("consume document failed")
-                dataset_document.indexing_status = "error"
-                dataset_document.error = str(e)
-                dataset_document.stopped_at = naive_utc_now()
-                db.session.commit()
+                self._handle_indexing_error(document_id, e)
 
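The re-query pattern above is the core of the change: instead of trusting an ORM instance that may have been loaded in another session (and thus be detached or stale inside a long-running worker loop), each iteration fetches a fresh copy by primary key. `Session.get` also returns `None` for a deleted row, so the loop can skip cleanly rather than hit `ObjectDeletedError` on attribute access. A sketch of the pattern with plain SQLAlchemy 2.0 against in-memory SQLite (assumed setup; Dify's models and Flask-SQLAlchemy session management differ):

```python
from sqlalchemy import String, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column

class Base(DeclarativeBase):
    pass

class Doc(Base):
    __tablename__ = "docs"
    id: Mapped[str] = mapped_column(String, primary_key=True)
    indexing_status: Mapped[str] = mapped_column(String, default="waiting")

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Doc(id="d1"))
    session.commit()

    # Re-query by primary key: returns a live, session-bound instance,
    # or None if the row was deleted in the meantime.
    doc = session.get(Doc, "d1")
    if doc is None:
        print("Document not found, skipping")
    else:
        doc.indexing_status = "indexing"
        session.commit()
```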
     def run_in_splitting_status(self, dataset_document: DatasetDocument):
         """Run the indexing process when the index_status is splitting."""
+        document_id = dataset_document.id
         try:
+            # Re-query the document to ensure it's bound to the current session
+            requeried_document = db.session.get(DatasetDocument, document_id)
+            if not requeried_document:
+                logger.warning("Document not found: %s", document_id)
+                return
+
             # get dataset
-            dataset = db.session.query(Dataset).filter_by(id=dataset_document.dataset_id).first()
+            dataset = db.session.query(Dataset).filter_by(id=requeried_document.dataset_id).first()
 
             if not dataset:
                 raise ValueError("no dataset found")
 
             # get exist document_segment list and delete
             document_segments = (
                 db.session.query(DocumentSegment)
-                .filter_by(dataset_id=dataset.id, document_id=dataset_document.id)
+                .filter_by(dataset_id=dataset.id, document_id=requeried_document.id)
                 .all()
             )
 
             for document_segment in document_segments:
                 db.session.delete(document_segment)
-                if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
+                if requeried_document.doc_form == IndexType.PARENT_CHILD_INDEX:
                     # delete child chunks
                     db.session.query(ChildChunk).where(ChildChunk.segment_id == document_segment.id).delete()
             db.session.commit()
             # get the process rule
-            stmt = select(DatasetProcessRule).where(DatasetProcessRule.id == dataset_document.dataset_process_rule_id)
+            stmt = select(DatasetProcessRule).where(DatasetProcessRule.id == requeried_document.dataset_process_rule_id)
             processing_rule = db.session.scalar(stmt)
             if not processing_rule:
                 raise ValueError("no process rule found")
 
-            index_type = dataset_document.doc_form
+            index_type = requeried_document.doc_form
             index_processor = IndexProcessorFactory(index_type).init_index_processor()
             # extract
-            text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict())
+            text_docs = self._extract(index_processor, requeried_document, processing_rule.to_dict())
 
             # transform
             documents = self._transform(
-                index_processor, dataset, text_docs, dataset_document.doc_language, processing_rule.to_dict()
+                index_processor, dataset, text_docs, requeried_document.doc_language, processing_rule.to_dict()
             )
             # save segment
-            self._load_segments(dataset, dataset_document, documents)
+            self._load_segments(dataset, requeried_document, documents)
 
             # load
             self._load(
-                index_processor=index_processor, dataset=dataset, dataset_document=dataset_document, documents=documents
+                index_processor=index_processor,
+                dataset=dataset,
+                dataset_document=requeried_document,
+                documents=documents,
             )
         except DocumentIsPausedError:
-            raise DocumentIsPausedError(f"Document paused, document id: {dataset_document.id}")
+            raise DocumentIsPausedError(f"Document paused, document id: {document_id}")
         except ProviderTokenNotInitError as e:
-            dataset_document.indexing_status = "error"
-            dataset_document.error = str(e.description)
-            dataset_document.stopped_at = naive_utc_now()
-            db.session.commit()
+            self._handle_indexing_error(document_id, e)
         except Exception as e:
-            logger.exception("consume document failed")
-            dataset_document.indexing_status = "error"
-            dataset_document.error = str(e)
-            dataset_document.stopped_at = naive_utc_now()
-            db.session.commit()
+            self._handle_indexing_error(document_id, e)
 
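One subtlety the refactor preserves: `DocumentIsPausedError` is re-raised so the caller (e.g. the Celery task driving the runner) can stop cleanly, while every other failure is recorded on the document and swallowed. A self-contained control-flow sketch with stand-in names, not the real helper:

```python
class DocumentIsPausedError(Exception):
    pass

def record_error(document_id: str, error: Exception) -> None:
    # Stand-in for _handle_indexing_error: mark the row, don't re-raise.
    print(f"document {document_id} marked error: {error}")

def process(document_id: str, work) -> None:
    try:
        work()
    except DocumentIsPausedError:
        # Pauses bubble up to the caller; only the message changes.
        raise DocumentIsPausedError(f"Document paused, document id: {document_id}")
    except Exception as e:
        record_error(document_id, e)

process("d1", lambda: 1 / 0)  # error is recorded and swallowed, not raised
```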
     def run_in_indexing_status(self, dataset_document: DatasetDocument):
         """Run the indexing process when the index_status is indexing."""
+        document_id = dataset_document.id
         try:
+            # Re-query the document to ensure it's bound to the current session
+            requeried_document = db.session.get(DatasetDocument, document_id)
+            if not requeried_document:
+                logger.warning("Document not found: %s", document_id)
+                return
+
             # get dataset
-            dataset = db.session.query(Dataset).filter_by(id=dataset_document.dataset_id).first()
+            dataset = db.session.query(Dataset).filter_by(id=requeried_document.dataset_id).first()
 
             if not dataset:
                 raise ValueError("no dataset found")
 
             # get exist document_segment list and delete
             document_segments = (
                 db.session.query(DocumentSegment)
-                .filter_by(dataset_id=dataset.id, document_id=dataset_document.id)
+                .filter_by(dataset_id=dataset.id, document_id=requeried_document.id)
                 .all()
             )
 
@@ -188,7 +209,7 @@ def run_in_indexing_status(self, dataset_document: DatasetDocument):
188209 "dataset_id" : document_segment .dataset_id ,
189210 },
190211 )
191- if dataset_document .doc_form == IndexType .PARENT_CHILD_INDEX :
212+ if requeried_document .doc_form == IndexType .PARENT_CHILD_INDEX :
192213 child_chunks = document_segment .get_child_chunks ()
193214 if child_chunks :
194215 child_documents = []
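For context on the parent-child branch: when rebuilding the index from stored segments, each parent document gets its child chunks reattached as `children` before `_load` runs. A rough sketch of the shapes involved (the dataclasses below are assumptions from context, not Dify's actual `Document`/`ChildDocument` classes):

```python
from dataclasses import dataclass, field

@dataclass
class ChildDocument:
    page_content: str
    metadata: dict

@dataclass
class Document:
    page_content: str
    metadata: dict
    children: list[ChildDocument] = field(default_factory=list)

# Parent carries the segment text; children carry the fine-grained chunks
# that actually get embedded for retrieval.
segment_meta = {"doc_id": "node-1", "dataset_id": "ds-1"}
parent = Document(page_content="parent segment text", metadata=segment_meta)
parent.children = [
    ChildDocument(page_content="child chunk text", metadata={**segment_meta, "doc_id": "node-1-0"}),
]
```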
@@ -206,24 +227,20 @@ def run_in_indexing_status(self, dataset_document: DatasetDocument):
                         document.children = child_documents
                     documents.append(document)
             # build index
-            index_type = dataset_document.doc_form
+            index_type = requeried_document.doc_form
             index_processor = IndexProcessorFactory(index_type).init_index_processor()
             self._load(
-                index_processor=index_processor, dataset=dataset, dataset_document=dataset_document, documents=documents
+                index_processor=index_processor,
+                dataset=dataset,
+                dataset_document=requeried_document,
+                documents=documents,
             )
         except DocumentIsPausedError:
-            raise DocumentIsPausedError(f"Document paused, document id: {dataset_document.id}")
+            raise DocumentIsPausedError(f"Document paused, document id: {document_id}")
         except ProviderTokenNotInitError as e:
-            dataset_document.indexing_status = "error"
-            dataset_document.error = str(e.description)
-            dataset_document.stopped_at = naive_utc_now()
-            db.session.commit()
+            self._handle_indexing_error(document_id, e)
         except Exception as e:
-            logger.exception("consume document failed")
-            dataset_document.indexing_status = "error"
-            dataset_document.error = str(e)
-            dataset_document.stopped_at = naive_utc_now()
-            db.session.commit()
+            self._handle_indexing_error(document_id, e)
 
     def indexing_estimate(
         self,