Commit 34fbcc9

fix: ensure document re-querying in indexing process for consistency (#27077)
1 parent 9cc8ac9 commit 34fbcc9
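
Context for the change: the indexing runner receives DatasetDocument ORM objects that may have been loaded in a session that has since closed (for example, across a task boundary). Reading attributes on such a detached instance can raise DetachedInstanceError or observe stale state, so the commit keeps only the primary key and re-queries inside the current session before doing any work. Below is a self-contained sketch of that SQLAlchemy behavior (2.x style); the Document model, table name, and session setup are illustrative stand-ins, not Dify's code.

# Illustrative sketch (not Dify's code) of why re-querying by primary key
# rebinds a row to the current SQLAlchemy session.
from sqlalchemy import String, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class Document(Base):  # hypothetical stand-in for DatasetDocument
    __tablename__ = "documents"
    id: Mapped[int] = mapped_column(primary_key=True)
    indexing_status: Mapped[str] = mapped_column(String(32))


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as first_session:
    doc = Document(id=1, indexing_status="waiting")
    first_session.add(doc)
    first_session.commit()
# `doc` is now detached: its owning session is gone, and with the default
# expire_on_commit=True its attributes are expired, so reading them here
# would raise DetachedInstanceError.

with Session(engine) as current_session:
    # The commit's pattern: hold only the primary key, then re-query so the
    # instance is bound to the session actually doing the work.
    requeried = current_session.get(Document, 1)
    if requeried is None:
        print("row deleted in the meantime; skip instead of crashing")
    else:
        requeried.indexing_status = "indexing"
        current_session.commit()  # safe: `requeried` belongs to current_session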

File tree

1 file changed: +69 -52 lines changed

api/core/indexing_runner.py

Lines changed: 69 additions & 52 deletions
@@ -49,128 +49,149 @@ def __init__(self):
         self.storage = storage
         self.model_manager = ModelManager()
 
+    def _handle_indexing_error(self, document_id: str, error: Exception) -> None:
+        """Handle indexing errors by updating document status."""
+        logger.exception("consume document failed")
+        document = db.session.get(DatasetDocument, document_id)
+        if document:
+            document.indexing_status = "error"
+            error_message = getattr(error, "description", str(error))
+            document.error = str(error_message)
+            document.stopped_at = naive_utc_now()
+            db.session.commit()
+
     def run(self, dataset_documents: list[DatasetDocument]):
         """Run the indexing process."""
         for dataset_document in dataset_documents:
+            document_id = dataset_document.id
             try:
+                # Re-query the document to ensure it's bound to the current session
+                requeried_document = db.session.get(DatasetDocument, document_id)
+                if not requeried_document:
+                    logger.warning("Document not found, skipping document id: %s", document_id)
+                    continue
+
                 # get dataset
-                dataset = db.session.query(Dataset).filter_by(id=dataset_document.dataset_id).first()
+                dataset = db.session.query(Dataset).filter_by(id=requeried_document.dataset_id).first()
 
                 if not dataset:
                     raise ValueError("no dataset found")
                 # get the process rule
                 stmt = select(DatasetProcessRule).where(
-                    DatasetProcessRule.id == dataset_document.dataset_process_rule_id
+                    DatasetProcessRule.id == requeried_document.dataset_process_rule_id
                 )
                 processing_rule = db.session.scalar(stmt)
                 if not processing_rule:
                     raise ValueError("no process rule found")
-                index_type = dataset_document.doc_form
+                index_type = requeried_document.doc_form
                 index_processor = IndexProcessorFactory(index_type).init_index_processor()
                 # extract
-                text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict())
+                text_docs = self._extract(index_processor, requeried_document, processing_rule.to_dict())
 
                 # transform
                 documents = self._transform(
-                    index_processor, dataset, text_docs, dataset_document.doc_language, processing_rule.to_dict()
+                    index_processor, dataset, text_docs, requeried_document.doc_language, processing_rule.to_dict()
                 )
                 # save segment
-                self._load_segments(dataset, dataset_document, documents)
+                self._load_segments(dataset, requeried_document, documents)
 
                 # load
                 self._load(
                     index_processor=index_processor,
                     dataset=dataset,
-                    dataset_document=dataset_document,
+                    dataset_document=requeried_document,
                     documents=documents,
                 )
             except DocumentIsPausedError:
-                raise DocumentIsPausedError(f"Document paused, document id: {dataset_document.id}")
+                raise DocumentIsPausedError(f"Document paused, document id: {document_id}")
             except ProviderTokenNotInitError as e:
-                dataset_document.indexing_status = "error"
-                dataset_document.error = str(e.description)
-                dataset_document.stopped_at = naive_utc_now()
-                db.session.commit()
+                self._handle_indexing_error(document_id, e)
             except ObjectDeletedError:
-                logger.warning("Document deleted, document id: %s", dataset_document.id)
+                logger.warning("Document deleted, document id: %s", document_id)
             except Exception as e:
-                logger.exception("consume document failed")
-                dataset_document.indexing_status = "error"
-                dataset_document.error = str(e)
-                dataset_document.stopped_at = naive_utc_now()
-                db.session.commit()
+                self._handle_indexing_error(document_id, e)
 
     def run_in_splitting_status(self, dataset_document: DatasetDocument):
         """Run the indexing process when the index_status is splitting."""
+        document_id = dataset_document.id
         try:
+            # Re-query the document to ensure it's bound to the current session
+            requeried_document = db.session.get(DatasetDocument, document_id)
+            if not requeried_document:
+                logger.warning("Document not found: %s", document_id)
+                return
+
             # get dataset
-            dataset = db.session.query(Dataset).filter_by(id=dataset_document.dataset_id).first()
+            dataset = db.session.query(Dataset).filter_by(id=requeried_document.dataset_id).first()
 
             if not dataset:
                 raise ValueError("no dataset found")
 
             # get exist document_segment list and delete
             document_segments = (
                 db.session.query(DocumentSegment)
-                .filter_by(dataset_id=dataset.id, document_id=dataset_document.id)
+                .filter_by(dataset_id=dataset.id, document_id=requeried_document.id)
                 .all()
             )
 
             for document_segment in document_segments:
                 db.session.delete(document_segment)
-                if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
+                if requeried_document.doc_form == IndexType.PARENT_CHILD_INDEX:
                     # delete child chunks
                     db.session.query(ChildChunk).where(ChildChunk.segment_id == document_segment.id).delete()
             db.session.commit()
             # get the process rule
-            stmt = select(DatasetProcessRule).where(DatasetProcessRule.id == dataset_document.dataset_process_rule_id)
+            stmt = select(DatasetProcessRule).where(DatasetProcessRule.id == requeried_document.dataset_process_rule_id)
             processing_rule = db.session.scalar(stmt)
             if not processing_rule:
                 raise ValueError("no process rule found")
 
-            index_type = dataset_document.doc_form
+            index_type = requeried_document.doc_form
             index_processor = IndexProcessorFactory(index_type).init_index_processor()
             # extract
-            text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict())
+            text_docs = self._extract(index_processor, requeried_document, processing_rule.to_dict())
 
             # transform
             documents = self._transform(
-                index_processor, dataset, text_docs, dataset_document.doc_language, processing_rule.to_dict()
+                index_processor, dataset, text_docs, requeried_document.doc_language, processing_rule.to_dict()
             )
             # save segment
-            self._load_segments(dataset, dataset_document, documents)
+            self._load_segments(dataset, requeried_document, documents)
 
             # load
             self._load(
-                index_processor=index_processor, dataset=dataset, dataset_document=dataset_document, documents=documents
+                index_processor=index_processor,
+                dataset=dataset,
+                dataset_document=requeried_document,
+                documents=documents,
             )
         except DocumentIsPausedError:
-            raise DocumentIsPausedError(f"Document paused, document id: {dataset_document.id}")
+            raise DocumentIsPausedError(f"Document paused, document id: {document_id}")
         except ProviderTokenNotInitError as e:
-            dataset_document.indexing_status = "error"
-            dataset_document.error = str(e.description)
-            dataset_document.stopped_at = naive_utc_now()
-            db.session.commit()
+            self._handle_indexing_error(document_id, e)
         except Exception as e:
-            logger.exception("consume document failed")
-            dataset_document.indexing_status = "error"
-            dataset_document.error = str(e)
-            dataset_document.stopped_at = naive_utc_now()
-            db.session.commit()
+            self._handle_indexing_error(document_id, e)
 
     def run_in_indexing_status(self, dataset_document: DatasetDocument):
         """Run the indexing process when the index_status is indexing."""
+        document_id = dataset_document.id
         try:
+            # Re-query the document to ensure it's bound to the current session
+            requeried_document = db.session.get(DatasetDocument, document_id)
+            if not requeried_document:
+                logger.warning("Document not found: %s", document_id)
+                return
+
             # get dataset
-            dataset = db.session.query(Dataset).filter_by(id=dataset_document.dataset_id).first()
+            dataset = db.session.query(Dataset).filter_by(id=requeried_document.dataset_id).first()
 
             if not dataset:
                 raise ValueError("no dataset found")
 
             # get exist document_segment list and delete
             document_segments = (
                 db.session.query(DocumentSegment)
-                .filter_by(dataset_id=dataset.id, document_id=dataset_document.id)
+                .filter_by(dataset_id=dataset.id, document_id=requeried_document.id)
                 .all()
             )
 
@@ -188,7 +209,7 @@ def run_in_indexing_status(self, dataset_document: DatasetDocument):
                                 "dataset_id": document_segment.dataset_id,
                             },
                         )
-                        if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
+                        if requeried_document.doc_form == IndexType.PARENT_CHILD_INDEX:
                             child_chunks = document_segment.get_child_chunks()
                             if child_chunks:
                                 child_documents = []
@@ -206,24 +227,20 @@ def run_in_indexing_status(self, dataset_document: DatasetDocument):
                                 document.children = child_documents
                         documents.append(document)
             # build index
-            index_type = dataset_document.doc_form
+            index_type = requeried_document.doc_form
             index_processor = IndexProcessorFactory(index_type).init_index_processor()
             self._load(
-                index_processor=index_processor, dataset=dataset, dataset_document=dataset_document, documents=documents
+                index_processor=index_processor,
+                dataset=dataset,
+                dataset_document=requeried_document,
+                documents=documents,
             )
         except DocumentIsPausedError:
-            raise DocumentIsPausedError(f"Document paused, document id: {dataset_document.id}")
+            raise DocumentIsPausedError(f"Document paused, document id: {document_id}")
         except ProviderTokenNotInitError as e:
-            dataset_document.indexing_status = "error"
-            dataset_document.error = str(e.description)
-            dataset_document.stopped_at = naive_utc_now()
-            db.session.commit()
+            self._handle_indexing_error(document_id, e)
         except Exception as e:
-            logger.exception("consume document failed")
-            dataset_document.indexing_status = "error"
-            dataset_document.error = str(e)
-            dataset_document.stopped_at = naive_utc_now()
-            db.session.commit()
+            self._handle_indexing_error(document_id, e)
 
     def indexing_estimate(
         self,
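
A side note on the refactor: the three duplicated except blocks collapse into the new _handle_indexing_error helper, which re-queries by id before writing error state and uses getattr(error, "description", str(error)) so that ProviderTokenNotInitError (which carries a description attribute) and plain exceptions produce a message through one code path. A small sketch of that normalization; the exception class below is a simplified stand-in, not Dify's:

# Sketch of the helper's message normalization (stand-in exception class).
class ProviderTokenNotInitError(Exception):
    description = "provider token is not initialized"


def error_message_of(error: Exception) -> str:
    # Mirrors getattr(error, "description", str(error)) in the diff: prefer a
    # .description attribute when the exception defines one, else str(error).
    return str(getattr(error, "description", str(error)))


assert error_message_of(ProviderTokenNotInitError()) == "provider token is not initialized"
assert error_message_of(ValueError("boom")) == "boom"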
