Skip to content

Commit 146e084

Browse files
Copilot and sinedied
committed
Implement document deduplication for both Azure CosmosDB and FAISS paths
Co-authored-by: sinedied <[email protected]>
1 parent 5577198 commit 146e084

File tree

1 file changed

+37
-1
lines changed

1 file changed

+37
-1
lines changed

packages/api/src/functions/documents-post.ts

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,31 @@ export async function postDocuments(request: HttpRequest, context: InvocationCon
5050

5151
// Initialize embeddings model and vector database
5252
const embeddings = new AzureOpenAIEmbeddings({ azureADTokenProvider });
53-
await AzureCosmosDBNoSQLVectorStore.fromDocuments(documents, embeddings, { credentials });
53+
const store = await AzureCosmosDBNoSQLVectorStore.fromDocuments([], embeddings, { credentials });
54+
55+
// Remove existing documents with the same filename to avoid duplicates
56+
try {
57+
await store.delete({
58+
filter: `SELECT * FROM c WHERE c.metadata.source = "${filename.replaceAll('"', '\\"')}"`,
59+
});
60+
} catch (error: unknown) {
61+
// If deletion fails (e.g., container doesn't exist yet), just log and continue
62+
context.log(`Warning: Could not delete existing documents: ${(error as Error).message}`);
63+
}
64+
65+
// Add the new documents
66+
await store.addDocuments(documents);
5467
} else {
5568
// If no environment variables are set, it means we are running locally
5669
context.log('No Azure OpenAI endpoint set, using Ollama models and local DB');
5770
const embeddings = new OllamaEmbeddings({ model: ollamaEmbeddingsModel });
5871
const folderExists = await checkFolderExists(faissStoreFolder);
5972
if (folderExists) {
6073
const store = await FaissStore.load(faissStoreFolder, embeddings);
74+
75+
// Remove existing documents with the same filename to avoid duplicates
76+
await removeDuplicateDocuments(store, filename);
77+
6178
await store.addDocuments(documents);
6279
await store.save(faissStoreFolder);
6380
} else {
@@ -90,6 +107,25 @@ export async function postDocuments(request: HttpRequest, context: InvocationCon
90107
}
91108
}
92109

110+
/**
 * Removes the documents already stored for `filename` from a FAISS store, so
 * that adding the documents of that file again does not create duplicates.
 *
 * A stored document matches when its `metadata.source` is strictly equal to
 * `filename`. When nothing matches, `store.delete` is not called at all.
 *
 * @param store - FAISS vector store to clean up; matching entries are deleted from it.
 * @param filename - Source name compared against each document's `metadata.source`.
 * @returns A promise that resolves once the matching documents have been deleted.
 */
async function removeDuplicateDocuments(store: FaissStore, filename: string): Promise<void> {
  const docstore = store.getDocstore();
  const mapping = store.getMapping();
  const idsToDelete: string[] = [];

  // Only the document IDs (the mapping's values) are needed; the mapping's
  // keys are irrelevant for the lookup, so they are not iterated.
  for (const documentId of Object.values(mapping)) {
    const document = docstore.search(documentId);
    if (document && document.metadata?.source === filename) {
      idsToDelete.push(documentId);
    }
  }

  // Skip the delete call entirely when there is nothing to remove.
  if (idsToDelete.length > 0) {
    await store.delete({ ids: idsToDelete });
  }
}
128+
93129
async function checkFolderExists(folderPath: string): Promise<boolean> {
94130
try {
95131
const stats = await fs.stat(folderPath);

0 commit comments

Comments
 (0)