Skip to content

Commit 44445de

Browse files
authored
DynamicMemoryIndex bug fixes (#404)
* While simply creating a unit test to repro Issue #400, I found a number of bugs that I needed to address just to get it to work the way I had intended. This does not yet have what I would consider a comprehensive suite of test coverage for the DynamicMemoryIndex, but at least we now save it with the metadata file, we can load it correctly, and saving *always* calls consolidate_deletes() first if any item has been marked for deletion. * We actually cannot save without compacting before save anyway. Removing the compact_before_save parameter from save() and hardcoding it to True until we can actually support it. * Addressing some PR comments and readying a 0.5.0.rc5 release
1 parent e1a8d78 commit 44445de

File tree

6 files changed

+79
-6
lines changed

6 files changed

+79
-6
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta"
1111

1212
[project]
1313
name = "diskannpy"
14-
version = "0.5.0.rc4"
14+
version = "0.5.0.rc5"
1515

1616
description = "DiskANN Python extension module"
1717
readme = "python/README.md"

python/include/dynamic_memory_index.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ class DynamicMemoryIndex
4141
uint64_t num_queries, uint64_t knn, uint64_t complexity,
4242
uint32_t num_threads);
4343
void consolidate_delete();
44+
size_t num_points();
45+
4446

4547
private:
4648
const uint32_t _initial_search_complexity;

python/src/_dynamic_memory_index.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright (c) Microsoft Corporation. All rights reserved.
22
# Licensed under the MIT license.
33

4+
import os
45
import warnings
56

67
import numpy as np
@@ -21,12 +22,14 @@
2122
_assert,
2223
_assert_2d,
2324
_assert_dtype,
25+
_assert_existing_directory,
2426
_assert_is_nonnegative_uint32,
2527
_assert_is_positive_uint32,
2628
_castable_dtype_or_raise,
2729
_ensure_index_metadata,
2830
_valid_metric,
2931
_valid_index_prefix,
32+
_write_index_metadata
3033
)
3134
from ._diskannpy import defaults
3235

@@ -158,6 +161,7 @@ def __init__(
158161
"""
159162

160163
dap_metric = _valid_metric(distance_metric)
164+
self._dap_metric = dap_metric
161165
_assert_dtype(vector_dtype)
162166
_assert_is_positive_uint32(dimensions, "dimensions")
163167

@@ -199,6 +203,7 @@ def __init__(
199203
search_threads=search_threads,
200204
concurrent_consolidation=concurrent_consolidation
201205
)
206+
self._points_deleted = False
202207

203208
def search(
204209
self, query: VectorLike, k_neighbors: int, complexity: int
@@ -293,16 +298,31 @@ def batch_search(
293298
num_threads=num_threads,
294299
)
295300

296-
def save(self, save_path: str, compact_before_save: bool = True):
301+
def save(self, save_path: str, index_prefix: str = "ann"):
297302
"""
298303
Saves this index to file.
299304
:param save_path: The path to save these index files to.
300305
:type save_path: str
301-
:param compact_before_save:
306+
:param index_prefix: The prefix to use for the index files. Default is "ann".
307+
:type index_prefix: str
302308
"""
303309
if save_path == "":
304310
raise ValueError("save_path cannot be empty")
305-
self._index.save(save_path=save_path, compact_before_save=compact_before_save)
311+
if index_prefix == "":
312+
raise ValueError("index_prefix cannot be empty")
313+
_assert_existing_directory(save_path, "save_path")
314+
save_path = os.path.join(save_path, index_prefix)
315+
if self._points_deleted is True:
316+
warnings.warn(
317+
"DynamicMemoryIndex.save() currently requires DynamicMemoryIndex.consolidate_delete() to be called "
318+
"prior to save when items have been marked for deletion. This is being done automatically now, though"
319+
"it will increase the time it takes to save; on large sets of data it can take a substantial amount of "
320+
"time. In the future, we will implement a faster save with unconsolidated deletes, but for now this is "
321+
"required."
322+
)
323+
self._index.consolidate_delete()
324+
self._index.save(save_path=save_path, compact_before_save=True) # we do not yet support uncompacted saves
325+
_write_index_metadata(save_path, self._vector_dtype, self._dap_metric, self._index.num_points(), self._dimensions)
306326

307327
def insert(self, vector: VectorLike, vector_id: VectorIdentifier):
308328
"""
@@ -349,10 +369,12 @@ def mark_deleted(self, vector_id: VectorIdentifier):
349369
:type vector_id: int
350370
"""
351371
_assert_is_positive_uint32(vector_id, "vector_id")
372+
self._points_deleted = True
352373
self._index.mark_deleted(np.uintc(vector_id))
353374

354375
def consolidate_delete(self):
355376
"""
356377
This method actually restructures the DiskANN index to remove the items that have been marked for deletion.
357378
"""
358379
self._index.consolidate_delete()
380+
self._points_deleted = False

python/src/dynamic_memory_index.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,11 @@ template <class DT> void DynamicMemoryIndex<DT>::consolidate_delete()
159159
_index.consolidate_deletes(_write_parameters);
160160
}
161161

162+
template <class DT> size_t DynamicMemoryIndex<DT>::num_points()
163+
{
164+
return _index.get_num_points();
165+
}
166+
162167
template class DynamicMemoryIndex<float>;
163168
template class DynamicMemoryIndex<uint8_t>;
164169
template class DynamicMemoryIndex<int8_t>;

python/src/module.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ template <typename T> inline void add_variant(py::module_ &m, const Variant &var
7979
.def("save", &diskannpy::DynamicMemoryIndex<T>::save, "save_path"_a = "", "compact_before_save"_a = false)
8080
.def("insert", &diskannpy::DynamicMemoryIndex<T>::insert, "vector"_a, "id"_a)
8181
.def("mark_deleted", &diskannpy::DynamicMemoryIndex<T>::mark_deleted, "id"_a)
82-
.def("consolidate_delete", &diskannpy::DynamicMemoryIndex<T>::consolidate_delete);
82+
.def("consolidate_delete", &diskannpy::DynamicMemoryIndex<T>::consolidate_delete)
83+
.def("num_points", &diskannpy::DynamicMemoryIndex<T>::num_points);
8384

8485
py::class_<diskannpy::StaticDiskIndex<T>>(m, variant.static_disk_index_name.c_str())
8586
.def(py::init<const diskann::Metric, const std::string &, const uint32_t, const size_t, const uint32_t>(),

python/tests/test_dynamic_memory_index.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# Copyright (c) Microsoft Corporation. All rights reserved.
22
# Licensed under the MIT license.
33

4-
import os
54
import shutil
5+
import tempfile
66
import unittest
77

88
import diskannpy as dap
@@ -296,3 +296,46 @@ def test_value_ranges_batch_search(self):
296296
index.batch_search(
297297
queries=np.array([[]], dtype=np.single), **kwargs
298298
)
299+
300+
# Issue #400
301+
def test_issue400(self):
302+
_, _, _, index_vectors, ann_dir, _, generated_tags = self._test_matrix[0]
303+
304+
deletion_tag = generated_tags[10] # arbitrary choice
305+
deletion_vector = index_vectors[10]
306+
307+
index = dap.DynamicMemoryIndex.from_file(
308+
index_directory=ann_dir,
309+
num_threads=16,
310+
initial_search_complexity=32,
311+
max_vectors=10100,
312+
complexity=64,
313+
graph_degree=32
314+
)
315+
index.insert(np.array([1.0] * 10, dtype=np.single), 10099)
316+
index.insert(np.array([2.0] * 10, dtype=np.single), 10050)
317+
index.insert(np.array([3.0] * 10, dtype=np.single), 10053)
318+
tags, distances = index.search(np.array([3.0] * 10, dtype=np.single), k_neighbors=5, complexity=64)
319+
self.assertIn(10053, tags)
320+
tags, distances = index.search(deletion_vector, k_neighbors=5, complexity=64)
321+
self.assertIn(deletion_tag, tags, "deletion_tag should exist, as we have not deleted yet")
322+
index.mark_deleted(deletion_tag)
323+
tags, distances = index.search(deletion_vector, k_neighbors=5, complexity=64)
324+
self.assertNotIn(deletion_tag, tags, "deletion_tag should not exist, as we have marked it for deletion")
325+
with tempfile.TemporaryDirectory() as tmpdir:
326+
index.save(tmpdir)
327+
328+
index2 = dap.DynamicMemoryIndex.from_file(
329+
index_directory=tmpdir,
330+
num_threads=16,
331+
initial_search_complexity=32,
332+
max_vectors=10100,
333+
complexity=64,
334+
graph_degree=32
335+
)
336+
tags, distances = index2.search(deletion_vector, k_neighbors=5, complexity=64)
337+
self.assertNotIn(
338+
deletion_tag,
339+
tags,
340+
"deletion_tag should not exist, as we saved and reloaded the index without it"
341+
)

0 commit comments

Comments
 (0)