Skip to content

Commit 35f8cf7

Browse files
authored
Fixing index_prefix_path bug in python for StaticMemoryIndex (#491)
* Fixes the same index_prefix_path bug in StaticMemoryIndex that was previously fixed in StaticDiskIndex. * Adds unit tests, along with an explanation of why the existing unit tests passed despite this bug
1 parent 4a57e89 commit 35f8cf7

File tree

5 files changed

+68
-10
lines changed

5 files changed

+68
-10
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta"
1111

1212
[project]
1313
name = "diskannpy"
14-
version = "0.7.0rc1"
14+
version = "0.7.0rc2"
1515

1616
description = "DiskANN Python extension module"
1717
readme = "python/README.md"

python/src/_static_disk_index.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,9 @@ def __init__(
7979
does not exist, you are required to provide it.
8080
- **index_prefix**: The prefix of the index files. Defaults to "ann".
8181
"""
82-
index_prefix = _valid_index_prefix(index_directory, index_prefix)
82+
index_prefix_path = _valid_index_prefix(index_directory, index_prefix)
8383
vector_dtype, metric, _, _ = _ensure_index_metadata(
84-
index_prefix,
84+
index_prefix_path,
8585
vector_dtype,
8686
distance_metric,
8787
1, # it doesn't matter because we don't need it in this context anyway
@@ -101,7 +101,7 @@ def __init__(
101101
_index = _native_dap.StaticDiskFloatIndex
102102
self._index = _index(
103103
distance_metric=dap_metric,
104-
index_path_prefix=index_prefix,
104+
index_path_prefix=index_prefix_path,
105105
num_threads=num_threads,
106106
num_nodes_to_cache=num_nodes_to_cache,
107107
cache_mechanism=cache_mechanism,

python/src/_static_memory_index.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,22 +77,22 @@ def __init__(
7777
does not exist, you are required to provide it.
7878
- **enable_filters**: Indexes built with filters can also be used for filtered search.
7979
"""
80-
index_prefix = _valid_index_prefix(index_directory, index_prefix)
80+
index_prefix_path = _valid_index_prefix(index_directory, index_prefix)
8181
self._labels_map = {}
8282
self._labels_metadata = {}
8383
if enable_filters:
8484
try:
85-
with open(index_prefix + "_labels_map.txt", "r") as labels_map_if:
85+
with open(f"{index_prefix_path}_labels_map.txt", "r") as labels_map_if:
8686
for line in labels_map_if:
8787
(key, val) = line.split("\t")
8888
self._labels_map[key] = int(val)
89-
with open(f"{index_prefix}_label_metadata.json", "r") as labels_metadata_if:
89+
with open(f"{index_prefix_path}_label_metadata.json", "r") as labels_metadata_if:
9090
self._labels_metadata = json.load(labels_metadata_if)
9191
except: # noqa: E722
9292
# exceptions are basically presumed to be either file not found or file not formatted correctly
9393
raise RuntimeException("Filter labels file was unable to be processed.")
9494
vector_dtype, metric, num_points, dims = _ensure_index_metadata(
95-
index_prefix,
95+
index_prefix_path,
9696
vector_dtype,
9797
distance_metric,
9898
1, # it doesn't matter because we don't need it in this context anyway
@@ -119,7 +119,7 @@ def __init__(
119119
distance_metric=dap_metric,
120120
num_points=num_points,
121121
dimensions=dims,
122-
index_path=os.path.join(index_directory, index_prefix),
122+
index_path=index_prefix_path,
123123
num_threads=num_threads,
124124
initial_search_complexity=initial_search_complexity,
125125
)

python/tests/test_static_disk_index.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import shutil
55
import unittest
6+
from pathlib import Path
67
from tempfile import mkdtemp
78

89
import diskannpy as dap
@@ -165,4 +166,33 @@ def test_zero_threads(self):
165166
k = 5
166167
ids, dists = index.batch_search(
167168
query_vectors, k_neighbors=k, complexity=5, beam_width=2, num_threads=0
168-
)
169+
)
170+
171+
def test_relative_paths(self):
172+
# Issue 483 and 491 both fixed errors that were somehow slipping past our unit tests
173+
# os.path.join() discards all earlier components when a later component is an absolute path.
174+
# since our unit tests are using absolute paths via tempfile.mkdtemp(), the double os.path.join() was never
175+
# caught by our tests, but was very easy to trip when using relative paths
176+
rel_dir = "tmp"
177+
Path(rel_dir).mkdir(exist_ok=True)
178+
try:
179+
tiny_index_vecs = random_vectors(20, 10, dtype=np.float32, seed=12345)
180+
dap.build_disk_index(
181+
data=tiny_index_vecs,
182+
distance_metric="l2",
183+
index_directory=rel_dir,
184+
graph_degree=16,
185+
complexity=32,
186+
search_memory_maximum=0.00003,
187+
build_memory_maximum=1,
188+
num_threads=0,
189+
pq_disk_bytes=0,
190+
)
191+
index = dap.StaticDiskIndex(
192+
index_directory=rel_dir,
193+
num_threads=16,
194+
num_nodes_to_cache=10,
195+
)
196+
197+
finally:
198+
shutil.rmtree(rel_dir, ignore_errors=True)

python/tests/test_static_memory_index.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import shutil
66
import unittest
77

8+
from pathlib import Path
89
from tempfile import mkdtemp
910

1011
import diskannpy as dap
@@ -191,6 +192,33 @@ def test_zero_threads(self):
191192
k = 5
192193
ids, dists = index.batch_search(query_vectors, k_neighbors=k, complexity=5, num_threads=0)
193194

195+
def test_relative_paths(self):
196+
# Issue 483 and 491 both fixed errors that were somehow slipping past our unit tests
197+
# os.path.join() discards all earlier components when a later component is an absolute path.
198+
# since our unit tests are using absolute paths via tempfile.mkdtemp(), the double os.path.join() was never
199+
# caught by our tests, but was very easy to trip when using relative paths
200+
rel_dir = "tmp"
201+
Path(rel_dir).mkdir(exist_ok=True)
202+
try:
203+
tiny_index_vecs = random_vectors(20, 10, dtype=np.float32, seed=12345)
204+
dap.build_memory_index(
205+
data=tiny_index_vecs,
206+
distance_metric="l2",
207+
index_directory=rel_dir,
208+
graph_degree=16,
209+
complexity=32,
210+
num_threads=0,
211+
)
212+
index = dap.StaticMemoryIndex(
213+
index_directory=rel_dir,
214+
num_threads=0,
215+
initial_search_complexity=32,
216+
)
217+
218+
finally:
219+
shutil.rmtree(rel_dir, ignore_errors=True)
220+
221+
194222

195223
class TestFilteredStaticMemoryIndex(unittest.TestCase):
196224
def test_simple_scenario(self):

0 commit comments

Comments
 (0)