diff --git a/LICENSE b/LICENSE index 25397f1..1e93740 100755 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) Beijing Wodong Tianjun Information Technology Co., Ltd. The Gamma Authors. +Copyright (c) The Gamma Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 864fadb..9415dd3 --- a/README.md +++ b/README.md @@ -1,56 +1,56 @@ -# Gamma Python SDK - -gamma python sdk and python wheel packages. - -## Overview - -This repository shows gamma python sdk and provides scripts to create wheel -packages for the gamma library. - -[python sdk api](./docs/APIPythonSDK.md) is the document of python sdk api. -Files in directory of python shows how the python sdk encapsulate gamma. -setup.py is written for creating wheel packages for gamma. - -Of course, pip install vearch is the easiest way to use this python sdk. And -this repository helps to build your custom python sdk. - -## Building source package - -if thers is a custom built gamma library in the system, build source package -for the best performance. - -### Prerequisite - -You can build it with docker image: pypywheels/manylinux2010-pypy_x86_64:latest - -auditwheel tool should be installed firstly. You can install it by pip. - -The package can be built when gamma is already built and installed. -See the official [gamma installation -instruction](https://github.com/vearch/gamma/blob/master/README.md) for more -on how to build and install gamma. In particular, compiling wheel packages -requires additional compilation options in compiling gamma. - -```bash -git clone https://github.com/vearch/vearch-python.git -git submodule init -git submodule update -cd gamma -mkdir build && cd build -cmake -DCMAKE_BUILD_TYPE=Release -DPERFORMANCE_TESTING=ON .. 
-make -sh build-wheels.sh -sh install-vearch.sh -``` - -Then the whl file will be generated into the wheelhouse directory. - -For building wheel packages, swig 3.0.12 or later needs to be avaiable. - -### Linux - -In linux, `auditwheel` is used for creating python wheel packages ocntains -precompiled binary extensions. -Header locations and link flags can be customized by `GAMMA_INCLUDE` and -`GAMMA_LDFLAGS` environment variables for building wheel packages. -Windows and OSX are not supported yet. +# Gamma Python SDK + +gamma python sdk and python wheel packages. + +## Overview + +This repository shows gamma python sdk and provides scripts to create wheel +packages for the gamma library. + +[python sdk api](./docs/APIPythonSDK.md) is the document of python sdk api. +Files in the python directory show how the python sdk encapsulates gamma. +setup.py is written for creating wheel packages for gamma. + +Of course, pip install vearch is the easiest way to use this python sdk. And +this repository helps to build your custom python sdk. + +## Building source package + +If there is a custom built gamma library in the system, build the source package +for the best performance. + +### Prerequisite + +You can build it with docker image: pypywheels/manylinux2010-pypy_x86_64:latest + +The auditwheel tool should be installed first. You can install it by pip. + +The package can be built when gamma is already built and installed. +See the official [gamma installation +instruction](https://github.com/vearch/gamma/blob/master/README.md) for more +on how to build and install gamma. In particular, compiling wheel packages +requires additional compilation options in compiling gamma. + +```bash +git clone https://github.com/vearch/vearch-python.git +git submodule init +git submodule update +cd gamma +mkdir build && cd build +cmake -DCMAKE_BUILD_TYPE=Release -DPERFORMANCE_TESTING=ON .. 
+make +sh build-wheels.sh +sh install-vearch.sh +``` + +Then the whl file will be generated into the wheelhouse directory. + +For building wheel packages, swig 3.0.12 or later needs to be available. + +### Linux + +On Linux, `auditwheel` is used for creating python wheel packages containing +precompiled binary extensions. +Header locations and link flags can be customized by `GAMMA_INCLUDE` and +`GAMMA_LDFLAGS` environment variables for building wheel packages. +Windows and OSX are not supported yet. diff --git a/build-wheels.sh b/build-wheels.sh index fafb719..66268cc 100755 --- a/build-wheels.sh +++ b/build-wheels.sh @@ -25,11 +25,13 @@ elif [ `expr substr ${OS} 1 5` == "Linux" ];then export GAMMA_LDFLAGS=$BASE_PATH/build/libgamma.so export GAMMA_INCLUDE=$BASE_PATH export LD_LIBRARY_PATH=$BASE_PATH/build/:$LD_LIBRARY_PATH - for PYBIN in /opt/python/cp38-cp38/bin; do - "${PYBIN}/pip" install -r dev-requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ - "${PYBIN}/python" setup.py bdist_wheel - auditwheel repair dist/vearch* - rm -rf dist build vearch.egg-info + for PYBIN in /opt/python/*/bin; do + if [[ ${PYBIN} =~ "cp" ]]; then + "${PYBIN}/pip" install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ + "${PYBIN}/python" setup.py bdist_wheel + auditwheel repair dist/vearch* + rm -rf dist build vearch.egg-info + fi done elif [ `expr substr ${OS} 1 10` == "MINGW" ];then echo "windows not support" diff --git a/demos/demo_scann_module.py b/demos/demo_scann_module.py new file mode 100644 index 0000000..0745cda --- /dev/null +++ b/demos/demo_scann_module.py @@ -0,0 +1,41 @@ +import vearch +import time +import numpy as np +print("create table") +engine = vearch.Engine("files", "logs") +table = { + "name" : "test_table", + "engine" : { + "index_size": 50000, + "retrieval_type": "VEARCH", + "retrieval_param": { + "metric_type": "InnerProduct", + "ncentroids": 512, + "nsubvector": 256, + "reordering": True + } + }, + "properties" : { + "feature": { 
"type": "vector", + "dimension": 512, + "store_type": "Mmap" + } + } +} +engine.create_table(table) +print("add data") +add_num = 100000 +X = np.random.rand(add_num, 512).astype('float32') +engine.add2(X) +print("search") +nprobe, rerank, query_num= 20, 100, 10 +engine.set_nprobe(nprobe) +engine.set_rerank(rerank) +Q = np.random.rand(query_num, 512).astype('float32') +indexed_num = 0 +while indexed_num != X.shape[0]: + indexed_num = engine.get_status()['min_indexed_num'] + time.sleep(1) +engine.search2(Q, query_num) + diff --git a/docs/APIPythonSDK.md b/docs/APIPythonSDK.md old mode 100755 new mode 100644 diff --git a/gamma b/gamma deleted file mode 160000 index 7dd6231..0000000 --- a/gamma +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7dd6231f01e571627450e649bb439fb881bb1098 diff --git a/install-vearch.sh b/install-vearch.sh index 7a2fd13..2490bd1 100755 --- a/install-vearch.sh +++ b/install-vearch.sh @@ -17,11 +17,11 @@ if [ ${OS} == "Darwin" ];then pip install ${WHEEL} done elif [ `expr substr ${OS} 1 5` == "Linux" ];then - for PYBIN in /opt/python/cp38-cp38/bin; do + for PYBIN in /opt/python/*/bin; do python_tag=$(echo ${PYBIN} | cut -d '/' -f4) "${PYBIN}/pip" uninstall vearch --yes - "${PYBIN}/pip" install "wheelhouse/vearch-$version-${python_tag}-manylinux_2_12_x86_64.manylinux2010_x86_64.whl" - "${PYBIN}/python" -c "import vearch" + "${PYBIN}/pip" install "wheelhouse/vearch-${version}.3-${python_tag}-manylinux_2_17_x86_64.manylinux2014_x86_64.whl" + "${PYBIN}/python" -c "import vearch" done elif [];then echo "Windows not support!!!" diff --git a/python/__init__.py b/python/__init__.py old mode 100755 new mode 100644 index 0216d96..269a1db --- a/python/__init__.py +++ b/python/__init__.py @@ -10,11 +10,6 @@ 2. vector similarity search 3. use like a database -Vearch have four builtins.object - Engine - EngineTable - Item - Query use help(vearch.Object) to get more detail infomations. 
""" import time @@ -566,6 +561,15 @@ def create_item(self, table, doc_id, doc_info): self.doc.SetKey(doc_id) return doc_info['_id'] + def create_items(self, table, doc_ids, docs): + if len(table.vec_infos) != 1 or len(table.field_infos) != 1: + return 1 + for doc_id in doc_ids: + doc = Doc() + doc.SetKey(doc_id) + docs.AddDoc(doc) + return 0 + def set_doc(self): for field in self.fields: if field.type == dataType.VECTOR: @@ -1120,7 +1124,6 @@ def deserialize(self, table, buf): query_results.append(query_result) self.query_results = query_results - class Engine: ''' vearch core It is used to store, update and delete feature vectors, @@ -1203,6 +1206,38 @@ def add(self, docs_info): print("finish add cost %.4f s" % (time.time() - start)) return doc_ids + def add2(self, data): + ''' add docs into table + data: raw vector + return: unique docs' id for docs + ''' + if self.verbose: + start = time.time() + if not isinstance(data, np.ndarray): + ex = Exception('The add function takes an incorrect argument; it must be of a list type.') + raise ex + nb, d = data.shape + doc_ids = [self.create_id() for i in range(nb)] + docs = Docs() + doc = GammaDoc() + if doc.create_items(self.gamma_table, doc_ids, docs): + return [] + results = swigCreateBatchResult(nb) + if self.verbose: + print("prepare add cost %.4f s" % (time.time() - start)) + start = time.time() + if 0 == swigAddOrUpdateDocsCPP2(self.c_engine, docs, swig_ptr(data), results): + if self.verbose: + print("gamma add cost %.4f s" % (time.time() - start)) + start = time.time() + for i in range(nb): + if results.Code(i) == 0: + self.total_added_num += 1 + swigDeleteBatchResult(results) + if self.verbose: + print("finish add cost %.4f s" % (time.time() - start)) + return doc_ids + def update_doc(self, doc_info, doc_id): ''' update doc's info. The docs_info must contain "_id" information. doc_info: doc's new info. 
@@ -1330,6 +1365,45 @@ def search(self, query_info): if self.verbose: print("get results cost %f ms" %((time.time() - start) * 1000)) return results + + def set_nprobe(self, nprobe): + swigSetNprobe(self.c_engine, nprobe, self.gamma_table.engine['retrieval_type']) + + def set_rerank(self, rerank): + swigSetRerank(self.c_engine, rerank, self.gamma_table.engine['retrieval_type']) + + def search2(self, xq, k): + ''' search in table + xq: query data + ''' + #if self.verbose: + # start = time.time() + if not isinstance(xq, np.ndarray): + ex = Exception('The search2 function takes an incorrect argument; it must be of a list type.') + raise ex + # d should also check, TODO + if len(xq.shape) > 1: + n, d = xq.shape + elif len(xq.shape) == 1: + n = 1 + d = xq.shape[0] + else: + return () + distances = np.empty((n, k), dtype=np.float32) + labels = np.empty((n, k), dtype=np.int64) + result = swigCreateVectorResult(n, k, swig_ptr(distances), swig_ptr(labels)) + result.query = swig_ptr(xq) + #if self.verbose: + # print("prepare search cost %f ms" %((time.time() - start) * 1000)) + # start = time.time() + ret = swigSearchCPP2(self.c_engine, result) + swigDeleteVectorResult(result) + #if self.verbose: + # print("gamma search cost %f ms" %((time.time() - start) * 1000)) + if ret: + return () + else: + return distances, labels def del_doc_by_query(self, query_info): ''' delete docs by query diff --git a/python/swigvearch.i b/python/swigvearch.i index 925c6a5..837d0d0 100755 --- a/python/swigvearch.i +++ b/python/swigvearch.i @@ -152,7 +152,7 @@ typedef int64_t size_t; return vec_res; } } - + tig_gamma::Request *swigCreateRequest() { return new tig_gamma::Request(); } @@ -163,7 +163,30 @@ typedef int64_t size_t; request = nullptr; } } - + + void swigSetNprobe(void *engine, int nprobe, std::string index_type) { + CPPSetNprobe(engine, nprobe, index_type); + } + + void swigSetRerank(void *engine, int rerank, std::string index_type) { + CPPSetRerank(engine, rerank, index_type); + } + + 
tig_gamma::VectorResult *swigCreateVectorResult(int n, int k, float *dists, int64_t *labels) { + tig_gamma::VectorResult *result = new tig_gamma::VectorResult(); + result->init(n, k, dists, labels); + return result; + } + + void swigDeleteVectorResult(tig_gamma::VectorResult *result) { + if (result) { + result->dists = nullptr; + result->docids = nullptr; + delete result; + result = nullptr; + } + } + tig_gamma::Response *swigCreateResponse() { return new tig_gamma::Response(); } @@ -264,6 +287,10 @@ typedef int64_t size_t; return CPPSearch(engine, request, response); } + int swigSearchCPP2(void* engine, tig_gamma::VectorResult *result) { + return CPPSearch2(engine, result); + } + int swigAddOrUpdateDocCPP(void* engine, tig_gamma::Doc *doc) { return CPPAddOrUpdateDoc(engine, doc); } @@ -283,6 +310,10 @@ typedef int64_t size_t; return CPPAddOrUpdateDocs(engine, docs, results); } + int swigAddOrUpdateDocsCPP2(void* engine, tig_gamma::Docs *docs, float *data, tig_gamma::BatchResult *results) { + return CPPAddOrUpdateDocs2(engine, docs, data, results); + } + int swigDelDocByQuery(void* engine, unsigned char *pRequest, int len){ char* request_str = (char*)pRequest; return DelDocByQuery(engine, request_str, len); diff --git a/dev-requirements.txt b/requirements.txt old mode 100755 new mode 100644 similarity index 57% rename from dev-requirements.txt rename to requirements.txt index b0d6c00..312f1c5 --- a/dev-requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ -numpy flatbuffers==1.12.0 -delocate +numpy>=1.16.0 diff --git a/setup.py b/setup.py index 8a23fe9..73fe346 100755 --- a/setup.py +++ b/setup.py @@ -5,6 +5,7 @@ from distutils.util import get_platform import os import sys +import pathlib long_description=""" Vearch is the vector search infrastructure for deeping learning and AI applications. 
@@ -85,7 +86,7 @@ def _remove_flag(self, flag): setup( name='vearch', - version='3.2.8', + version='3.2.8.3', description='A library for efficient similarity search and storage of deep learning vectors.', long_description=long_description, url='https://github.com/vearch/vearch', @@ -98,7 +99,7 @@ def _remove_flag(self, flag): 'build': CustomBuild, 'build_ext': CustomBuildExt, }, - install_requires=['numpy>=1.16.0', 'flatbuffers==1.12.0'], + install_requires=pathlib.Path("requirements.txt").read_text().splitlines(), package_dir={'vearch': 'python','vearch/gamma_api': 'python/gamma_api'}, packages=['vearch','vearch.gamma_api'], ext_modules=[_swigvearch]