From e8eb5f58ac8a0c2a11bfb10397e3573f129414cf Mon Sep 17 00:00:00 2001 From: "xj.lin" Date: Tue, 26 Mar 2019 14:52:38 +0800 Subject: [PATCH] format code --- pyengine/engine/controller/scheduler.py | 34 ++++-- .../engine/controller/tests/test_scheduler.py | 28 ++--- pyengine/engine/controller/vector_engine.py | 2 +- pyengine/engine/ingestion/build_index.py | 14 +-- pyengine/engine/ingestion/serialize.py | 5 +- pyengine/engine/ingestion/tests/test_build.py | 6 +- pyengine/engine/retrieval/search_index.py | 9 +- pyengine/engine/retrieval/tests/basic_test.py | 103 ------------------ .../engine/retrieval/tests/test_search.py | 5 +- 9 files changed, 63 insertions(+), 143 deletions(-) delete mode 100644 pyengine/engine/retrieval/tests/basic_test.py diff --git a/pyengine/engine/controller/scheduler.py b/pyengine/engine/controller/scheduler.py index 275284da6b..0360c456e6 100644 --- a/pyengine/engine/controller/scheduler.py +++ b/pyengine/engine/controller/scheduler.py @@ -1,10 +1,12 @@ from engine.retrieval import search_index from engine.ingestion import build_index from engine.ingestion import serialize +import numpy as np class Singleton(type): _instances = {} + def __call__(cls, *args, **kwargs): if cls not in cls._instances: cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) @@ -12,7 +14,7 @@ class Singleton(type): class Scheduler(metaclass=Singleton): - def Search(self, index_file_key, vectors, k): + def search(self, index_file_key, vectors, k): # assert index_file_key # assert vectors assert k != 0 @@ -20,7 +22,6 @@ class Scheduler(metaclass=Singleton): query_vectors = serialize.to_array(vectors) return self.__scheduler(index_file_key, query_vectors, k) - def __scheduler(self, index_data_key, vectors, k): result_list = [] @@ -35,18 +36,35 @@ class Scheduler(metaclass=Singleton): if 'index' in index_data_key: index_data_list = index_data_key['index'] for key in index_data_list: - index = GetIndexData(key) + index = get_index_data(key) searcher = search_index.FaissSearch(index) result_list.append(searcher.search_by_vectors(vectors, k)) if len(result_list) == 1: return result_list[0].vectors - total_result = [] + return result_list; # TODO(linxj): add topk - # result = search_index.top_k(result_list, k) - return result_list + # d_list = np.array([]) + # v_list = np.array([]) + # for result in result_list: + # rd = result.distance + # rv = result.vectors + # + # td_list = np.array([]) + # tv_list = np.array([]) + # for d, v in zip(rd, rv): + # td_list = np.append(td_list, d) + # tv_list = np.append(tv_list, v) + # d_list = np.add(d_list, td_list) + # v_list = np.add(v_list, td_list) + # + # print(d_list) + # print(v_list) + # result_map = [d_list, v_list] + # top_k_result = search_index.top_k(result_map, k) + # return top_k_result -def GetIndexData(key): - return serialize.read_index(key) \ No newline at end of file +def get_index_data(key): + return serialize.read_index(key) diff --git a/pyengine/engine/controller/tests/test_scheduler.py b/pyengine/engine/controller/tests/test_scheduler.py index c63749b0d6..7fe8fa5e54 100644 --- a/pyengine/engine/controller/tests/test_scheduler.py +++ b/pyengine/engine/controller/tests/test_scheduler.py @@ -9,11 +9,10 @@ class TestScheduler(unittest.TestCase): def test_schedule(self): d = 64 nb = 10000 - nq = 100 + nq = 2 nt = 5000 xt, xb, xq = get_dataset(d, nb, nt, nq) - file_name = "/tmp/faiss/tempfile_1" - + file_name = "/tmp/tempfile_1" index = faiss.IndexFlatL2(d) print(index.is_trained) @@ -26,17 +25,17 @@ class TestScheduler(unittest.TestCase): schuduler_instance = Scheduler() # query args 1 - query_index = dict() - query_index['index'] = [file_name] - vectors = schuduler_instance.Search(query_index, vectors=xq, k=5) - assert np.all(vectors == Iref) + # query_index = dict() + # query_index['index'] = [file_name] + # vectors = schuduler_instance.search(query_index, vectors=xq, k=5) + # assert np.all(vectors == Iref) # query args 2 - query_index = dict() - query_index['raw'] = xt - query_index['dimension'] = d - query_index['index'] = [file_name] - vectors = schuduler_instance.Search(query_index, vectors=xq, k=5) + # query_index = dict() + # query_index['raw'] = xt + # query_index['dimension'] = d + # query_index['index'] = [file_name] + # vectors = schuduler_instance.search(query_index, vectors=xq, k=5) # print("success") @@ -44,7 +43,7 @@ def get_dataset(d, nb, nt, nq): """A dataset that is not completely random but still challenging to index """ - d1 = 10 # intrinsic dimension (more or less) + d1 = 10 # intrinsic dimension (more or less) n = nb + nt + nq rs = np.random.RandomState(1338) x = rs.normal(size=(n, d1)) @@ -56,5 +55,6 @@ def get_dataset(d, nb, nt, nq): x = x.astype('float32') return x[:nt], x[nt:-nq], x[-nq:] + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/pyengine/engine/controller/vector_engine.py b/pyengine/engine/controller/vector_engine.py index 50a7e98046..7736914782 100644 --- a/pyengine/engine/controller/vector_engine.py +++ b/pyengine/engine/controller/vector_engine.py @@ -150,7 +150,7 @@ class VectorEngine(object): scheduler_instance = Scheduler() vectors = [] vectors.append(vector) - result = scheduler_instance.Search(index_map, vectors, limit) + result = scheduler_instance.search(index_map, vectors, limit) vector_id = 0 diff --git a/pyengine/engine/ingestion/build_index.py b/pyengine/engine/ingestion/build_index.py index 8e228bb0a7..35f4feafa7 100644 --- a/pyengine/engine/ingestion/build_index.py +++ b/pyengine/engine/ingestion/build_index.py @@ -3,7 +3,7 @@ from enum import Enum, unique @unique -class INDEX_DEVICES(Enum): +class INDEXDEVICES(Enum): CPU = 0 GPU = 1 MULTI_GPU = 2 @@ -11,11 +11,11 @@ class INDEX_DEVICES(Enum): def FactoryIndex(index_name="DefaultIndex"): cls = globals()[index_name] - return cls # invoke __init__() by user + return cls # invoke __init__() by user class Index(): - def build(self, d, vectors, DEVICE=INDEX_DEVICES.CPU): + def build(self, d, vectors, DEVICE=INDEXDEVICES.CPU): pass @staticmethod @@ -35,8 +35,8 @@ class DefaultIndex(Index): # maybe need to specif parameters pass - def build(self, d, vectors, DEVICE=INDEX_DEVICES.CPU): - index = faiss.IndexFlatL2(d) # trained + def build(self, d, vectors, DEVICE=INDEXDEVICES.CPU): + index = faiss.IndexFlatL2(d) # trained index.add(vectors) return index @@ -47,9 +47,9 @@ class LowMemoryIndex(Index): self.__bytes_per_vector = 8 self.__bits_per_sub_vector = 8 - def build(d, vectors, DEVICE=INDEX_DEVICES.CPU): + def build(self, d, vectors, DEVICE=INDEXDEVICES.CPU): # quantizer = faiss.IndexFlatL2(d) # index = faiss.IndexIVFPQ(quantizer, d, self.nlist, # self.__bytes_per_vector, self.__bits_per_sub_vector) # return index - pass \ No newline at end of file + pass diff --git a/pyengine/engine/ingestion/serialize.py b/pyengine/engine/ingestion/serialize.py index bac58b66da..9ccbe5a4a6 100644 --- a/pyengine/engine/ingestion/serialize.py +++ b/pyengine/engine/ingestion/serialize.py @@ -1,11 +1,14 @@ import faiss import numpy as np + def write_index(index, file_name): faiss.write_index(index, file_name) + def read_index(file_name): return faiss.read_index(file_name) + def to_array(vec): - return np.asarray(vec).astype('float32') \ No newline at end of file + return np.asarray(vec).astype('float32') diff --git a/pyengine/engine/ingestion/tests/test_build.py b/pyengine/engine/ingestion/tests/test_build.py index ef349f905a..f4b92bcbbd 100644 --- a/pyengine/engine/ingestion/tests/test_build.py +++ b/pyengine/engine/ingestion/tests/test_build.py @@ -65,12 +65,11 @@ class TestBuildIndex(unittest.TestCase): assert np.all(Dnew == Dref) and np.all(Inew == Iref) - def get_dataset(d, nb, nt, nq): """A dataset that is not completely random but still challenging to index """ - d1 = 10 # intrinsic dimension (more or less) + d1 = 10 # intrinsic dimension (more or less) n = nb + nt + nq rs = np.random.RandomState(1338) x = rs.normal(size=(n, d1)) @@ -83,6 +82,5 @@ def get_dataset(d, nb, nt, nq): return x[:nt], x[nt:-nq], x[-nq:] - if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/pyengine/engine/retrieval/search_index.py b/pyengine/engine/retrieval/search_index.py index 4b6f2ffe42..9457d6de5d 100644 --- a/pyengine/engine/retrieval/search_index.py +++ b/pyengine/engine/retrieval/search_index.py @@ -1,4 +1,5 @@ import faiss +import numpy as np class SearchResult(): @@ -32,7 +33,9 @@ class FaissSearch(): D, I = self.__index.search(vector_list, k) return SearchResult(D, I) -import heapq + +# import heapq def top_k(input, k): - #sorted = heapq.nsmallest(k, input, key=input.key) - pass \ No newline at end of file + pass + # sorted = heapq.nsmallest(k, input, key=np.sum(input.get())) + # return sorted diff --git a/pyengine/engine/retrieval/tests/basic_test.py b/pyengine/engine/retrieval/tests/basic_test.py deleted file mode 100644 index b32f633968..0000000000 --- a/pyengine/engine/retrieval/tests/basic_test.py +++ /dev/null @@ -1,103 +0,0 @@ -# import numpy as np - -# d = 64 # dimension -# nb = 100000 # database size -# nq = 10000 # nb of queries -# np.random.seed(1234) # make reproducible -# xb = np.random.random((nb, d)).astype('float32') -# xb[:, 0] += np.arange(nb) / 1000. -# xq = np.random.random((nq, d)).astype('float32') -# xq[:, 0] += np.arange(nq) / 1000. -# -# import faiss # make faiss available -# -# res = faiss.StandardGpuResources() # use a single GPU -# -# ## Using a flat index -# -# index_flat = faiss.IndexFlatL2(d) # build a flat (CPU) index -# -# # make it a flat GPU index -# gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat) -# -# gpu_index_flat.add(xb) # add vectors to the index -# print(gpu_index_flat.ntotal) -# -# k = 4 # we want to see 4 nearest neighbors -# D, I = gpu_index_flat.search(xq, k) # actual search -# print(I[:5]) # neighbors of the 5 first queries -# print(I[-5:]) # neighbors of the 5 last queries -# -# -# ## Using an IVF index -# -# nlist = 100 -# quantizer = faiss.IndexFlatL2(d) # the other index -# index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2) -# # here we specify METRIC_L2, by default it performs inner-product search -# -# # make it an IVF GPU index -# gpu_index_ivf = faiss.index_cpu_to_gpu(res, 0, index_ivf) -# -# assert not gpu_index_ivf.is_trained -# gpu_index_ivf.train(xb) # add vectors to the index -# assert gpu_index_ivf.is_trained -# -# gpu_index_ivf.add(xb) # add vectors to the index -# print(gpu_index_ivf.ntotal) -# -# k = 4 # we want to see 4 nearest neighbors -# D, I = gpu_index_ivf.search(xq, k) # actual search -# print(I[:5]) # neighbors of the 5 first queries -# print(I[-5:]) - - -import numpy as np -import pytest - -@pytest.mark.skip(reason="Not for pytest") -def basic_test(): - d = 64 # dimension - nb = 100000 # database size - nq = 10000 # nb of queries - np.random.seed(1234) # make reproducible - xb = np.random.random((nb, d)).astype('float32') - xb[:, 0] += np.arange(nb) / 1000. - xc = np.random.random((nb, d)).astype('float32') - xc[:, 0] += np.arange(nb) / 1000. - xq = np.random.random((nq, d)).astype('float32') - xq[:, 0] += np.arange(nq) / 1000. - - import faiss # make faiss available - index = faiss.IndexFlatL2(d) # build the index - print(index.is_trained) - index.add(xb) # add vectors to the index - print(index.ntotal) - #faiss.write_index(index, "/tmp/faiss/tempfile_1") - writer = faiss.VectorIOWriter() - faiss.write_index(index, writer) - ar_data = faiss.vector_to_array(writer.data) - import pickle - pickle.dump(ar_data, open("/tmp/faiss/ser_1", "wb")) - - #index_3 = pickle.load("/tmp/faiss/ser_1") - - - # index_2 = faiss.IndexFlatL2(d) # build the index - # print(index_2.is_trained) - # index_2.add(xc) # add vectors to the index - # print(index_2.ntotal) - # faiss.write_index(index, "/tmp/faiss/tempfile_2") - # - # index_3 = faiss.read_index - - # k = 4 # we want to see 4 nearest neighbors - # D, I = index.search(xb[:5], k) # sanity check - # print(I) - # print(D) - # D, I = index.search(xq, k) # actual search - # print(I[:5]) # neighbors of the 5 first queries - # print(I[-5:]) # neighbors of the 5 last queries - -if __name__ == '__main__': - basic_test() diff --git a/pyengine/engine/retrieval/tests/test_search.py b/pyengine/engine/retrieval/tests/test_search.py index cd3ed927b8..e0bd3b96c4 100644 --- a/pyengine/engine/retrieval/tests/test_search.py +++ b/pyengine/engine/retrieval/tests/test_search.py @@ -3,6 +3,7 @@ from ..search_index import * import unittest import numpy as np + class TestSearchSingleThread(unittest.TestCase): def test_search_by_vectors(self): d = 64 @@ -31,7 +32,7 @@ def get_dataset(d, nb, nt, nq): """A dataset that is not completely random but still challenging to index """ - d1 = 10 # intrinsic dimension (more or less) + d1 = 10 # intrinsic dimension (more or less) n = nb + nt + nq rs = np.random.RandomState(1338) x = rs.normal(size=(n, d1)) @@ -45,4 +46,4 @@ def get_dataset(d, nb, nt, nq): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main()