From d37f89dc3d1f9c2901421c2a9476d18405602c05 Mon Sep 17 00:00:00 2001 From: "xj.lin" Date: Sun, 24 Mar 2019 19:45:15 +0800 Subject: [PATCH 1/8] add some test --- pyengine/engine/ingestion/build_index.py | 2 +- pyengine/engine/ingestion/tests/__init__.py | 0 pyengine/engine/ingestion/tests/test_build.py | 63 +++++++++++++++++++ pyengine/engine/retrieval/tests/basic_test.py | 3 +- 4 files changed, 65 insertions(+), 3 deletions(-) create mode 100644 pyengine/engine/ingestion/tests/__init__.py create mode 100644 pyengine/engine/ingestion/tests/test_build.py diff --git a/pyengine/engine/ingestion/build_index.py b/pyengine/engine/ingestion/build_index.py index ebb0749927..68db892dd4 100644 --- a/pyengine/engine/ingestion/build_index.py +++ b/pyengine/engine/ingestion/build_index.py @@ -35,7 +35,7 @@ class DefaultIndex(Index): # maybe need to specif parameters pass - def build(d, vectors, DEVICE=INDEX_DEVICES.CPU): + def build(self, d, vectors, DEVICE=INDEX_DEVICES.CPU): index = faiss.IndexFlatL2(d) # trained index.add(vectors) return index diff --git a/pyengine/engine/ingestion/tests/__init__.py b/pyengine/engine/ingestion/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/pyengine/engine/ingestion/tests/test_build.py b/pyengine/engine/ingestion/tests/test_build.py new file mode 100644 index 0000000000..4dc90e95b1 --- /dev/null +++ b/pyengine/engine/ingestion/tests/test_build.py @@ -0,0 +1,63 @@ +from ..build_index import * + +import faiss +import numpy as np +import unittest + + +class TestBuildIndex(unittest.TestCase): + def test_factory_method(self): + pass + + def test_default_index(self): + d = 64 + nb = 10000 + nq = 100 + _, xb, xq = get_dataset(d, nb, 500, nq) + + # Expected result + index = faiss.IndexFlatL2(d) + index.add(xb) + Dref, Iref = index.search(xq, 5) + + builder = DefaultIndex() + index2 = builder.build(d, xb) + Dnew, Inew = index2.search(xq, 5) + + assert np.all(Dnew == Dref) and np.all(Inew == Iref) + + def test_increase(self): + d = 64 + nb = 10000 + nq = 100 + _, xb, xq = get_dataset(d, nb, 500, nq) + + index = faiss.IndexFlatL2(d) + index.add(xb) + + pass + + def test_serialize(self): + pass + + +def get_dataset(d, nb, nt, nq): + """A dataset that is not completely random but still challenging to + index + """ + d1 = 10 # intrinsic dimension (more or less) + n = nb + nt + nq + rs = np.random.RandomState(1338) + x = rs.normal(size=(n, d1)) + x = np.dot(x, rs.rand(d1, d)) + # now we have a d1-dim ellipsoid in d-dimensional space + # higher factor (>4) -> higher frequency -> less linear + x = x * (rs.rand(d) * 4 + 0.1) + x = np.sin(x) + x = x.astype('float32') + return x[:nt], x[nt:-nq], x[-nq:] + + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/pyengine/engine/retrieval/tests/basic_test.py b/pyengine/engine/retrieval/tests/basic_test.py index 1d31a9fb77..9a5bd632b9 100644 --- a/pyengine/engine/retrieval/tests/basic_test.py +++ b/pyengine/engine/retrieval/tests/basic_test.py @@ -77,8 +77,7 @@ faiss.write_index(index, writer) ar_data = faiss.vector_to_array(writer.data) import pickle pickle.dump(ar_data, open("/tmp/faiss/ser_1", "wb")) - -#index_3 = pickle.load("/tmp/faiss/ser_1") +index_3 = pickle.load("/tmp/faiss/ser_1") # index_2 = faiss.IndexFlatL2(d) # build the index From 855d1c613d1e0b66424fe44ed1dab9024c1d54c5 Mon Sep 17 00:00:00 2001 From: "xj.lin" Date: Sun, 24 Mar 2019 20:36:28 +0800 Subject: [PATCH 2/8] add unittest for build/search index --- pyengine/engine/ingestion/tests/test_build.py | 33 +++++++++++-- .../engine/retrieval/tests/scheduler_test.py | 3 -- .../engine/retrieval/tests/test_search.py | 48 +++++++++++++++++++ 3 files changed, 77 insertions(+), 7 deletions(-) delete mode 100644 pyengine/engine/retrieval/tests/scheduler_test.py create mode 100644 pyengine/engine/retrieval/tests/test_search.py diff --git a/pyengine/engine/ingestion/tests/test_build.py b/pyengine/engine/ingestion/tests/test_build.py index 4dc90e95b1..ef349f905a 100644 --- a/pyengine/engine/ingestion/tests/test_build.py +++ b/pyengine/engine/ingestion/tests/test_build.py @@ -7,7 +7,9 @@ import unittest class TestBuildIndex(unittest.TestCase): def test_factory_method(self): - pass + index_builder = FactoryIndex() + index = index_builder() + self.assertIsInstance(index, DefaultIndex) def test_default_index(self): d = 64 @@ -30,15 +32,38 @@ class TestBuildIndex(unittest.TestCase): d = 64 nb = 10000 nq = 100 - _, xb, xq = get_dataset(d, nb, 500, nq) + nt = 500 + xt, xb, xq = get_dataset(d, nb, nt, nq) index = faiss.IndexFlatL2(d) index.add(xb) - pass + assert index.ntotal == nb + + Index.increase(index, xt) + assert index.ntotal == nb + nt def test_serialize(self): - pass + d = 64 + nb = 10000 + nq = 100 + nt = 500 + xt, xb, xq = get_dataset(d, nb, nt, nq) + + index = faiss.IndexFlatL2(d) + index.add(xb) + Dref, Iref = index.search(xq, 5) + + ar_data = Index.serialize(index) + + reader = faiss.VectorIOReader() + faiss.copy_array_to_vector(ar_data, reader.data) + index2 = faiss.read_index(reader) + + Dnew, Inew = index2.search(xq, 5) + + assert np.all(Dnew == Dref) and np.all(Inew == Iref) + def get_dataset(d, nb, nt, nq): diff --git a/pyengine/engine/retrieval/tests/scheduler_test.py b/pyengine/engine/retrieval/tests/scheduler_test.py deleted file mode 100644 index 71dc6e7ba6..0000000000 --- a/pyengine/engine/retrieval/tests/scheduler_test.py +++ /dev/null @@ -1,3 +0,0 @@ -from engine.controller import scheduler - -scheduler.Scheduler.Search() \ No newline at end of file diff --git a/pyengine/engine/retrieval/tests/test_search.py b/pyengine/engine/retrieval/tests/test_search.py new file mode 100644 index 0000000000..cd3ed927b8 --- /dev/null +++ b/pyengine/engine/retrieval/tests/test_search.py @@ -0,0 +1,48 @@ +from ..search_index import * + +import unittest +import numpy as np + +class TestSearchSingleThread(unittest.TestCase): + def test_search_by_vectors(self): + d = 64 + nb = 10000 + nq = 100 + _, xb, xq = get_dataset(d, nb, 500, nq) + + index = faiss.IndexFlatL2(d) + index.add(xb) + + # expect result + Dref, Iref = index.search(xq, 5) + + searcher = FaissSearch(index) + result = searcher.search_by_vectors(xq, 5) + + assert np.all(result.distance == Dref) \ + and np.all(result.vectors == Iref) + pass + + def test_top_k(selfs): + pass + + +def get_dataset(d, nb, nt, nq): + """A dataset that is not completely random but still challenging to + index + """ + d1 = 10 # intrinsic dimension (more or less) + n = nb + nt + nq + rs = np.random.RandomState(1338) + x = rs.normal(size=(n, d1)) + x = np.dot(x, rs.rand(d1, d)) + # now we have a d1-dim ellipsoid in d-dimensional space + # higher factor (>4) -> higher frequency -> less linear + x = x * (rs.rand(d) * 4 + 0.1) + x = np.sin(x) + x = x.astype('float32') + return x[:nt], x[nt:-nq], x[-nq:] + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From da4278779d64a0ce27b77a50fdcaecb63ecd52a5 Mon Sep 17 00:00:00 2001 From: "xj.lin" Date: Sun, 24 Mar 2019 21:08:27 +0800 Subject: [PATCH 3/8] add new test --- pyengine/engine/controller/tests/test_scheduler.py | 3 +++ pyengine/engine/controller/vector_engine.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 pyengine/engine/controller/tests/test_scheduler.py diff --git a/pyengine/engine/controller/tests/test_scheduler.py b/pyengine/engine/controller/tests/test_scheduler.py new file mode 100644 index 0000000000..98d2fe373e --- /dev/null +++ b/pyengine/engine/controller/tests/test_scheduler.py @@ -0,0 +1,3 @@ +import unittest +from ..scheduler import * + diff --git a/pyengine/engine/controller/vector_engine.py b/pyengine/engine/controller/vector_engine.py index b169ae551e..b4847936d7 100644 --- a/pyengine/engine/controller/vector_engine.py +++ b/pyengine/engine/controller/vector_engine.py @@ -98,7 +98,8 @@ class VectorEngine(object): # create index index_builder = build_index.FactoryIndex() - index = index_builder().build(d, raw_data) + index = index_builder().build(d, raw_data) # type: index + index = build_index.Index.serialize(index) # type: array # TODO(jinhai): store index into Cache index_filename = file.filename + '_index' From 925af7e1d2e3da69806d66ac4ee6b6a7d50756f9 Mon Sep 17 00:00:00 2001 From: "xj.lin" Date: Mon, 25 Mar 2019 12:00:24 +0800 Subject: [PATCH 4/8] add schuduler unittest and fix some bug --- pyengine/engine/controller/scheduler.py | 34 +++++------ .../engine/controller/tests/test_scheduler.py | 59 ++++++++++++++++++- pyengine/engine/controller/vector_engine.py | 18 +++--- pyengine/engine/ingestion/serialize.py | 7 +++ pyengine/engine/retrieval/search_index.py | 8 ++- 5 files changed, 98 insertions(+), 28 deletions(-) create mode 100644 pyengine/engine/ingestion/serialize.py diff --git a/pyengine/engine/controller/scheduler.py b/pyengine/engine/controller/scheduler.py index 9c7583c069..dd110b885b 100644 --- a/pyengine/engine/controller/scheduler.py +++ b/pyengine/engine/controller/scheduler.py @@ -1,5 +1,6 @@ from engine.retrieval import search_index from engine.ingestion import build_index +from engine.ingestion import serialize class Singleton(type): _instances = {} @@ -11,9 +12,9 @@ class Singleton(type): class Scheduler(metaclass=Singleton): def Search(self, index_file_key, vectors, k): - assert index_file_key - assert vectors - assert k + # assert index_file_key + # assert vectors + # assert k return self.__scheduler(index_file_key, vectors, k) @@ -21,30 +22,29 @@ class Scheduler(metaclass=Singleton): def __scheduler(self, index_data_key, vectors, k): result_list = [] - raw_data_list = index_data_key['raw'] - index_data_list = index_data_key['index'] + if 'raw' in index_data_key: + raw_vectors = index_data_key['raw'] + d = index_data_key['dimension'] - for key in raw_data_list: - raw_data, d = self.GetRawData(key) + if 'raw' in index_data_key: index_builder = build_index.FactoryIndex() - index = index_builder().build(d, raw_data) - searcher = search_index.FaissSearch(index) # silly + index = index_builder().build(d, raw_vectors) + searcher = search_index.FaissSearch(index) result_list.append(searcher.search_by_vectors(vectors, k)) + index_data_list = index_data_key['index'] for key in index_data_list: - index = self.GetIndexData(key) + index = GetIndexData(key) searcher = search_index.FaissSearch(index) result_list.append(searcher.search_by_vectors(vectors, k)) if len(result_list) == 1: return result_list[0].vectors - result = search_index.top_k(sum(result_list), k) - return result + + # result = search_index.top_k(result_list, k) + return result_list - def GetIndexData(self, key): - pass - - def GetRawData(self, key): - pass +def GetIndexData(key): + return serialize.read_index(key) \ No newline at end of file diff --git a/pyengine/engine/controller/tests/test_scheduler.py b/pyengine/engine/controller/tests/test_scheduler.py index 98d2fe373e..c63749b0d6 100644 --- a/pyengine/engine/controller/tests/test_scheduler.py +++ b/pyengine/engine/controller/tests/test_scheduler.py @@ -1,3 +1,60 @@ -import unittest from ..scheduler import * +import unittest +import faiss +import numpy as np + + +class TestScheduler(unittest.TestCase): + def test_schedule(self): + d = 64 + nb = 10000 + nq = 100 + nt = 5000 + xt, xb, xq = get_dataset(d, nb, nt, nq) + file_name = "/tmp/faiss/tempfile_1" + + + index = faiss.IndexFlatL2(d) + print(index.is_trained) + index.add(xb) + faiss.write_index(index, file_name) + Dref, Iref = index.search(xq, 5) + + index2 = faiss.read_index(file_name) + + schuduler_instance = Scheduler() + + # query args 1 + query_index = dict() + query_index['index'] = [file_name] + vectors = schuduler_instance.Search(query_index, vectors=xq, k=5) + assert np.all(vectors == Iref) + + # query args 2 + query_index = dict() + query_index['raw'] = xt + query_index['dimension'] = d + query_index['index'] = [file_name] + vectors = schuduler_instance.Search(query_index, vectors=xq, k=5) + # print("success") + + +def get_dataset(d, nb, nt, nq): + """A dataset that is not completely random but still challenging to + index + """ + d1 = 10 # intrinsic dimension (more or less) + n = nb + nt + nq + rs = np.random.RandomState(1338) + x = rs.normal(size=(n, d1)) + x = np.dot(x, rs.rand(d1, d)) + # now we have a d1-dim ellipsoid in d-dimensional space + # higher factor (>4) -> higher frequency -> less linear + x = x * (rs.rand(d) * 4 + 0.1) + x = np.sin(x) + x = x.astype('float32') + return x[:nt], x[nt:-nq], x[-nq:] + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/pyengine/engine/controller/vector_engine.py b/pyengine/engine/controller/vector_engine.py index b4847936d7..7af277158e 100644 --- a/pyengine/engine/controller/vector_engine.py +++ b/pyengine/engine/controller/vector_engine.py @@ -8,6 +8,7 @@ from flask import jsonify from engine import db from engine.ingestion import build_index from engine.controller.scheduler import Scheduler +from engine.ingestion import serialize import sys, os class VectorEngine(object): @@ -98,14 +99,15 @@ class VectorEngine(object): # create index index_builder = build_index.FactoryIndex() - index = index_builder().build(d, raw_data) # type: index - index = build_index.Index.serialize(index) # type: array + index = index_builder().build(d, raw_data) # TODO(jinhai): store index into Cache index_filename = file.filename + '_index' + serialize.write_index(file_name=index_filename, index=index) - # TODO(jinhai): Update raw_file_name => index_file_name - FileTable.query.filter(FileTable.group_name == group_id).filter(FileTable.type == 'raw').update({'row_number':file.row_number + 1, 'type': 'index'}) + FileTable.query.filter(FileTable.group_name == group_id).filter(FileTable.type == 'raw').update({'row_number':file.row_number + 1, + 'type': 'index', + 'filename': index_filename}) pass else: @@ -135,13 +137,15 @@ class VectorEngine(object): if code == VectorEngine.FAULT_CODE: return VectorEngine.GROUP_NOT_EXIST + group = GroupTable.query.filter(GroupTable.group_name == group_id).first() + # find all files files = FileTable.query.filter(FileTable.group_name == group_id).all() - raw_keys = [ i.filename for i in files if i.type == 'raw' ] index_keys = [ i.filename for i in files if i.type == 'index' ] index_map = {} - index_map['raw'] = raw_keys - index_map['index'] = index_keys # {raw:[key1, key2], index:[key3, key4]} + index_map['index'] = index_keys + index_map['raw'] = GetVectorListFromRawFile(group_id) + index_map['dimension'] = group.dimension scheduler_instance = Scheduler() result = scheduler_instance.Search(index_map, vector, limit) diff --git a/pyengine/engine/ingestion/serialize.py b/pyengine/engine/ingestion/serialize.py new file mode 100644 index 0000000000..e877b0b91a --- /dev/null +++ b/pyengine/engine/ingestion/serialize.py @@ -0,0 +1,7 @@ +import faiss + +def write_index(index, file_name): + faiss.write_index(index, file_name) + +def read_index(file_name): + return faiss.read_index(file_name) \ No newline at end of file diff --git a/pyengine/engine/retrieval/search_index.py b/pyengine/engine/retrieval/search_index.py index 3137816181..4b6f2ffe42 100644 --- a/pyengine/engine/retrieval/search_index.py +++ b/pyengine/engine/retrieval/search_index.py @@ -7,8 +7,9 @@ class SearchResult(): self.vectors = I def __add__(self, other): - self.distance += other.distance - self.vectors += other.vectors + distance = self.distance + other.distance + vectors = self.vectors + other.vectors + return SearchResult(distance, vectors) class FaissSearch(): @@ -31,6 +32,7 @@ class FaissSearch(): D, I = self.__index.search(vector_list, k) return SearchResult(D, I) - +import heapq def top_k(input, k): + #sorted = heapq.nsmallest(k, input, key=input.key) pass \ No newline at end of file From 5616ec74db065b50df9dfa749ced4bac164694c5 Mon Sep 17 00:00:00 2001 From: "xj.lin" Date: Mon, 25 Mar 2019 16:11:15 +0800 Subject: [PATCH 5/8] fix much bug --- pyengine/engine/controller/scheduler.py | 13 ++++++++++-- .../controller/tests/test_vector_engine.py | 21 ++++++++++++++++--- .../engine/controller/tests/test_views.py | 14 +++++++++++-- pyengine/engine/controller/vector_engine.py | 5 +++-- pyengine/engine/controller/views.py | 4 ++-- pyengine/engine/ingestion/build_index.py | 2 +- pyengine/engine/ingestion/serialize.py | 6 +++++- pyengine/engine/model/group_table.py | 1 - pyengine/engine/settings.py | 2 +- 9 files changed, 53 insertions(+), 15 deletions(-) diff --git a/pyengine/engine/controller/scheduler.py b/pyengine/engine/controller/scheduler.py index dd110b885b..c88f734840 100644 --- a/pyengine/engine/controller/scheduler.py +++ b/pyengine/engine/controller/scheduler.py @@ -1,6 +1,7 @@ from engine.retrieval import search_index from engine.ingestion import build_index from engine.ingestion import serialize +import numpy as np class Singleton(type): _instances = {} @@ -22,25 +23,33 @@ class Scheduler(metaclass=Singleton): def __scheduler(self, index_data_key, vectors, k): result_list = [] + d = None + raw_vectors = None + print("__scheduler: vectors: ", vectors) + query_vectors = np.asarray(vectors).astype('float32') + if 'raw' in index_data_key: raw_vectors = index_data_key['raw'] + raw_vectors = np.asarray(raw_vectors).astype('float32') d = index_data_key['dimension'] if 'raw' in index_data_key: index_builder = build_index.FactoryIndex() + print("d: ", d, " raw_vectors: ", raw_vectors) index = index_builder().build(d, raw_vectors) searcher = search_index.FaissSearch(index) - result_list.append(searcher.search_by_vectors(vectors, k)) + result_list.append(searcher.search_by_vectors(query_vectors, k)) index_data_list = index_data_key['index'] for key in index_data_list: index = GetIndexData(key) searcher = search_index.FaissSearch(index) - result_list.append(searcher.search_by_vectors(vectors, k)) + result_list.append(searcher.search_by_vectors(query_vectors, k)) if len(result_list) == 1: return result_list[0].vectors + total_result = [] # result = search_index.top_k(result_list, k) return result_list diff --git a/pyengine/engine/controller/tests/test_vector_engine.py b/pyengine/engine/controller/tests/test_vector_engine.py index 085a11d94f..0a7d193482 100644 --- a/pyengine/engine/controller/tests/test_vector_engine.py +++ b/pyengine/engine/controller/tests/test_vector_engine.py @@ -11,7 +11,9 @@ logger = logging.getLogger(__name__) class TestVectorEngine: def setup_class(self): self.__vector = [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8] - self.__limit = 3 + self.__vector_2 = [1.2, 2.2, 3.3, 4.5, 5.5, 6.6, 7.8, 8.8] + self.__query_vector = [[1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8],[1.2, 2.2, 3.3, 4.5, 5.5, 6.6, 7.8, 8.8]] + self.__limit = 1 def teardown_class(self): @@ -39,6 +41,7 @@ class TestVectorEngine: # Check the group list code, group_list = VectorEngine.GetGroupList() assert code == VectorEngine.SUCCESS_CODE + print("group_list: ", group_list) assert group_list == [{'group_name': 'test_group', 'file_number': 0}] # Add Vector for not exist group @@ -46,11 +49,23 @@ class TestVectorEngine: assert code == VectorEngine.GROUP_NOT_EXIST # Add vector for exist group - code = VectorEngine.AddVector('test_group', self.__vector) + code = VectorEngine.AddVector('test_group', self.__vector_2) + assert code == VectorEngine.SUCCESS_CODE + + # Add vector for exist group + code = VectorEngine.AddVector('test_group', self.__vector_2) + assert code == VectorEngine.SUCCESS_CODE + + # Add vector for exist group + code = VectorEngine.AddVector('test_group', self.__vector_2) + assert code == VectorEngine.SUCCESS_CODE + + # Add vector for exist group + code = VectorEngine.AddVector('test_group', self.__vector_2) assert code == VectorEngine.SUCCESS_CODE # Check search vector interface - code, vector_id = VectorEngine.SearchVector('test_group', self.__vector, self.__limit) + code, vector_id = VectorEngine.SearchVector('test_group', self.__query_vector, self.__limit) assert code == VectorEngine.SUCCESS_CODE assert vector_id == 0 diff --git a/pyengine/engine/controller/tests/test_views.py b/pyengine/engine/controller/tests/test_views.py index 6ae2bc6b4b..86584e28cc 100644 --- a/pyengine/engine/controller/tests/test_views.py +++ b/pyengine/engine/controller/tests/test_views.py @@ -52,12 +52,22 @@ class TestViews: assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 + vector = {"vector": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8]} + resp = test_client.post('/vector/add/6', data=json.dumps(vector)) + assert resp.status_code == 200 + assert self.loads(resp)['code'] == 0 + + vector = {"vector": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8]} + resp = test_client.post('/vector/add/6', data=json.dumps(vector)) + assert resp.status_code == 200 + assert self.loads(resp)['code'] == 0 + resp = test_client.post('/vector/index/6') assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 - limit = {"vector": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8], "limit": 3} - resp = test_client.get('/vector/search/6', data=json.dumps(limit)) + limit = {"vector": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8], "limit": 1} + resp = test_client.post('/vector/search/6', data=json.dumps(limit)) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 assert self.loads(resp)['vector_id'] == 0 diff --git a/pyengine/engine/controller/vector_engine.py b/pyengine/engine/controller/vector_engine.py index 7af277158e..0d1e32f7b8 100644 --- a/pyengine/engine/controller/vector_engine.py +++ b/pyengine/engine/controller/vector_engine.py @@ -94,7 +94,7 @@ class VectorEngine(object): # check if the file can be indexed if file.row_number + 1 >= ROW_LIMIT: - raw_data = GetVectorListFromRawFile(group_id) + raw_data = VectorEngine.GetVectorListFromRawFile(group_id) d = group.dimension # create index @@ -144,7 +144,7 @@ class VectorEngine(object): index_keys = [ i.filename for i in files if i.type == 'index' ] index_map = {} index_map['index'] = index_keys - index_map['raw'] = GetVectorListFromRawFile(group_id) + index_map['raw'] = VectorEngine.GetVectorListFromRawFile(group_id, "fakename") index_map['dimension'] = group.dimension scheduler_instance = Scheduler() @@ -189,6 +189,7 @@ class VectorEngine(object): @staticmethod def GetVectorListFromRawFile(group_id, filename="todo"): return VectorEngine.group_dict[group_id] + # return serialize.to_array(VectorEngine.group_dict[group_id]) @staticmethod def ClearRawFile(group_id): diff --git a/pyengine/engine/controller/views.py b/pyengine/engine/controller/views.py index 87e51f1ee5..477bb32e49 100644 --- a/pyengine/engine/controller/views.py +++ b/pyengine/engine/controller/views.py @@ -28,9 +28,9 @@ class VectorSearch(Resource): self.__parser.add_argument('vector', type=float, action='append', location=['json']) self.__parser.add_argument('limit', type=int, action='append', location=['json']) - def get(self, group_id): + def post(self, group_id): args = self.__parser.parse_args() - print('vector: ', args['vector']) + print('VectorSearch vector: ', args['vector']) # go to search every thing code, vector_id = VectorEngine.SearchVector(group_id, args['vector'], args['limit']) return jsonify({'code': code, 'vector_id': vector_id}) diff --git a/pyengine/engine/ingestion/build_index.py b/pyengine/engine/ingestion/build_index.py index 68db892dd4..8e228bb0a7 100644 --- a/pyengine/engine/ingestion/build_index.py +++ b/pyengine/engine/ingestion/build_index.py @@ -15,7 +15,7 @@ def FactoryIndex(index_name="DefaultIndex"): class Index(): - def build(d, vectors, DEVICE=INDEX_DEVICES.CPU): + def build(self, d, vectors, DEVICE=INDEX_DEVICES.CPU): pass @staticmethod diff --git a/pyengine/engine/ingestion/serialize.py b/pyengine/engine/ingestion/serialize.py index e877b0b91a..bac58b66da 100644 --- a/pyengine/engine/ingestion/serialize.py +++ b/pyengine/engine/ingestion/serialize.py @@ -1,7 +1,11 @@ import faiss +import numpy as np def write_index(index, file_name): faiss.write_index(index, file_name) def read_index(file_name): - return faiss.read_index(file_name) \ No newline at end of file + return faiss.read_index(file_name) + +def to_array(vec): + return np.asarray(vec).astype('float32') \ No newline at end of file diff --git a/pyengine/engine/model/group_table.py b/pyengine/engine/model/group_table.py index 11f5674e4b..3fe7d4f337 100644 --- a/pyengine/engine/model/group_table.py +++ b/pyengine/engine/model/group_table.py @@ -12,7 +12,6 @@ class GroupTable(db.Model): self.group_name = group_name self.dimension = dimension self.file_number = 0 - self.dimension = 0 def __repr__(self): diff --git a/pyengine/engine/settings.py b/pyengine/engine/settings.py index 052fca6115..a73af880b2 100644 --- a/pyengine/engine/settings.py +++ b/pyengine/engine/settings.py @@ -6,4 +6,4 @@ SQLALCHEMY_TRACK_MODIFICATIONS = False SQLALCHEMY_DATABASE_URI = "mysql+pymysql://vecwise@127.0.0.1:3306/vecdata" ROW_LIMIT = 10000000 -DATABASE_DIRECTORY = '/home/jinhai/disk0/vecwise/db' \ No newline at end of file +DATABASE_DIRECTORY = '/tmp' \ No newline at end of file From d490058068d6a01cd46e870cadc7321fdf459029 Mon Sep 17 00:00:00 2001 From: jinhai Date: Mon, 25 Mar 2019 16:32:18 +0800 Subject: [PATCH 6/8] Fix bugs --- .../engine/controller/tests/test_views.py | 23 ++++++++++--------- pyengine/engine/controller/views.py | 3 ++- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pyengine/engine/controller/tests/test_views.py b/pyengine/engine/controller/tests/test_views.py index 86584e28cc..eb66454676 100644 --- a/pyengine/engine/controller/tests/test_views.py +++ b/pyengine/engine/controller/tests/test_views.py @@ -11,6 +11,7 @@ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(le logger = logging.getLogger(__name__) class TestViews: + HEADERS = {'Content-Type': 'application/json'} def loads(self, resp): return json.loads(resp.data.decode()) @@ -18,25 +19,25 @@ class TestViews: def test_group(self, test_client): data = {"dimension": 10} - resp = test_client.get('/vector/group/6') + resp = test_client.get('/vector/group/6', headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 1 - resp = test_client.post('/vector/group/6', data=json.dumps(data)) + resp = test_client.post('/vector/group/6', data=json.dumps(data), headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 - resp = test_client.get('/vector/group/6') + resp = test_client.get('/vector/group/6', headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 # GroupList - resp = test_client.get('/vector/group') + resp = test_client.get('/vector/group', headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 assert self.loads(resp)['group_list'] == [{'file_number': 0, 'group_name': '6'}] - resp = test_client.delete('/vector/group/6') + resp = test_client.delete('/vector/group/6', headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 @@ -48,31 +49,31 @@ class TestViews: assert self.loads(resp)['code'] == 0 vector = {"vector": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8]} - resp = test_client.post('/vector/add/6', data=json.dumps(vector)) + resp = test_client.post('/vector/add/6', data=json.dumps(vector), headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 vector = {"vector": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8]} - resp = test_client.post('/vector/add/6', data=json.dumps(vector)) + resp = test_client.post('/vector/add/6', data=json.dumps(vector), headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 vector = {"vector": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8]} - resp = test_client.post('/vector/add/6', data=json.dumps(vector)) + resp = test_client.post('/vector/add/6', data=json.dumps(vector), headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 - resp = test_client.post('/vector/index/6') + resp = test_client.post('/vector/index/6', headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 limit = {"vector": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8], "limit": 1} - resp = test_client.post('/vector/search/6', data=json.dumps(limit)) + resp = test_client.get('/vector/search/6', data=json.dumps(limit), headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 assert self.loads(resp)['vector_id'] == 0 - resp = test_client.delete('/vector/group/6') + resp = test_client.delete('/vector/group/6', headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 diff --git a/pyengine/engine/controller/views.py b/pyengine/engine/controller/views.py index 477bb32e49..3d596fb6d5 100644 --- a/pyengine/engine/controller/views.py +++ b/pyengine/engine/controller/views.py @@ -28,9 +28,10 @@ class VectorSearch(Resource): self.__parser.add_argument('vector', type=float, action='append', location=['json']) self.__parser.add_argument('limit', type=int, action='append', location=['json']) - def post(self, group_id): + def get(self, group_id): args = self.__parser.parse_args() print('VectorSearch vector: ', args['vector']) + print('limit: ', args['limit']) # go to search every thing code, vector_id = VectorEngine.SearchVector(group_id, args['vector'], args['limit']) return jsonify({'code': code, 'vector_id': vector_id}) From dbfb4f20574f92362a7127a24fbced07d322b8f9 Mon Sep 17 00:00:00 2001 From: "xj.lin" Date: Mon, 25 Mar 2019 16:34:40 +0800 Subject: [PATCH 7/8] add list => array --- pyengine/engine/controller/scheduler.py | 21 +++++++------------ .../controller/tests/test_vector_engine.py | 5 ++++- pyengine/engine/controller/vector_engine.py | 5 ++--- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pyengine/engine/controller/scheduler.py b/pyengine/engine/controller/scheduler.py index c88f734840..3eab1b49b3 100644 --- a/pyengine/engine/controller/scheduler.py +++ b/pyengine/engine/controller/scheduler.py @@ -1,7 +1,7 @@ from engine.retrieval import search_index from engine.ingestion import build_index from engine.ingestion import serialize -import numpy as np + class Singleton(type): _instances = {} @@ -15,36 +15,29 @@ class Scheduler(metaclass=Singleton): def Search(self, index_file_key, vectors, k): # assert index_file_key # assert vectors - # assert k + assert k != 0 - return self.__scheduler(index_file_key, vectors, k) + query_vectors = serialize.to_array(vectors) + + return self.__scheduler(index_file_key, query_vectors, k) def __scheduler(self, index_data_key, vectors, k): result_list = [] - d = None - raw_vectors = None - print("__scheduler: vectors: ", vectors) - query_vectors = np.asarray(vectors).astype('float32') - if 'raw' in index_data_key: raw_vectors = index_data_key['raw'] - raw_vectors = np.asarray(raw_vectors).astype('float32') d = index_data_key['dimension'] - - if 'raw' in index_data_key: index_builder = build_index.FactoryIndex() - print("d: ", d, " raw_vectors: ", raw_vectors) index = index_builder().build(d, raw_vectors) searcher = search_index.FaissSearch(index) - result_list.append(searcher.search_by_vectors(query_vectors, k)) + result_list.append(searcher.search_by_vectors(vectors, k)) index_data_list = index_data_key['index'] for key in index_data_list: index = GetIndexData(key) searcher = search_index.FaissSearch(index) - result_list.append(searcher.search_by_vectors(query_vectors, k)) + result_list.append(searcher.search_by_vectors(vectors, k)) if len(result_list) == 1: return result_list[0].vectors diff --git a/pyengine/engine/controller/tests/test_vector_engine.py b/pyengine/engine/controller/tests/test_vector_engine.py index 0a7d193482..25eb5a9ed6 100644 --- a/pyengine/engine/controller/tests/test_vector_engine.py +++ b/pyengine/engine/controller/tests/test_vector_engine.py @@ -3,6 +3,7 @@ from engine.settings import DATABASE_DIRECTORY from flask import jsonify import pytest import os +import numpy as np import logging logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') @@ -104,10 +105,12 @@ class TestVectorEngine: expected_list = [self.__vector] vector_list = VectorEngine.GetVectorListFromRawFile('test_group', filename) + print('expected_list: ', expected_list) print('vector_list: ', vector_list) + expected_list = np.asarray(expected_list).astype('float32') - assert vector_list == expected_list + assert np.all(vector_list == expected_list) code = VectorEngine.ClearRawFile('test_group') assert code == VectorEngine.SUCCESS_CODE diff --git a/pyengine/engine/controller/vector_engine.py b/pyengine/engine/controller/vector_engine.py index 0d1e32f7b8..9a44b2c02b 100644 --- a/pyengine/engine/controller/vector_engine.py +++ b/pyengine/engine/controller/vector_engine.py @@ -144,7 +144,7 @@ class VectorEngine(object): index_keys = [ i.filename for i in files if i.type == 'index' ] index_map = {} index_map['index'] = index_keys - index_map['raw'] = VectorEngine.GetVectorListFromRawFile(group_id, "fakename") + index_map['raw'] = VectorEngine.GetVectorListFromRawFile(group_id, "fakename") #TODO: pass by key, get from storage index_map['dimension'] = group.dimension scheduler_instance = Scheduler() @@ -188,8 +188,7 @@ class VectorEngine(object): @staticmethod def GetVectorListFromRawFile(group_id, filename="todo"): - return VectorEngine.group_dict[group_id] - # return serialize.to_array(VectorEngine.group_dict[group_id]) + return serialize.to_array(VectorEngine.group_dict[group_id]) @staticmethod def ClearRawFile(group_id): From 06ca60aa8d1b323aedcef237113e9f47f85afd36 Mon Sep 17 00:00:00 2001 From: "xj.lin" Date: Mon, 25 Mar 2019 19:20:35 +0800 Subject: [PATCH 8/8] unittest pass --- pyengine/engine/controller/scheduler.py | 12 ++++++------ .../engine/controller/tests/test_vector_engine.py | 12 +++++------- pyengine/engine/controller/tests/test_views.py | 4 ++-- pyengine/engine/controller/vector_engine.py | 4 +++- pyengine/engine/controller/views.py | 4 ++-- pyengine/engine/retrieval/tests/scheduler_test.py | 3 --- 6 files changed, 18 insertions(+), 21 deletions(-) delete mode 100644 pyengine/engine/retrieval/tests/scheduler_test.py diff --git a/pyengine/engine/controller/scheduler.py b/pyengine/engine/controller/scheduler.py index 3eab1b49b3..275284da6b 100644 --- a/pyengine/engine/controller/scheduler.py +++ b/pyengine/engine/controller/scheduler.py @@ -18,7 +18,6 @@ class Scheduler(metaclass=Singleton): assert k != 0 query_vectors = serialize.to_array(vectors) - return self.__scheduler(index_file_key, query_vectors, k) @@ -33,11 +32,12 @@ class Scheduler(metaclass=Singleton): searcher = search_index.FaissSearch(index) result_list.append(searcher.search_by_vectors(vectors, k)) - index_data_list = index_data_key['index'] - for key in index_data_list: - index = GetIndexData(key) - searcher = search_index.FaissSearch(index) - result_list.append(searcher.search_by_vectors(vectors, k)) + if 'index' in index_data_key: + index_data_list = index_data_key['index'] + for key in index_data_list: + index = GetIndexData(key) + searcher = search_index.FaissSearch(index) + result_list.append(searcher.search_by_vectors(vectors, k)) if len(result_list) == 1: return result_list[0].vectors diff --git a/pyengine/engine/controller/tests/test_vector_engine.py b/pyengine/engine/controller/tests/test_vector_engine.py index 25eb5a9ed6..5bd957ecb5 100644 --- a/pyengine/engine/controller/tests/test_vector_engine.py +++ b/pyengine/engine/controller/tests/test_vector_engine.py @@ -12,8 +12,6 @@ logger = logging.getLogger(__name__) class TestVectorEngine: def setup_class(self): self.__vector = [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8] - self.__vector_2 = [1.2, 2.2, 3.3, 4.5, 5.5, 6.6, 7.8, 8.8] - self.__query_vector = [[1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8],[1.2, 2.2, 3.3, 4.5, 5.5, 6.6, 7.8, 8.8]] self.__limit = 1 @@ -50,23 +48,23 @@ class TestVectorEngine: assert code == VectorEngine.GROUP_NOT_EXIST # Add vector for exist group - code = VectorEngine.AddVector('test_group', self.__vector_2) + code = VectorEngine.AddVector('test_group', self.__vector) assert code == VectorEngine.SUCCESS_CODE # Add vector for exist group - code = VectorEngine.AddVector('test_group', self.__vector_2) + code = VectorEngine.AddVector('test_group', self.__vector) assert code == VectorEngine.SUCCESS_CODE # Add vector for exist group - code = VectorEngine.AddVector('test_group', self.__vector_2) + code = VectorEngine.AddVector('test_group', self.__vector) assert code == VectorEngine.SUCCESS_CODE # Add vector for exist group - code = VectorEngine.AddVector('test_group', self.__vector_2) + code = VectorEngine.AddVector('test_group', self.__vector) assert code == VectorEngine.SUCCESS_CODE # Check search vector interface - code, vector_id = VectorEngine.SearchVector('test_group', self.__query_vector, self.__limit) + code, vector_id = VectorEngine.SearchVector('test_group', self.__vector, self.__limit) assert code == VectorEngine.SUCCESS_CODE assert vector_id == 0 diff --git a/pyengine/engine/controller/tests/test_views.py b/pyengine/engine/controller/tests/test_views.py index eb66454676..1f28cb1b94 100644 --- a/pyengine/engine/controller/tests/test_views.py +++ b/pyengine/engine/controller/tests/test_views.py @@ -43,8 +43,8 @@ class TestViews: def test_vector(self, test_client): - dimension = {"dimension": 10} - resp = test_client.post('/vector/group/6', data=json.dumps(dimension)) + dimension = {"dimension": 8} + resp = test_client.post('/vector/group/6', data=json.dumps(dimension), headers = TestViews.HEADERS) assert resp.status_code == 200 assert self.loads(resp)['code'] == 0 diff --git a/pyengine/engine/controller/vector_engine.py b/pyengine/engine/controller/vector_engine.py index 9a44b2c02b..50a7e98046 100644 --- a/pyengine/engine/controller/vector_engine.py +++ b/pyengine/engine/controller/vector_engine.py @@ -148,7 +148,9 @@ class VectorEngine(object): index_map['dimension'] = group.dimension scheduler_instance = Scheduler() - result = scheduler_instance.Search(index_map, vector, limit) + vectors = [] + vectors.append(vector) + result = scheduler_instance.Search(index_map, vectors, limit) vector_id = 0 diff --git a/pyengine/engine/controller/views.py b/pyengine/engine/controller/views.py index 3d596fb6d5..425b337d3c 100644 --- a/pyengine/engine/controller/views.py +++ b/pyengine/engine/controller/views.py @@ -26,7 +26,7 @@ class VectorSearch(Resource): def __init__(self): self.__parser = reqparse.RequestParser() self.__parser.add_argument('vector', type=float, action='append', location=['json']) - self.__parser.add_argument('limit', type=int, action='append', location=['json']) + self.__parser.add_argument('limit', type=int, location=['json']) def get(self, group_id): args = self.__parser.parse_args() @@ -51,7 +51,7 @@ class Group(Resource): def __init__(self): self.__parser = reqparse.RequestParser() self.__parser.add_argument('group_id', type=str) - self.__parser.add_argument('dimension', type=int, action='append', location=['json']) + self.__parser.add_argument('dimension', type=int, location=['json']) def post(self, group_id): args = self.__parser.parse_args() diff --git a/pyengine/engine/retrieval/tests/scheduler_test.py b/pyengine/engine/retrieval/tests/scheduler_test.py deleted file mode 100644 index 2644cbec02..0000000000 --- a/pyengine/engine/retrieval/tests/scheduler_test.py +++ /dev/null @@ -1,3 +0,0 @@ -from engine.controller import scheduler - -# scheduler.Scheduler.Search() \ No newline at end of file