From 39be106baa872cdee3dbf63999c083754f55f2f1 Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Tue, 30 Apr 2019 14:34:04 +0800 Subject: [PATCH 1/6] refactor(db): code optimization Former-commit-id: 320733b50aba32e5bd99853933e7e1eafdbcdbcf --- cpp/src/db/DBImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/db/DBImpl.cpp b/cpp/src/db/DBImpl.cpp index 88bd9b8d95..859a1d75aa 100644 --- a/cpp/src/db/DBImpl.cpp +++ b/cpp/src/db/DBImpl.cpp @@ -228,7 +228,7 @@ Status DBImpl::merge_files(const std::string& group_id, const meta::DateT& date, for (auto& file : files) { auto to_merge = zilliz::vecwise::cache::CpuCacheMgr::GetInstance()->GetIndex(file.location); if (!to_merge) { - to_merge = read_index(file.location.c_str()); + to_merge = read_index(file.location); } auto file_index = dynamic_cast(to_merge->data().get()); index->add_with_ids(file_index->ntotal, dynamic_cast(file_index->index)->xb.data(), From cf19e90af396e18f27877e2394db4aa7845faa3e Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Tue, 30 Apr 2019 15:19:17 +0800 Subject: [PATCH 2/6] feat(db): add serializer Former-commit-id: ab412bbb430e7711ddee0ad26b34f7b3b6c43582 --- cpp/src/db/FaissSerializer.cpp | 24 ++++++++++++++++++++++++ cpp/src/db/FaissSerializer.h | 28 ++++++++++++++++++++++++++++ cpp/src/db/Serializer.cpp | 21 +++++++++++++++++++++ cpp/src/db/Serializer.h | 23 +++++++++++++++++++++++ 4 files changed, 96 insertions(+) create mode 100644 cpp/src/db/FaissSerializer.cpp create mode 100644 cpp/src/db/FaissSerializer.h create mode 100644 cpp/src/db/Serializer.cpp create mode 100644 cpp/src/db/Serializer.h diff --git a/cpp/src/db/FaissSerializer.cpp b/cpp/src/db/FaissSerializer.cpp new file mode 100644 index 0000000000..6bc8b487a1 --- /dev/null +++ b/cpp/src/db/FaissSerializer.cpp @@ -0,0 +1,24 @@ +#include +#include + +#include "FaissSerializer.h" + +namespace zilliz { +namespace vecwise { +namespace engine { + +const std::string IndexType = "IDMap,Flat"; + +FaissSerializer::FaissSerializer(uint16_t dimension) + : pIndex_(faiss::index_factory(dimension, IndexType.c_str())) { +} + +bool FaissSerializer::AddWithIds(long n, const float *xdata, const long *xids) { + pIndex_->add_with_ids(n, xdata, xids); + return true; +} + + +} // namespace engine +} // namespace vecwise +} // namespace zilliz diff --git a/cpp/src/db/FaissSerializer.h b/cpp/src/db/FaissSerializer.h new file mode 100644 index 0000000000..fa13dad0a1 --- /dev/null +++ b/cpp/src/db/FaissSerializer.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +#include "Serializer.h" + +namespace faiss { + class Index; +} + +namespace zilliz { +namespace vecwise { +namespace engine { + +class FaissSerializer : public Serializer { +public: + FaissSerializer(uint16_t dimension); + virtual bool AddWithIds(long n, const float *xdata, const long *xids) override; + +protected: + std::shared_ptr pIndex_; +}; + + +} // namespace engine +} // namespace vecwise +} // namespace zilliz diff --git a/cpp/src/db/Serializer.cpp b/cpp/src/db/Serializer.cpp new file mode 100644 index 0000000000..5a60defd50 --- /dev/null +++ b/cpp/src/db/Serializer.cpp @@ -0,0 +1,21 @@ +#include +#include "Serializer.h" + +namespace zilliz { +namespace vecwise { +namespace engine { + +bool Serializer::AddWithIds(const std::vector& vectors, const std::vector& vector_ids) { + long n1 = (long)vectors.size(); + long n2 = (long)vector_ids.size(); + if (n1 != n2) { + LOG(ERROR) << "vectors size is not equal to the size of vector_ids: " << n1 << "!=" << n2; + return false; + } + return AddWithIds(n1, vectors.data(), vector_ids.data()); +} + + +} // namespace engine +} // namespace vecwise +} // namespace zilliz diff --git a/cpp/src/db/Serializer.h b/cpp/src/db/Serializer.h new file mode 100644 index 0000000000..b7760fe9bc --- /dev/null +++ b/cpp/src/db/Serializer.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace zilliz { +namespace vecwise { +namespace engine { + +class Serializer { +public: + + bool AddWithIds(const std::vector& vectors, + const std::vector& vector_ids); + + virtual bool AddWithIds(long n, const float *xdata, const long *xids) = 0; + + virtual ~Serializer() {} +}; + + +} // namespace engine +} // namespace vecwise +} // namespace zilliz From 052f7e2f11c7f39d1039a2118c7ed5f056c41f60 Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Tue, 30 Apr 2019 15:43:08 +0800 Subject: [PATCH 3/6] feat(db): add more api for serializer Former-commit-id: d15d7dfecb9964ca2e3ba4e5b469137d1cc85057 --- cpp/src/db/FaissSerializer.cpp | 23 +++++++++++++++++++---- cpp/src/db/FaissSerializer.h | 11 +++++++++-- cpp/src/db/Serializer.cpp | 4 ++-- cpp/src/db/Serializer.h | 12 ++++++++++-- cpp/src/db/Status.h | 5 +++++ 5 files changed, 45 insertions(+), 10 deletions(-) diff --git a/cpp/src/db/FaissSerializer.cpp b/cpp/src/db/FaissSerializer.cpp index 6bc8b487a1..fee7750469 100644 --- a/cpp/src/db/FaissSerializer.cpp +++ b/cpp/src/db/FaissSerializer.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "FaissSerializer.h" @@ -9,13 +10,27 @@ namespace engine { const std::string IndexType = "IDMap,Flat"; -FaissSerializer::FaissSerializer(uint16_t dimension) - : pIndex_(faiss::index_factory(dimension, IndexType.c_str())) { +FaissSerializer::FaissSerializer(uint16_t dimension, const std::string& location) + : pIndex_(faiss::index_factory(dimension, IndexType.c_str())), + location_(location) { } -bool FaissSerializer::AddWithIds(long n, const float *xdata, const long *xids) { +Status FaissSerializer::AddWithIds(long n, const float *xdata, const long *xids) { pIndex_->add_with_ids(n, xdata, xids); - return true; + return Status::OK(); +} + +size_t FaissSerializer::Count() const { + return (size_t)(pIndex_->ntotal); +} + +size_t FaissSerializer::Size() const { + return (size_t)(Count() * pIndex_->d); +} + +Status FaissSerializer::Serialize() { + write_index(pIndex_.get(), location_.c_str()); + return Status::OK(); } diff --git a/cpp/src/db/FaissSerializer.h b/cpp/src/db/FaissSerializer.h index fa13dad0a1..a56779996e 100644 --- a/cpp/src/db/FaissSerializer.h +++ b/cpp/src/db/FaissSerializer.h @@ -15,11 +15,18 @@ namespace engine { class FaissSerializer : public Serializer { public: - FaissSerializer(uint16_t dimension); - virtual bool AddWithIds(long n, const float *xdata, const long *xids) override; + FaissSerializer(uint16_t dimension, const std::string& location); + virtual Status AddWithIds(long n, const float *xdata, const long *xids) override; + + virtual size_t Count() const override; + + virtual size_t Size() const override; + + virtual Status Serialize() override; protected: std::shared_ptr pIndex_; + std::string location_; }; diff --git a/cpp/src/db/Serializer.cpp b/cpp/src/db/Serializer.cpp index 5a60defd50..595cd4731c 100644 --- a/cpp/src/db/Serializer.cpp +++ b/cpp/src/db/Serializer.cpp @@ -5,12 +5,12 @@ namespace zilliz { namespace vecwise { namespace engine { -bool Serializer::AddWithIds(const std::vector& vectors, const std::vector& vector_ids) { +Status Serializer::AddWithIds(const std::vector& vectors, const std::vector& vector_ids) { long n1 = (long)vectors.size(); long n2 = (long)vector_ids.size(); if (n1 != n2) { LOG(ERROR) << "vectors size is not equal to the size of vector_ids: " << n1 << "!=" << n2; - return false; + return Status::Error("Error: AddWithIds"); } return AddWithIds(n1, vectors.data(), vector_ids.data()); } diff --git a/cpp/src/db/Serializer.h b/cpp/src/db/Serializer.h index b7760fe9bc..cb2891be2e 100644 --- a/cpp/src/db/Serializer.h +++ b/cpp/src/db/Serializer.h @@ -2,6 +2,8 @@ #include +#include "Status.h" + namespace zilliz { namespace vecwise { namespace engine { @@ -9,10 +11,16 @@ namespace engine { class Serializer { public: - bool AddWithIds(const std::vector& vectors, + Status AddWithIds(const std::vector& vectors, const std::vector& vector_ids); - virtual bool AddWithIds(long n, const float *xdata, const long *xids) = 0; + virtual Status AddWithIds(long n, const float *xdata, const long *xids) = 0; + + virtual size_t Count() const = 0; + + virtual size_t Size() const = 0; + + virtual Status Serialize() = 0; virtual ~Serializer() {} }; diff --git a/cpp/src/db/Status.h b/cpp/src/db/Status.h index f45c9f6bd1..4db2b4c6e0 100644 --- a/cpp/src/db/Status.h +++ b/cpp/src/db/Status.h @@ -21,6 +21,9 @@ public: static Status NotFound(const std::string& msg, const std::string& msg2="") { return Status(kNotFound, msg, msg2); } + static Status Error(const std::string& msg, const std::string& msg2="") { + return Status(kError, msg, msg2); + } static Status InvalidDBPath(const std::string& msg, const std::string& msg2="") { return Status(kInvalidDBPath, msg, msg2); @@ -35,6 +38,7 @@ public: bool ok() const { return state_ == nullptr; } bool IsNotFound() const { return code() == kNotFound; } + bool IsError() const { return code() == kError; } bool IsInvalidDBPath() const { return code() == kInvalidDBPath; } bool IsGroupError() const { return code() == kGroupError; } @@ -48,6 +52,7 @@ private: enum Code { kOK = 0, kNotFound, + kError, kInvalidDBPath, kGroupError, From 713d1ec6b798d697accd1f5a7a27885e406cf66a Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Tue, 30 Apr 2019 15:50:24 +0800 Subject: [PATCH 4/6] feat(db): add cache Former-commit-id: 2dd8078d8df484ddac54379bc6c84c015c1fc530 --- cpp/src/db/FaissSerializer.cpp | 7 +++++++ cpp/src/db/FaissSerializer.h | 2 ++ cpp/src/db/Serializer.h | 2 ++ 3 files changed, 11 insertions(+) diff --git a/cpp/src/db/FaissSerializer.cpp b/cpp/src/db/FaissSerializer.cpp index fee7750469..b2abb3a19c 100644 --- a/cpp/src/db/FaissSerializer.cpp +++ b/cpp/src/db/FaissSerializer.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include "FaissSerializer.h" @@ -33,6 +34,12 @@ Status FaissSerializer::Serialize() { return Status::OK(); } +Status FaissSerializer::Cache() { + zilliz::vecwise::cache::CpuCacheMgr::GetInstance( + )->InsertItem(location_, std::make_shared(pIndex_)); + + return Status::OK(); +} } // namespace engine } // namespace vecwise diff --git a/cpp/src/db/FaissSerializer.h b/cpp/src/db/FaissSerializer.h index a56779996e..d3a255a0bb 100644 --- a/cpp/src/db/FaissSerializer.h +++ b/cpp/src/db/FaissSerializer.h @@ -24,6 +24,8 @@ public: virtual Status Serialize() override; + virtual Status Cache() override; + protected: std::shared_ptr pIndex_; std::string location_; diff --git a/cpp/src/db/Serializer.h b/cpp/src/db/Serializer.h index cb2891be2e..dcb32e0b05 100644 --- a/cpp/src/db/Serializer.h +++ b/cpp/src/db/Serializer.h @@ -22,6 +22,8 @@ public: virtual Status Serialize() = 0; + virtual Status Cache() = 0; + virtual ~Serializer() {} }; From 204c68879cc72a0175287a6589bbb546b834dea8 Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Tue, 30 Apr 2019 15:58:00 +0800 Subject: [PATCH 5/6] refactor(db): replace faiss with serializer wrapper Former-commit-id: 62b5a7d8353c2b0ef4017f43d03d1c6944685400 --- cpp/src/db/MemManager.cpp | 23 +++++++---------------- cpp/src/db/MemManager.h | 8 +++----- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/cpp/src/db/MemManager.cpp b/cpp/src/db/MemManager.cpp index e3c9407d86..88f1df7122 100644 --- a/cpp/src/db/MemManager.cpp +++ b/cpp/src/db/MemManager.cpp @@ -1,14 +1,12 @@ -#include #include #include #include -#include -#include #include #include "MemManager.h" #include "Meta.h" +#include "FaissSerializer.h" namespace zilliz { @@ -21,43 +19,36 @@ MemVectors::MemVectors(const std::shared_ptr& meta_ptr, options_(options), schema_(schema), _pIdGenerator(new SimpleIDGenerator()), - pIndex_(faiss::index_factory(schema_.dimension, "IDMap,Flat")) { + pSerializer_(new FaissSerializer(schema_.dimension, schema_.location)) { } void MemVectors::add(size_t n_, const float* vectors_, IDNumbers& vector_ids_) { _pIdGenerator->getNextIDNumbers(n_, vector_ids_); - pIndex_->add_with_ids(n_, vectors_, &vector_ids_[0]); + pSerializer_->AddWithIds(n_, vectors_, vector_ids_.data()); for(auto i=0 ; intotal; + return pSerializer_->Count(); } size_t MemVectors::approximate_size() const { - return total() * schema_.dimension; + return pSerializer_->Size(); } Status MemVectors::serialize(std::string& group_id) { - /* std::stringstream ss; */ - /* ss << "/tmp/test/" << _pIdGenerator->getNextIDNumber(); */ - /* faiss::write_index(pIndex_, ss.str().c_str()); */ - /* std::cout << pIndex_->ntotal << std::endl; */ - /* std::cout << _file_location << std::endl; */ - /* faiss::write_index(pIndex_, _file_location.c_str()); */ group_id = schema_.group_id; auto rows = approximate_size(); - write_index(pIndex_.get(), schema_.location.c_str()); + pSerializer_->Serialize(); schema_.rows = rows; schema_.file_type = (rows >= options_.index_trigger_size) ? meta::GroupFileSchema::TO_INDEX : meta::GroupFileSchema::RAW; auto status = pMeta_->update_group_file(schema_); - zilliz::vecwise::cache::CpuCacheMgr::GetInstance( - )->InsertItem(schema_.location, std::make_shared(pIndex_)); + pSerializer_->Cache(); return status; } diff --git a/cpp/src/db/MemManager.h b/cpp/src/db/MemManager.h index 077e045286..e29b09817c 100644 --- a/cpp/src/db/MemManager.h +++ b/cpp/src/db/MemManager.h @@ -10,10 +10,6 @@ #include "Status.h" #include "Meta.h" -namespace faiss { - class Index; -} - namespace zilliz { namespace vecwise { @@ -23,6 +19,8 @@ namespace meta { class Meta; } +class Serializer; + class MemVectors { public: explicit MemVectors(const std::shared_ptr&, @@ -49,7 +47,7 @@ private: Options options_; meta::GroupFileSchema schema_; IDGenerator* _pIdGenerator; - std::shared_ptr pIndex_; + std::shared_ptr pSerializer_; }; // MemVectors From 7f000e7979774c0be8869002880441b5615246c4 Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Tue, 30 Apr 2019 16:07:33 +0800 Subject: [PATCH 6/6] refactor(db): rename Former-commit-id: 60c982ed5e500d958182bbc0816731d9423b509a --- cpp/src/db/{Serializer.cpp => ExecutionEngine.cpp} | 4 ++-- cpp/src/db/{Serializer.h => ExecutionEngine.h} | 4 ++-- ...aissSerializer.cpp => FaissExecutionEngine.cpp} | 14 +++++++------- .../{FaissSerializer.h => FaissExecutionEngine.h} | 6 +++--- cpp/src/db/MemManager.cpp | 14 +++++++------- cpp/src/db/MemManager.h | 4 ++-- 6 files changed, 23 insertions(+), 23 deletions(-) rename cpp/src/db/{Serializer.cpp => ExecutionEngine.cpp} (77%) rename cpp/src/db/{Serializer.h => ExecutionEngine.h} (90%) rename cpp/src/db/{FaissSerializer.cpp => FaissExecutionEngine.cpp} (66%) rename cpp/src/db/{FaissSerializer.h => FaissExecutionEngine.h} (78%) diff --git a/cpp/src/db/Serializer.cpp b/cpp/src/db/ExecutionEngine.cpp similarity index 77% rename from cpp/src/db/Serializer.cpp rename to cpp/src/db/ExecutionEngine.cpp index 595cd4731c..9aea3f7ea1 100644 --- a/cpp/src/db/Serializer.cpp +++ b/cpp/src/db/ExecutionEngine.cpp @@ -1,11 +1,11 @@ #include -#include "Serializer.h" +#include "ExecutionEngine.h" namespace zilliz { namespace vecwise { namespace engine { -Status Serializer::AddWithIds(const std::vector& vectors, const std::vector& vector_ids) { +Status ExecutionEngine::AddWithIds(const std::vector& vectors, const std::vector& vector_ids) { long n1 = (long)vectors.size(); long n2 = (long)vector_ids.size(); if (n1 != n2) { diff --git a/cpp/src/db/Serializer.h b/cpp/src/db/ExecutionEngine.h similarity index 90% rename from cpp/src/db/Serializer.h rename to cpp/src/db/ExecutionEngine.h index dcb32e0b05..4b08149f45 100644 --- a/cpp/src/db/Serializer.h +++ b/cpp/src/db/ExecutionEngine.h @@ -8,7 +8,7 @@ namespace zilliz { namespace vecwise { namespace engine { -class Serializer { +class ExecutionEngine { public: Status AddWithIds(const std::vector& vectors, @@ -24,7 +24,7 @@ public: virtual Status Cache() = 0; - virtual ~Serializer() {} + virtual ~ExecutionEngine() {} }; diff --git a/cpp/src/db/FaissSerializer.cpp b/cpp/src/db/FaissExecutionEngine.cpp similarity index 66% rename from cpp/src/db/FaissSerializer.cpp rename to cpp/src/db/FaissExecutionEngine.cpp index b2abb3a19c..ffa3ef3c24 100644 --- a/cpp/src/db/FaissSerializer.cpp +++ b/cpp/src/db/FaissExecutionEngine.cpp @@ -3,7 +3,7 @@ #include #include -#include "FaissSerializer.h" +#include "FaissExecutionEngine.h" namespace zilliz { namespace vecwise { @@ -11,30 +11,30 @@ namespace engine { const std::string IndexType = "IDMap,Flat"; -FaissSerializer::FaissSerializer(uint16_t dimension, const std::string& location) +FaissExecutionEngine::FaissExecutionEngine(uint16_t dimension, const std::string& location) : pIndex_(faiss::index_factory(dimension, IndexType.c_str())), location_(location) { } -Status FaissSerializer::AddWithIds(long n, const float *xdata, const long *xids) { +Status FaissExecutionEngine::AddWithIds(long n, const float *xdata, const long *xids) { pIndex_->add_with_ids(n, xdata, xids); return Status::OK(); } -size_t FaissSerializer::Count() const { +size_t FaissExecutionEngine::Count() const { return (size_t)(pIndex_->ntotal); } -size_t FaissSerializer::Size() const { +size_t FaissExecutionEngine::Size() const { return (size_t)(Count() * pIndex_->d); } -Status FaissSerializer::Serialize() { +Status FaissExecutionEngine::Serialize() { write_index(pIndex_.get(), location_.c_str()); return Status::OK(); } -Status FaissSerializer::Cache() { +Status FaissExecutionEngine::Cache() { zilliz::vecwise::cache::CpuCacheMgr::GetInstance( )->InsertItem(location_, std::make_shared(pIndex_)); diff --git a/cpp/src/db/FaissSerializer.h b/cpp/src/db/FaissExecutionEngine.h similarity index 78% rename from cpp/src/db/FaissSerializer.h rename to cpp/src/db/FaissExecutionEngine.h index d3a255a0bb..bb1f5a4770 100644 --- a/cpp/src/db/FaissSerializer.h +++ b/cpp/src/db/FaissExecutionEngine.h @@ -3,7 +3,7 @@ #include #include -#include "Serializer.h" +#include "ExecutionEngine.h" namespace faiss { class Index; @@ -13,9 +13,9 @@ namespace zilliz { namespace vecwise { namespace engine { -class FaissSerializer : public Serializer { +class FaissExecutionEngine : public ExecutionEngine { public: - FaissSerializer(uint16_t dimension, const std::string& location); + FaissExecutionEngine(uint16_t dimension, const std::string& location); virtual Status AddWithIds(long n, const float *xdata, const long *xids) override; virtual size_t Count() const override; diff --git a/cpp/src/db/MemManager.cpp b/cpp/src/db/MemManager.cpp index 88f1df7122..10ec582935 100644 --- a/cpp/src/db/MemManager.cpp +++ b/cpp/src/db/MemManager.cpp @@ -6,7 +6,7 @@ #include "MemManager.h" #include "Meta.h" -#include "FaissSerializer.h" +#include "FaissExecutionEngine.h" namespace zilliz { @@ -19,36 +19,36 @@ MemVectors::MemVectors(const std::shared_ptr& meta_ptr, options_(options), schema_(schema), _pIdGenerator(new SimpleIDGenerator()), - pSerializer_(new FaissSerializer(schema_.dimension, schema_.location)) { + pEE_(new FaissExecutionEngine(schema_.dimension, schema_.location)) { } void MemVectors::add(size_t n_, const float* vectors_, IDNumbers& vector_ids_) { _pIdGenerator->getNextIDNumbers(n_, vector_ids_); - pSerializer_->AddWithIds(n_, vectors_, vector_ids_.data()); + pEE_->AddWithIds(n_, vectors_, vector_ids_.data()); for(auto i=0 ; iCount(); + return pEE_->Count(); } size_t MemVectors::approximate_size() const { - return pSerializer_->Size(); + return pEE_->Size(); } Status MemVectors::serialize(std::string& group_id) { group_id = schema_.group_id; auto rows = approximate_size(); - pSerializer_->Serialize(); + pEE_->Serialize(); schema_.rows = rows; schema_.file_type = (rows >= options_.index_trigger_size) ? meta::GroupFileSchema::TO_INDEX : meta::GroupFileSchema::RAW; auto status = pMeta_->update_group_file(schema_); - pSerializer_->Cache(); + pEE_->Cache(); return status; } diff --git a/cpp/src/db/MemManager.h b/cpp/src/db/MemManager.h index e29b09817c..0a298bf22d 100644 --- a/cpp/src/db/MemManager.h +++ b/cpp/src/db/MemManager.h @@ -19,7 +19,7 @@ namespace meta { class Meta; } -class Serializer; +class ExecutionEngine; class MemVectors { public: @@ -47,7 +47,7 @@ private: Options options_; meta::GroupFileSchema schema_; IDGenerator* _pIdGenerator; - std::shared_ptr pSerializer_; + std::shared_ptr pEE_; }; // MemVectors