From 4b8a72f9aee179733faf82fdfc73bc12564a1090 Mon Sep 17 00:00:00 2001 From: fishpenguin Date: Fri, 29 Nov 2019 11:38:27 +0800 Subject: [PATCH 1/9] NSG build failed using GPU-edition if set gpu_enable false --- CHANGELOG.md | 1 + .../index/knowhere/knowhere/index/vector_index/IndexNSG.cpp | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba7561c333..12e7e01cbe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ Please mark all change in change log and use the ticket from JIRA. - \#533 - NSG build failed with MetricType Inner Product - \#543 - client raise exception in shards when search results is empty - \#545 - Avoid dead circle of build index thread when error occurs +- \#547 - NSG build failed using GPU-edition if set gpu_enable false - \#552 - Server down during building index_type: IVF_PQ using GPU-edition - \#561 - Milvus server should report exception/error message or terminate on mysql metadata backend error - \#599 - Build index log is incorrect diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp index 16c0b9172f..9f00c82fd4 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp @@ -117,7 +117,13 @@ NSG::Train(const DatasetPtr& dataset, const Config& config) { // TODO(linxj): dev IndexFactory, support more IndexType #ifdef MILVUS_GPU_VERSION + auto temp_resource = FaissGpuResourceMgr::GetInstance().GetRes(build_cfg->gpu_id); +#if temp_resource == nullptr + auto preprocess_index = std::make_shared(); +#else auto preprocess_index = std::make_shared(build_cfg->gpu_id); +#endif + #else auto preprocess_index = std::make_shared(); #endif From fd304cf4b43344dfe3d0a161b3e79915d81ad2fb Mon Sep 17 00:00:00 2001 From: fishpenguin Date: Fri, 29 Nov 2019 14:52:08 +0800 Subject: [PATCH 2/9] remove #if --- .../knowhere/index/vector_index/IndexNSG.cpp | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp index 9f00c82fd4..71660551c1 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp @@ -116,23 +116,29 @@ NSG::Train(const DatasetPtr& dataset, const Config& config) { } // TODO(linxj): dev IndexFactory, support more IndexType + bool use_gpu = false; #ifdef MILVUS_GPU_VERSION + use_gpu = true; auto temp_resource = FaissGpuResourceMgr::GetInstance().GetRes(build_cfg->gpu_id); -#if temp_resource == nullptr - auto preprocess_index = std::make_shared(); -#else - auto preprocess_index = std::make_shared(build_cfg->gpu_id); + if (temp_resource == nullptr) + use_gpu = false; #endif - -#else - auto preprocess_index = std::make_shared(); -#endif - auto model = preprocess_index->Train(dataset, config); - preprocess_index->set_index_model(model); - preprocess_index->AddWithoutIds(dataset, config); - Graph knng; - preprocess_index->GenGraph(build_cfg->knng, knng, dataset, config); + if (use_gpu) { + auto preprocess_index = std::make_shared(build_cfg->gpu_id); + auto model = preprocess_index->Train(dataset, config); + preprocess_index->set_index_model(model); + preprocess_index->AddWithoutIds(dataset, config); + + preprocess_index->GenGraph(build_cfg->knng, knng, dataset, config); + } else { + auto preprocess_index = std::make_shared(); + auto model = preprocess_index->Train(dataset, config); + preprocess_index->set_index_model(model); + preprocess_index->AddWithoutIds(dataset, config); + + preprocess_index->GenGraph(build_cfg->knng, knng, dataset, config); + } algo::BuildParams b_params; b_params.candidate_pool_size = build_cfg->candidate_pool_size; From 6fcd2a13da5374ab48d43cf579be8519c40b0c40 Mon Sep 17 00:00:00 2001 From: fishpenguin Date: Fri, 29 Nov 2019 15:57:42 +0800 Subject: [PATCH 3/9] fix CPU version bug --- .../knowhere/index/vector_index/IndexNSG.cpp | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp index 71660551c1..8cd98a74d7 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp @@ -116,29 +116,29 @@ NSG::Train(const DatasetPtr& dataset, const Config& config) { } // TODO(linxj): dev IndexFactory, support more IndexType - bool use_gpu = false; -#ifdef MILVUS_GPU_VERSION - use_gpu = true; - auto temp_resource = FaissGpuResourceMgr::GetInstance().GetRes(build_cfg->gpu_id); - if (temp_resource == nullptr) - use_gpu = false; -#endif Graph knng; - if (use_gpu) { - auto preprocess_index = std::make_shared(build_cfg->gpu_id); - auto model = preprocess_index->Train(dataset, config); - preprocess_index->set_index_model(model); - preprocess_index->AddWithoutIds(dataset, config); - - preprocess_index->GenGraph(build_cfg->knng, knng, dataset, config); - } else { +#ifdef MILVUS_GPU_VERSION + auto temp_resource = FaissGpuResourceMgr::GetInstance().GetRes(build_cfg->gpu_id); + if (temp_resource == nullptr) { auto preprocess_index = std::make_shared(); auto model = preprocess_index->Train(dataset, config); preprocess_index->set_index_model(model); preprocess_index->AddWithoutIds(dataset, config); - + preprocess_index->GenGraph(build_cfg->knng, knng, dataset, config); + } else { + auto preprocess_index = std::make_shared(build_cfg->gpu_id); + auto model = preprocess_index->Train(dataset, config); + preprocess_index->set_index_model(model); + preprocess_index->AddWithoutIds(dataset, config); preprocess_index->GenGraph(build_cfg->knng, knng, dataset, config); } +#else + auto preprocess_index = std::make_shared(); + auto model = preprocess_index->Train(dataset, config); + preprocess_index->set_index_model(model); + preprocess_index->AddWithoutIds(dataset, config); + preprocess_index->GenGraph(build_cfg->knng, knng, dataset, config); +#endif algo::BuildParams b_params; b_params.candidate_pool_size = build_cfg->candidate_pool_size; From fdfb3979181580e3c4a82a52ed7f65f1b7c14c9c Mon Sep 17 00:00:00 2001 From: fishpenguin Date: Fri, 29 Nov 2019 20:17:37 +0800 Subject: [PATCH 4/9] fix test_nsg failed bug --- core/src/db/engine/ExecutionEngineImpl.cpp | 3 +++ .../index/knowhere/knowhere/index/vector_index/IndexNSG.cpp | 3 +-- core/src/wrapper/ConfAdapter.cpp | 2 -- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/db/engine/ExecutionEngineImpl.cpp b/core/src/db/engine/ExecutionEngineImpl.cpp index 5a3d6e5e2a..ba8a4c34f9 100644 --- a/core/src/db/engine/ExecutionEngineImpl.cpp +++ b/core/src/db/engine/ExecutionEngineImpl.cpp @@ -611,6 +611,9 @@ ExecutionEngineImpl::Init() { server::Config& config = server::Config::GetInstance(); std::vector gpu_ids; Status s = config.GetGpuResourceConfigBuildIndexResources(gpu_ids); + if (!s.ok()) { + gpu_num_ = knowhere::INVALID_VALUE; + } for (auto id : gpu_ids) { if (gpu_num_ == id) { return Status::OK(); diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp index 8cd98a74d7..370df76b9b 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp @@ -118,8 +118,7 @@ NSG::Train(const DatasetPtr& dataset, const Config& config) { // TODO(linxj): dev IndexFactory, support more IndexType Graph knng; #ifdef MILVUS_GPU_VERSION - auto temp_resource = FaissGpuResourceMgr::GetInstance().GetRes(build_cfg->gpu_id); - if (temp_resource == nullptr) { + if (build_cfg->gpu_id == knowhere::INVALID_VALUE) { auto preprocess_index = std::make_shared(); auto model = preprocess_index->Train(dataset, config); preprocess_index->set_index_model(model); diff --git a/core/src/wrapper/ConfAdapter.cpp b/core/src/wrapper/ConfAdapter.cpp index 7644e77ef5..9ee2f060b1 100644 --- a/core/src/wrapper/ConfAdapter.cpp +++ b/core/src/wrapper/ConfAdapter.cpp @@ -39,8 +39,6 @@ void ConfAdapter::MatchBase(knowhere::Config conf) { if (conf->metric_type == knowhere::DEFAULT_TYPE) conf->metric_type = knowhere::METRICTYPE::L2; - if (conf->gpu_id == knowhere::INVALID_VALUE) - conf->gpu_id = 0; } knowhere::Config From f45df1a2419903e391b8d37f0dfc8a2a99da42cd Mon Sep 17 00:00:00 2001 From: fishpenguin Date: Sat, 30 Nov 2019 09:28:24 +0800 Subject: [PATCH 5/9] Remove src/grpc/README.md --- core/src/grpc/README.md | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 core/src/grpc/README.md diff --git a/core/src/grpc/README.md b/core/src/grpc/README.md deleted file mode 100644 index 6a3fe1157c..0000000000 --- a/core/src/grpc/README.md +++ /dev/null @@ -1,6 +0,0 @@ -We manually change two APIs in "milvus.pb.h": - add_vector_data() - add_row_id_array() - add_ids() - add_distances() -If proto files need be generated again, remember to re-change above APIs. \ No newline at end of file From dcbe1a0c3f0ed89d0f7439eceb4c2afcf86706d0 Mon Sep 17 00:00:00 2001 From: "yudong.cai" Date: Sat, 30 Nov 2019 11:26:40 +0800 Subject: [PATCH 6/9] #579 when gpu resources disabled, use cpu index instead --- CHANGELOG.md | 1 + core/src/db/engine/ExecutionEngineImpl.cpp | 24 ++++++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e83f996780..90a1358592 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ Please mark all change in change log and use the ticket from JIRA. - \#545 - Avoid dead circle of build index thread when error occurs - \#552 - Server down during building index_type: IVF_PQ using GPU-edition - \#561 - Milvus server should report exception/error message or terminate on mysql metadata backend error +- \#579 - Build index hang in GPU version when gpu_resources disabled - \#599 - Build index log is incorrect - \#602 - Optimizer specify wrong gpu_id - \#606 - No log generated during building index with CPU diff --git a/core/src/db/engine/ExecutionEngineImpl.cpp b/core/src/db/engine/ExecutionEngineImpl.cpp index 9c6a2a0f33..9f2b48642d 100644 --- a/core/src/db/engine/ExecutionEngineImpl.cpp +++ b/core/src/db/engine/ExecutionEngineImpl.cpp @@ -86,6 +86,9 @@ ExecutionEngineImpl::ExecutionEngineImpl(VecIndexPtr index, const std::string& l VecIndexPtr ExecutionEngineImpl::CreatetVecIndex(EngineType type) { + server::Config& config = server::Config::GetInstance(); + bool gpu_resource_enable = true; + config.GetGpuResourceConfigEnable(gpu_resource_enable); std::shared_ptr index; switch (type) { case EngineType::FAISS_IDMAP: { @@ -94,18 +97,20 @@ ExecutionEngineImpl::CreatetVecIndex(EngineType type) { } case EngineType::FAISS_IVFFLAT: { #ifdef MILVUS_GPU_VERSION - index = GetVecIndexFactory(IndexType::FAISS_IVFFLAT_MIX); -#else - index = GetVecIndexFactory(IndexType::FAISS_IVFFLAT_CPU); + if (gpu_resource_enable) + index = GetVecIndexFactory(IndexType::FAISS_IVFFLAT_MIX); + else #endif + index = GetVecIndexFactory(IndexType::FAISS_IVFFLAT_CPU); break; } case EngineType::FAISS_IVFSQ8: { #ifdef MILVUS_GPU_VERSION - index = GetVecIndexFactory(IndexType::FAISS_IVFSQ8_MIX); -#else - index = GetVecIndexFactory(IndexType::FAISS_IVFSQ8_CPU); + if (gpu_resource_enable) + index = GetVecIndexFactory(IndexType::FAISS_IVFSQ8_MIX); + else #endif + index = GetVecIndexFactory(IndexType::FAISS_IVFSQ8_CPU); break; } case EngineType::NSG_MIX: { @@ -120,10 +125,11 @@ ExecutionEngineImpl::CreatetVecIndex(EngineType type) { #endif case EngineType::FAISS_PQ: { #ifdef MILVUS_GPU_VERSION - index = GetVecIndexFactory(IndexType::FAISS_IVFPQ_MIX); -#else - index = GetVecIndexFactory(IndexType::FAISS_IVFPQ_CPU); + if (gpu_resource_enable) + index = GetVecIndexFactory(IndexType::FAISS_IVFPQ_MIX); + else #endif + index = GetVecIndexFactory(IndexType::FAISS_IVFPQ_CPU); break; } case EngineType::SPTAG_KDT: { From 525b6df0b6d67ded1384efe668687076cff0a4b6 Mon Sep 17 00:00:00 2001 From: "yudong.cai" Date: Sat, 30 Nov 2019 12:00:47 +0800 Subject: [PATCH 7/9] #579 update index build error message --- core/src/db/DBImpl.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/core/src/db/DBImpl.cpp b/core/src/db/DBImpl.cpp index 67769717c4..e2099739ed 100644 --- a/core/src/db/DBImpl.cpp +++ b/core/src/db/DBImpl.cpp @@ -1033,11 +1033,7 @@ DBImpl::BuildTableIndexRecursively(const std::string& table_id, const TableIndex if (!failed_files.empty()) { std::string msg = "Failed to build index for " + std::to_string(failed_files.size()) + ((failed_files.size() == 1) ? " file" : " files"); -#ifdef MILVUS_GPU_VERSION - msg += ", file size is too large or gpu memory is not enough."; -#else msg += ", please double check index parameters."; -#endif return Status(DB_ERROR, msg); } From de440acaa02ea1366be99ae46babb810b4f6dfb9 Mon Sep 17 00:00:00 2001 From: "yudong.cai" Date: Sat, 30 Nov 2019 14:03:46 +0800 Subject: [PATCH 8/9] #579 fix CPU version build error --- core/src/db/engine/ExecutionEngineImpl.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/db/engine/ExecutionEngineImpl.cpp b/core/src/db/engine/ExecutionEngineImpl.cpp index 9f2b48642d..1189d35b94 100644 --- a/core/src/db/engine/ExecutionEngineImpl.cpp +++ b/core/src/db/engine/ExecutionEngineImpl.cpp @@ -86,9 +86,11 @@ ExecutionEngineImpl::ExecutionEngineImpl(VecIndexPtr index, const std::string& l VecIndexPtr ExecutionEngineImpl::CreatetVecIndex(EngineType type) { +#ifdef MILVUS_GPU_VERSION server::Config& config = server::Config::GetInstance(); bool gpu_resource_enable = true; config.GetGpuResourceConfigEnable(gpu_resource_enable); +#endif std::shared_ptr index; switch (type) { case EngineType::FAISS_IDMAP: { From 10d50d2fb768ae731de6511018aac60a9fd4083d Mon Sep 17 00:00:00 2001 From: fishpenguin Date: Sat, 30 Nov 2019 14:57:35 +0800 Subject: [PATCH 9/9] IVF_PQ search on CPUs when using GPU-version --- core/src/scheduler/SchedInst.h | 4 + .../scheduler/optimizer/FaissIVFPQPass.cpp | 74 +++++++++++++++++++ core/src/scheduler/optimizer/FaissIVFPQPass.h | 58 +++++++++++++++ 3 files changed, 136 insertions(+) create mode 100644 core/src/scheduler/optimizer/FaissIVFPQPass.cpp create mode 100644 core/src/scheduler/optimizer/FaissIVFPQPass.h diff --git a/core/src/scheduler/SchedInst.h b/core/src/scheduler/SchedInst.h index 1e8a7acf2e..6cca377033 100644 --- a/core/src/scheduler/SchedInst.h +++ b/core/src/scheduler/SchedInst.h @@ -25,6 +25,7 @@ #include "optimizer/BuildIndexPass.h" #include "optimizer/FaissFlatPass.h" #include "optimizer/FaissIVFFlatPass.h" +#include "optimizer/FaissIVFPQPass.h" #include "optimizer/FaissIVFSQ8HPass.h" #include "optimizer/FaissIVFSQ8Pass.h" #include "optimizer/FallbackPass.h" @@ -129,7 +130,10 @@ class OptimizerInst { pass_list.push_back(std::make_shared()); pass_list.push_back(std::make_shared()); pass_list.push_back(std::make_shared()); +#ifdef CUSTOMIZATION pass_list.push_back(std::make_shared()); +#endif + pass_list.push_back(std::make_shared()); } #endif pass_list.push_back(std::make_shared()); diff --git a/core/src/scheduler/optimizer/FaissIVFPQPass.cpp b/core/src/scheduler/optimizer/FaissIVFPQPass.cpp new file mode 100644 index 0000000000..f97fec63b4 --- /dev/null +++ b/core/src/scheduler/optimizer/FaissIVFPQPass.cpp @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifdef MILVUS_GPU_VERSION +#include "scheduler/optimizer/FaissIVFPQPass.h" +#include "cache/GpuCacheMgr.h" +#include "scheduler/SchedInst.h" +#include "scheduler/Utils.h" +#include "scheduler/task/SearchTask.h" +#include "scheduler/tasklabel/SpecResLabel.h" +#include "server/Config.h" +#include "utils/Log.h" + +namespace milvus { +namespace scheduler { + +void +FaissIVFPQPass::Init() { +#ifdef MILVUS_GPU_VERSION + server::Config& config = server::Config::GetInstance(); + Status s = config.GetEngineConfigGpuSearchThreshold(threshold_); + if (!s.ok()) { + threshold_ = std::numeric_limits::max(); + } + s = config.GetGpuResourceConfigSearchResources(gpus); + if (!s.ok()) { + throw; + } +#endif +} + +bool +FaissIVFPQPass::Run(const TaskPtr& task) { + if (task->Type() != TaskType::SearchTask) { + return false; + } + + auto search_task = std::static_pointer_cast(task); + if (search_task->file_->engine_type_ != (int)engine::EngineType::FAISS_PQ) { + return false; + } + + auto search_job = std::static_pointer_cast(search_task->job_.lock()); + ResourcePtr res_ptr; + if (search_job->nq() < threshold_) { + SERVER_LOG_DEBUG << "FaissIVFPQPass: nq < gpu_search_threshold, specify cpu to search!"; + res_ptr = ResMgrInst::GetInstance()->GetResource("cpu"); + } else { + auto best_device_id = count_ % gpus.size(); + SERVER_LOG_DEBUG << "FaissIVFPQPass: nq > gpu_search_threshold, specify gpu" << best_device_id << " to search!"; + count_++; + res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, gpus[best_device_id]); + } + auto label = std::make_shared(res_ptr); + task->label() = label; + return true; +} + +} // namespace scheduler +} // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissIVFPQPass.h b/core/src/scheduler/optimizer/FaissIVFPQPass.h new file mode 100644 index 0000000000..9225f84b7c --- /dev/null +++ b/core/src/scheduler/optimizer/FaissIVFPQPass.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifdef MILVUS_GPU_VERSION +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Pass.h" + +namespace milvus { +namespace scheduler { + +class FaissIVFPQPass : public Pass { + public: + FaissIVFPQPass() = default; + + public: + void + Init() override; + + bool + Run(const TaskPtr& task) override; + + private: + int64_t threshold_ = std::numeric_limits::max(); + int64_t count_ = 0; + std::vector gpus; +}; + +using FaissIVFPQPassPtr = std::shared_ptr; + +} // namespace scheduler +} // namespace milvus +#endif