diff --git a/core/src/db/engine/ExecutionEngineImpl.cpp b/core/src/db/engine/ExecutionEngineImpl.cpp
index 862c1026d2..9c411deba1 100644
--- a/core/src/db/engine/ExecutionEngineImpl.cpp
+++ b/core/src/db/engine/ExecutionEngineImpl.cpp
@@ -22,10 +22,7 @@
 #include "utils/CommonUtil.h"
 #include "utils/Exception.h"
 #include "utils/Log.h"
-
 #include "knowhere/common/Config.h"
-#include "knowhere/common/Exception.h"
-#include "knowhere/index/vector_index/IndexIVFSQHybrid.h"
 #include "scheduler/Utils.h"
 #include "server/Config.h"
 #include "wrapper/ConfAdapter.h"
@@ -249,6 +246,56 @@ ExecutionEngineImpl::Load(bool to_cache) {
 Status
 ExecutionEngineImpl::CopyToGpu(uint64_t device_id, bool hybrid) {
     if (hybrid) {
+        // The hybrid path chooses the GPU itself; the device_id parameter is not consulted in this branch.
+        const std::string key = location_ + ".quantizer";
+        std::vector<uint64_t> gpus = scheduler::get_gpu_pool();
+
+        const int64_t NOT_FOUND = -1;
+        int64_t quantizer_gpu = NOT_FOUND;  // distinct name so the device_id parameter is not shadowed
+
+        // Cache hit: a GPU cache already holds the quantizer stored under `key`; reuse it via LoadData.
+        {
+            knowhere::QuantizerPtr quantizer = nullptr;
+
+            for (auto& gpu : gpus) {
+                auto cache = cache::GpuCacheMgr::GetInstance(gpu);
+                if (auto cached_quantizer = cache->GetIndex(key)) {
+                    quantizer_gpu = gpu;
+                    quantizer = std::static_pointer_cast<CachedQuantizer>(cached_quantizer)->Data();
+                    break;  // first hit wins; the remaining caches need not be probed
+                }
+            }
+
+            if (quantizer_gpu != NOT_FOUND) {
+                auto config = std::make_shared<knowhere::QuantizerCfg>();
+                config->gpu_id = quantizer_gpu;
+                config->mode = 2;  // IVFSQHybrid::LoadData checks for mode 2
+                auto new_index = index_->LoadData(quantizer, config);
+                index_ = new_index;
+            }
+        }
+
+        if (quantizer_gpu == NOT_FOUND) {
+            // Cache miss: copy index plus quantizer to the GPU whose cache reports the most free space.
+            // NOTE(review): with an empty pool max_element returns end() and gpus[0] is invalid - confirm.
+            std::vector<int64_t> all_free_mem;
+            for (auto& gpu : gpus) {
+                auto cache = cache::GpuCacheMgr::GetInstance(gpu);
+                auto free_mem = cache->CacheCapacity() - cache->CacheUsage();
+                all_free_mem.push_back(free_mem);
+            }
+
+            auto max_e = std::max_element(all_free_mem.begin(), all_free_mem.end());
+            auto best_index = std::distance(all_free_mem.begin(), max_e);
+            quantizer_gpu = gpus[best_index];
+
+            auto pair = index_->CopyToGpuWithQuantizer(quantizer_gpu);
+            index_ = pair.first;
+
+            // Publish the quantizer in that GPU's cache so later calls take the hit path above.
+            auto cached_quantizer = std::make_shared<CachedQuantizer>(pair.second);
+            cache::GpuCacheMgr::GetInstance(quantizer_gpu)->InsertItem(key, cached_quantizer);
+        }
         return Status::OK();
     }
 
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/FaissBaseIndex.h b/core/src/index/knowhere/knowhere/index/vector_index/FaissBaseIndex.h
index f3fceebb88..359af97d90 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/FaissBaseIndex.h
+++ b/core/src/index/knowhere/knowhere/index/vector_index/FaissBaseIndex.h
@@ -38,7 +38,7 @@ class FaissBaseIndex {
     virtual void
     SealImpl();
 
- protected:
+ public:
     std::shared_ptr<faiss::Index> index_ = nullptr;
 };
 
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.cpp
index a5e8f90f34..65938e1630 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.cpp
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.cpp
@@ -15,12 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <faiss/gpu/GpuAutoTune.h>
-#include <faiss/gpu/GpuIndexFlat.h>
+#include <memory>
+
 #include <faiss/gpu/GpuIndexIVF.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
 #include <faiss/index_io.h>
-#include <memory>
+#include <faiss/gpu/GpuCloner.h>
 
 #include "knowhere/adapter/VectorAdapter.h"
 #include "knowhere/common/Exception.h"
@@ -130,13 +130,12 @@ void
 GPUIVF::search_impl(int64_t n, const float* data, int64_t k, float* distances, int64_t* labels, const Config& cfg) {
     std::lock_guard<std::mutex> lk(mutex_);
 
-    // TODO(linxj): gpu index support GenParams
     if (auto device_index = std::dynamic_pointer_cast<faiss::gpu::GpuIndexIVF>(index_)) {
         auto search_cfg = std::dynamic_pointer_cast<IVFCfg>(cfg);
-        device_index->setNumProbes(search_cfg->nprobe);
+        device_index->nprobe = search_cfg->nprobe;
+//        assert(device_index->getNumProbes() == search_cfg->nprobe);
 
         {
-            // TODO(linxj): allocate gpu mem
             ResScope rs(res_, gpu_id_);
             device_index->search(n, (float*)data, k, distances, labels);
         }
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFPQ.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFPQ.cpp
index 213141b3ac..9ba8dd0456 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFPQ.cpp
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFPQ.cpp
@@ -16,8 +16,10 @@
 // under the License.
 
 #include <faiss/IndexIVFPQ.h>
-#include <faiss/gpu/GpuAutoTune.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
+#include <faiss/index_factory.h>
+#include <faiss/gpu/GpuCloner.h>
+
 #include <memory>
 
 #include "knowhere/adapter/VectorAdapter.h"
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFSQ.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFSQ.cpp
index 5e1f5226f2..fff27cd7db 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFSQ.cpp
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFSQ.cpp
@@ -15,9 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <faiss/gpu/GpuAutoTune.h>
+#include <faiss/index_factory.h>
+#include <faiss/gpu/GpuCloner.h>
+
 #include <memory>
-#include <utility>
 
 #include "knowhere/adapter/VectorAdapter.h"
 #include "knowhere/common/Exception.h"
@@ -71,13 +72,4 @@ GPUIVFSQ::CopyGpuToCpu(const Config& config) {
     return std::make_shared<IVFSQ>(new_index);
 }
 
-void
-GPUIVFSQ::search_impl(int64_t n, const float* data, int64_t k, float* distances, int64_t* labels, const Config& cfg) {
-#ifdef CUSTOMIZATION
-    GPUIVF::search_impl(n, data, k, distances, labels, cfg);
-#else
-    IVF::search_impl(n, data, k, distances, labels, cfg);
-#endif
-}
-
 }  // namespace knowhere
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFSQ.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFSQ.h
index 7332bce691..ed8013d77f 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFSQ.h
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVFSQ.h
@@ -38,10 +38,6 @@ class GPUIVFSQ : public GPUIVF {
 
     VectorIndexPtr
     CopyGpuToCpu(const Config& config) override;
-
- protected:
-    void
-    search_impl(int64_t n, const float* data, int64_t k, float* distances, int64_t* labels, const Config& cfg) override;
 };
 
 }  // namespace knowhere
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.cpp
index 2371591b5c..643bb16076 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.cpp
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.cpp
@@ -15,11 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <faiss/AutoTune.h>
 #include <faiss/IndexFlat.h>
 #include <faiss/MetaIndexes.h>
-#include <faiss/gpu/GpuAutoTune.h>
 #include <faiss/index_io.h>
+#include <faiss/index_factory.h>
+#include <faiss/gpu/GpuCloner.h>
+
 #include <vector>
 
 #include "knowhere/adapter/VectorAdapter.h"
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.cpp
index 0c4856f2b6..02708ff5d7 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.cpp
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.cpp
@@ -15,15 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <faiss/AutoTune.h>
-#include <faiss/AuxIndexStructures.h>
+#include <faiss/gpu/GpuCloner.h>
 #include <faiss/IVFlib.h>
 #include <faiss/IndexFlat.h>
 #include <faiss/IndexIVF.h>
 #include <faiss/IndexIVFFlat.h>
-#include <faiss/IndexIVFPQ.h>
-#include <faiss/gpu/GpuAutoTune.h>
-#include <faiss/index_io.h>
 #include <memory>
 #include <utility>
 #include <vector>
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.h
index ef9982fa30..e064b6f08c 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.h
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.h
@@ -30,7 +30,7 @@ namespace knowhere {
 
 using Graph = std::vector<std::vector<int64_t>>;
 
-class IVF : public VectorIndex, protected FaissBaseIndex {
+class IVF : public VectorIndex, public FaissBaseIndex {
  public:
     IVF() : FaissBaseIndex(nullptr) {
     }
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.cpp
index 063dc63550..80b4c78883 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.cpp
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.cpp
@@ -15,7 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <faiss/gpu/GpuAutoTune.h>
+#include <faiss/index_factory.h>
+#include <faiss/gpu/GpuCloner.h>
 #include <memory>
 
 #include "knowhere/adapter/VectorAdapter.h"
@@ -56,14 +57,7 @@ IVFSQ::CopyCpuToGpu(const int64_t& device_id, const Config& config) {
     if (auto res = FaissGpuResourceMgr::GetInstance().GetRes(device_id)) {
         ResScope rs(res, device_id, false);
 
-#ifdef CUSTOMIZATION
-        faiss::gpu::GpuClonerOptions option;
-        option.allInGpu = true;
-
-        auto gpu_index = faiss::gpu::index_cpu_to_gpu(res->faiss_res.get(), device_id, index_.get(), &option);
-#else
         auto gpu_index = faiss::gpu::index_cpu_to_gpu(res->faiss_res.get(), device_id, index_.get());
-#endif
 
         std::shared_ptr<faiss::Index> device_index;
         device_index.reset(gpu_index);
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQHybrid.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQHybrid.cpp
index fe5bf0990a..af67722266 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQHybrid.cpp
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQHybrid.cpp
@@ -17,19 +17,25 @@
 // under the License.
 
 #include "knowhere/index/vector_index/IndexIVFSQHybrid.h"
-#include <utility>
-#include "faiss/AutoTune.h"
-#include "faiss/gpu/GpuAutoTune.h"
-#include "faiss/gpu/GpuIndexIVF.h"
 #include "knowhere/adapter/VectorAdapter.h"
 #include "knowhere/common/Exception.h"
 
+#include <utility>
+
+#include <faiss/gpu/GpuIndexIVF.h>
+#include <faiss/index_factory.h>
+#include <faiss/gpu/GpuCloner.h>
+
 namespace knowhere {
 
 #ifdef CUSTOMIZATION
 
+//std::mutex g_mutex;
+
 IndexModelPtr
 IVFSQHybrid::Train(const DatasetPtr& dataset, const Config& config) {
+//    std::lock_guard<std::mutex> lk(g_mutex);
+
     auto build_cfg = std::dynamic_pointer_cast<IVFSQCfg>(config);
     if (build_cfg != nullptr) {
         build_cfg->CheckValid();  // throw exception
@@ -63,23 +69,25 @@ IVFSQHybrid::Train(const DatasetPtr& dataset, const Config& config) {
 
 VectorIndexPtr
 IVFSQHybrid::CopyGpuToCpu(const Config& config) {
+    if (gpu_mode == 0) {  // 0 is the declared default and is set again by LoadImpl after its CPU load
+        return std::make_shared<IVFSQHybrid>(index_);  // wraps the current index_; index_gpu_to_cpu is skipped
+    }  // NOTE(review): gpu_mode is read before mutex_ is locked below - confirm that is intended
     std::lock_guard<std::mutex> lk(mutex_);
 
-    if (auto device_idx = std::dynamic_pointer_cast<faiss::IndexIVF>(index_)) {
         faiss::Index* device_index = index_.get();
         faiss::Index* host_index = faiss::gpu::index_gpu_to_cpu(device_index);
 
         std::shared_ptr<faiss::Index> new_index;
         new_index.reset(host_index);
         return std::make_shared<IVFSQHybrid>(new_index);
-    } else {
-        // TODO(linxj): why? jinhai
-        return std::make_shared<IVFSQHybrid>(index_);
-    }
 }
 
 VectorIndexPtr
 IVFSQHybrid::CopyCpuToGpu(const int64_t& device_id, const Config& config) {
+    if (gpu_mode != 0) {
+        KNOWHERE_THROW_MSG("Not a GpuIndex Type");
+    }
+
     if (auto res = FaissGpuResourceMgr::GetInstance().GetRes(device_id)) {
         ResScope rs(res, device_id, false);
         faiss::gpu::GpuClonerOptions option;
@@ -105,16 +113,26 @@ IVFSQHybrid::LoadImpl(const BinarySet& index_binary) {
     FaissBaseIndex::LoadImpl(index_binary);  // load on cpu
     auto* ivf_index = dynamic_cast<faiss::IndexIVF*>(index_.get());
     ivf_index->backup_quantizer();
+    gpu_mode = 0;
 }
 
 void
 IVFSQHybrid::search_impl(int64_t n, const float* data, int64_t k, float* distances, int64_t* labels,
                          const Config& cfg) {
+//    std::lock_guard<std::mutex> lk(g_mutex);
+//    static int64_t search_count;
+//    ++search_count;
+
     if (gpu_mode == 2) {
         GPUIVF::search_impl(n, data, k, distances, labels, cfg);
-    } else if (gpu_mode == 1) {
-        ResScope rs(res_, gpu_id_);
-        IVF::search_impl(n, data, k, distances, labels, cfg);
+//        index_->search(n, (float*)data, k, distances, labels);
+    } else if (gpu_mode == 1) { // hybrid
+        if (auto res = FaissGpuResourceMgr::GetInstance().GetRes(quantizer_gpu_id_)) {
+            ResScope rs(res, quantizer_gpu_id_, true);
+            IVF::search_impl(n, data, k, distances, labels, cfg);
+        } else {
+            KNOWHERE_THROW_MSG("Hybrid Search Error, can't get gpu: " + std::to_string(quantizer_gpu_id_) + "resource");
+        }
     } else if (gpu_mode == 0) {
         IVF::search_impl(n, data, k, distances, labels, cfg);
     }
@@ -122,16 +140,18 @@ IVFSQHybrid::search_impl(int64_t n, const float* data, int64_t k, float* distanc
 
 QuantizerPtr
 IVFSQHybrid::LoadQuantizer(const Config& conf) {
+//    std::lock_guard<std::mutex> lk(g_mutex);
+
     auto quantizer_conf = std::dynamic_pointer_cast<QuantizerCfg>(conf);
     if (quantizer_conf != nullptr) {
         if (quantizer_conf->mode != 1) {
             KNOWHERE_THROW_MSG("mode only support 1 in this func");
         }
     }
-    gpu_id_ = quantizer_conf->gpu_id;
+    auto gpu_id = quantizer_conf->gpu_id;
 
-    if (auto res = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_)) {
-        ResScope rs(res, gpu_id_, false);
+    if (auto res = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id)) {
+        ResScope rs(res, gpu_id, false);
         faiss::gpu::GpuClonerOptions option;
         option.allInGpu = true;
 
@@ -148,16 +168,19 @@ IVFSQHybrid::LoadQuantizer(const Config& conf) {
         auto& q_ptr = index_composition->quantizer;
         q->size = q_ptr->d * q_ptr->getNumVecs() * sizeof(float);
         q->quantizer = q_ptr;
+        q->gpu_id = gpu_id;
         res_ = res;
         gpu_mode = 1;
         return q;
     } else {
-        KNOWHERE_THROW_MSG("CopyCpuToGpu Error, can't get gpu: " + std::to_string(gpu_id_) + "resource");
+        KNOWHERE_THROW_MSG("CopyCpuToGpu Error, can't get gpu: " + std::to_string(gpu_id) + "resource");
     }
 }
 
 void
 IVFSQHybrid::SetQuantizer(const QuantizerPtr& q) {
+//    std::lock_guard<std::mutex> lk(g_mutex);
+
     auto ivf_quantizer = std::dynamic_pointer_cast<FaissIVFQuantizer>(q);
     if (ivf_quantizer == nullptr) {
         KNOWHERE_THROW_MSG("Quantizer type error");
@@ -170,20 +193,27 @@ IVFSQHybrid::SetQuantizer(const QuantizerPtr& q) {
         //        delete ivf_index->quantizer;
         ivf_index->quantizer = ivf_quantizer->quantizer;
     }
+    quantizer_gpu_id_ = ivf_quantizer->gpu_id;
+    gpu_mode = 1;
 }
 
 void
 IVFSQHybrid::UnsetQuantizer() {
+//    std::lock_guard<std::mutex> lk(g_mutex);
+
     auto* ivf_index = dynamic_cast<faiss::IndexIVF*>(index_.get());
     if (ivf_index == nullptr) {
         KNOWHERE_THROW_MSG("Index type error");
     }
 
     ivf_index->quantizer = nullptr;
+    quantizer_gpu_id_ = -1;
 }
 
 VectorIndexPtr
 IVFSQHybrid::LoadData(const knowhere::QuantizerPtr& q, const Config& conf) {
+//    std::lock_guard<std::mutex> lk(g_mutex);
+
     auto quantizer_conf = std::dynamic_pointer_cast<QuantizerCfg>(conf);
     if (quantizer_conf != nullptr) {
         if (quantizer_conf->mode != 2) {
@@ -192,13 +222,11 @@ IVFSQHybrid::LoadData(const knowhere::QuantizerPtr& q, const Config& conf) {
     } else {
         KNOWHERE_THROW_MSG("conf error");
     }
-    //    if (quantizer_conf->gpu_id != gpu_id_) {
-    //        KNOWHERE_THROW_MSG("quantizer and data must on the same gpu card");
-    //    }
-    gpu_id_ = quantizer_conf->gpu_id;
 
-    if (auto res = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_)) {
-        ResScope rs(res, gpu_id_, false);
+    auto gpu_id = quantizer_conf->gpu_id;
+
+    if (auto res = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id)) {
+        ResScope rs(res, gpu_id, false);
         faiss::gpu::GpuClonerOptions option;
         option.allInGpu = true;
 
@@ -211,18 +239,20 @@ IVFSQHybrid::LoadData(const knowhere::QuantizerPtr& q, const Config& conf) {
         index_composition->quantizer = ivf_quantizer->quantizer;
         index_composition->mode = quantizer_conf->mode;  // only 2
 
-        auto gpu_index = faiss::gpu::index_cpu_to_gpu(res->faiss_res.get(), gpu_id_, index_composition, &option);
+        auto gpu_index = faiss::gpu::index_cpu_to_gpu(res->faiss_res.get(), gpu_id, index_composition, &option);
         std::shared_ptr<faiss::Index> new_idx;
         new_idx.reset(gpu_index);
-        auto sq_idx = std::make_shared<IVFSQHybrid>(new_idx, gpu_id_, res);
+        auto sq_idx = std::make_shared<IVFSQHybrid>(new_idx, gpu_id, res);
         return sq_idx;
     } else {
-        KNOWHERE_THROW_MSG("CopyCpuToGpu Error, can't get gpu: " + std::to_string(gpu_id_) + "resource");
+        KNOWHERE_THROW_MSG("CopyCpuToGpu Error, can't get gpu: " + std::to_string(gpu_id) + "resource");
     }
 }
 
 std::pair<VectorIndexPtr, QuantizerPtr>
 IVFSQHybrid::CopyCpuToGpuWithQuantizer(const int64_t& device_id, const Config& config) {
+//    std::lock_guard<std::mutex> lk(g_mutex);
+
     if (auto res = FaissGpuResourceMgr::GetInstance().GetRes(device_id)) {
         ResScope rs(res, device_id, false);
         faiss::gpu::GpuClonerOptions option;
@@ -242,12 +272,29 @@ IVFSQHybrid::CopyCpuToGpuWithQuantizer(const int64_t& device_id, const Config& c
         auto q = std::make_shared<FaissIVFQuantizer>();
         q->quantizer = index_composition.quantizer;
         q->size = index_composition.quantizer->d * index_composition.quantizer->getNumVecs() * sizeof(float);
+        q->gpu_id = device_id;
         return std::make_pair(new_idx, q);
     } else {
         KNOWHERE_THROW_MSG("CopyCpuToGpu Error, can't get gpu: " + std::to_string(gpu_id_) + "resource");
     }
 }
 
+// Clones the given model onto gpu_id_ and sets gpu_mode 2 (search_impl then uses GPUIVF::search_impl).
+void
+IVFSQHybrid::set_index_model(IndexModelPtr model) {
+    std::lock_guard<std::mutex> lk(mutex_);
+    auto gpures = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_);
+    if (!gpures) {
+        KNOWHERE_THROW_MSG("load index model error, can't get gpu_resource");
+    }
+    auto trained = std::static_pointer_cast<IVFIndexModel>(model);
+    ResScope rs(gpures, gpu_id_, false);
+    auto cloned = faiss::gpu::index_cpu_to_gpu(gpures->faiss_res.get(), gpu_id_, trained->index_.get());
+    index_.reset(cloned);
+    res_ = gpures;
+    gpu_mode = 2;
+}
+
 FaissIVFQuantizer::~FaissIVFQuantizer() {
     if (quantizer != nullptr) {
         delete quantizer;
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQHybrid.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQHybrid.h
index f54c61c20f..87cc22931f 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQHybrid.h
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQHybrid.h
@@ -18,6 +18,8 @@
 #pragma once
 
 #include <faiss/index_io.h>
+#include <faiss/gpu/GpuIndexFlat.h>
+
 #include <memory>
 #include <utility>
 
@@ -29,6 +31,7 @@ namespace knowhere {
 #ifdef CUSTOMIZATION
 struct FaissIVFQuantizer : public Quantizer {
     faiss::gpu::GpuIndexFlat* quantizer = nullptr;
+    int64_t gpu_id;
 
     ~FaissIVFQuantizer() override;
 };
@@ -52,6 +55,9 @@ class IVFSQHybrid : public GPUIVFSQ {
     }
 
  public:
+    void
+    set_index_model(IndexModelPtr model) override;
+
     QuantizerPtr
     LoadQuantizer(const Config& conf);
 
@@ -85,6 +91,7 @@ class IVFSQHybrid : public GPUIVFSQ {
 
  protected:
     int64_t gpu_mode = 0;  // 0,1,2
+    int64_t quantizer_gpu_id_ = -1;
 };
 
 }  // namespace knowhere
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/VectorIndex.h b/core/src/index/knowhere/knowhere/index/vector_index/VectorIndex.h
index 810c4d2ea4..6509458b7b 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/VectorIndex.h
+++ b/core/src/index/knowhere/knowhere/index/vector_index/VectorIndex.h
@@ -48,6 +48,7 @@ class VectorIndex : public Index {
     virtual void
     Seal() = 0;
 
+    // TODO(linxj): Deprecated
     virtual VectorIndexPtr
     Clone() = 0;
 
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/helpers/FaissIO.h b/core/src/index/knowhere/knowhere/index/vector_index/helpers/FaissIO.h
index 7cce5bbbac..a7f8f349e1 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/helpers/FaissIO.h
+++ b/core/src/index/knowhere/knowhere/index/vector_index/helpers/FaissIO.h
@@ -17,7 +17,7 @@
 
 #pragma once
 
-#include <faiss/AuxIndexStructures.h>
+#include <faiss/impl/io.h>
 
 namespace knowhere {
 
diff --git a/core/src/index/unittest/Helper.h b/core/src/index/unittest/Helper.h
index d11a484c03..8d4bb0f4ae 100644
--- a/core/src/index/unittest/Helper.h
+++ b/core/src/index/unittest/Helper.h
@@ -26,7 +26,7 @@
 #include "knowhere/index/vector_index/IndexIVFSQ.h"
 #include "knowhere/index/vector_index/IndexIVFSQHybrid.h"
 
-constexpr int DEVICEID = 0;
+int DEVICEID = 0;  // NOTE(review): mutable global defined in a header; two TUs including it clash at link
 constexpr int64_t DIM = 128;
 constexpr int64_t NB = 10000;
 constexpr int64_t NQ = 10;
diff --git a/core/src/index/unittest/test_customized_index.cpp b/core/src/index/unittest/test_customized_index.cpp
index 1e0b1d932d..f9b48b8b67 100644
--- a/core/src/index/unittest/test_customized_index.cpp
+++ b/core/src/index/unittest/test_customized_index.cpp
@@ -16,17 +16,23 @@
 // under the License.
 
 #include <gtest/gtest.h>
+#include <thread>
 
 #include "unittest/Helper.h"
 #include "unittest/utils.h"
 
+#include "knowhere/common/Timer.h"
+
 class SingleIndexTest : public DataGen, public TestGpuIndexBase {
  protected:
     void
     SetUp() override {
         TestGpuIndexBase::SetUp();
-        Generate(DIM, NB, NQ);
-        k = K;
+        nb = 1000000;
+        nq = 1000;
+        dim = DIM;
+        Generate(dim, nb, nq);
+        k = 1000;
     }
 
     void
@@ -119,4 +125,114 @@ TEST_F(SingleIndexTest, IVFSQHybrid) {
     }
 }
 
+//TEST_F(SingleIndexTest, thread_safe) {
+//    assert(!xb.empty());
+//
+//    index_type = "IVFSQHybrid";
+//    index_ = IndexFactory(index_type);
+//    auto base = ParamGenerator::GetInstance().Gen(ParameterType::ivfsq);
+//    auto conf = std::dynamic_pointer_cast<knowhere::IVFSQCfg>(base);
+//    conf->nlist = 16384;
+//    conf->k = k;
+//    conf->nprobe = 10;
+//    conf->d = dim;
+//    auto preprocessor = index_->BuildPreprocessor(base_dataset, conf);
+//    index_->set_preprocessor(preprocessor);
+//
+//    auto model = index_->Train(base_dataset, conf);
+//    index_->set_index_model(model);
+//    index_->Add(base_dataset, conf);
+//    EXPECT_EQ(index_->Count(), nb);
+//    EXPECT_EQ(index_->Dimension(), dim);
+//
+//    auto binaryset = index_->Serialize();
+//
+//
+//
+//    auto cpu_idx = std::make_shared<knowhere::IVFSQHybrid>(DEVICEID);
+//    cpu_idx->Load(binaryset);
+//    auto pair = cpu_idx->CopyCpuToGpuWithQuantizer(DEVICEID, conf);
+//    auto quantizer = pair.second;
+//
+//    auto quantizer_conf = std::make_shared<knowhere::QuantizerCfg>();
+//    quantizer_conf->mode = 2;  // only copy data
+//    quantizer_conf->gpu_id = DEVICEID;
+//
+//    auto CopyAllToGpu = [&](int64_t search_count, bool do_search = false) {
+//        for (int i = 0; i < search_count; ++i) {
+//            auto gpu_idx = cpu_idx->CopyCpuToGpu(DEVICEID, conf);
+//            if (do_search) {
+//                auto result = gpu_idx->Search(query_dataset, conf);
+//                AssertAnns(result, nq, conf->k);
+//            }
+//        }
+//    };
+//
+//    auto hybrid_qt_idx = std::make_shared<knowhere::IVFSQHybrid>(DEVICEID);
+//    hybrid_qt_idx->Load(binaryset);
+//    auto SetQuantizerDoSearch = [&](int64_t search_count) {
+//        for (int i = 0; i < search_count; ++i) {
+//            hybrid_qt_idx->SetQuantizer(quantizer);
+//            auto result = hybrid_qt_idx->Search(query_dataset, conf);
+//            AssertAnns(result, nq, conf->k);
+//            //            PrintResult(result, nq, k);
+//            hybrid_qt_idx->UnsetQuantizer();
+//        }
+//    };
+//
+//    auto hybrid_data_idx = std::make_shared<knowhere::IVFSQHybrid>(DEVICEID);
+//    hybrid_data_idx->Load(binaryset);
+//    auto LoadDataDoSearch = [&](int64_t search_count, bool do_search = false) {
+//        for (int i = 0; i < search_count; ++i) {
+//            auto hybrid_idx = hybrid_data_idx->LoadData(quantizer, quantizer_conf);
+//            if (do_search) {
+//                auto result = hybrid_idx->Search(query_dataset, conf);
+////                AssertAnns(result, nq, conf->k);
+//            }
+//        }
+//    };
+//
+//    knowhere::TimeRecorder tc("");
+//    CopyAllToGpu(2000/2, false);
+//    tc.RecordSection("CopyAllToGpu without search");
+//    CopyAllToGpu(400/2, true);
+//    tc.RecordSection("CopyAllToGpu with search");
+//    SetQuantizerDoSearch(6);
+//    tc.RecordSection("SetQuantizer with search");
+//    LoadDataDoSearch(2000/2, false);
+//    tc.RecordSection("LoadData without search");
+//    LoadDataDoSearch(400/2, true);
+//    tc.RecordSection("LoadData with search");
+//
+//    {
+//        std::thread t1(CopyAllToGpu, 2000, false);
+//        std::thread t2(CopyAllToGpu, 400, true);
+//        t1.join();
+//        t2.join();
+//    }
+//
+//    {
+//        std::thread t1(SetQuantizerDoSearch, 12);
+//        std::thread t2(CopyAllToGpu, 400, true);
+//        t1.join();
+//        t2.join();
+//    }
+//
+//    {
+//        std::thread t1(SetQuantizerDoSearch, 12);
+//        std::thread t2(LoadDataDoSearch, 400, true);
+//        t1.join();
+//        t2.join();
+//    }
+//
+//    {
+//        std::thread t1(LoadDataDoSearch, 2000, false);
+//        std::thread t2(LoadDataDoSearch, 400, true);
+//        t1.join();
+//        t2.join();
+//    }
+//
+//}
+
+
 #endif
diff --git a/core/src/index/unittest/test_ivf.cpp b/core/src/index/unittest/test_ivf.cpp
index fae27b0dd3..3fd3e16d0e 100644
--- a/core/src/index/unittest/test_ivf.cpp
+++ b/core/src/index/unittest/test_ivf.cpp
@@ -20,19 +20,12 @@
 #include <iostream>
 #include <thread>
 
-#include <faiss/AutoTune.h>
-#include <faiss/gpu/GpuAutoTune.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
 
 #include "knowhere/common/Exception.h"
 #include "knowhere/common/Timer.h"
 #include "knowhere/index/vector_index/IndexGPUIVF.h"
-#include "knowhere/index/vector_index/IndexGPUIVFPQ.h"
-#include "knowhere/index/vector_index/IndexGPUIVFSQ.h"
 #include "knowhere/index/vector_index/IndexIVF.h"
-#include "knowhere/index/vector_index/IndexIVFPQ.h"
-#include "knowhere/index/vector_index/IndexIVFSQ.h"
-#include "knowhere/index/vector_index/IndexIVFSQHybrid.h"
 #include "knowhere/index/vector_index/helpers/Cloner.h"
 
 #include "unittest/Helper.h"
@@ -51,6 +44,9 @@ class IVFTest : public DataGen, public TestWithParam<::std::tuple<std::string, P
         ParameterType parameter_type;
         std::tie(index_type, parameter_type) = GetParam();
         // Init_with_default();
+//        nb = 1000000;
+//        nq = 1000;
+//        k = 1000;
         Generate(DIM, NB, NQ);
         index_ = IndexFactory(index_type);
         conf = ParamGenerator::GetInstance().Gen(parameter_type);
@@ -61,16 +57,6 @@ class IVFTest : public DataGen, public TestWithParam<::std::tuple<std::string, P
         knowhere::FaissGpuResourceMgr::GetInstance().Free();
     }
 
-    knowhere::VectorIndexPtr
-    ChooseTodo() {
-        std::vector<std::string> gpu_idx{"GPUIVFSQ"};
-        auto finder = std::find(gpu_idx.cbegin(), gpu_idx.cend(), index_type);
-        if (finder != gpu_idx.cend()) {
-            return knowhere::cloner::CopyCpuToGpu(index_, DEVICEID, knowhere::Config());
-        }
-        return index_;
-    }
-
  protected:
     std::string index_type;
     knowhere::Config conf;
@@ -100,8 +86,7 @@ TEST_P(IVFTest, ivf_basic) {
     EXPECT_EQ(index_->Count(), nb);
     EXPECT_EQ(index_->Dimension(), dim);
 
-    auto new_idx = ChooseTodo();
-    auto result = new_idx->Search(query_dataset, conf);
+    auto result = index_->Search(query_dataset, conf);
     AssertAnns(result, nq, conf->k);
     // PrintResult(result, nq, k);
 }
@@ -134,8 +119,7 @@ TEST_P(IVFTest, ivf_serialize) {
 
         index_->set_index_model(model);
         index_->Add(base_dataset, conf);
-        auto new_idx = ChooseTodo();
-        auto result = new_idx->Search(query_dataset, conf);
+        auto result = index_->Search(query_dataset, conf);
         AssertAnns(result, nq, conf->k);
     }
 
@@ -159,8 +143,7 @@ TEST_P(IVFTest, ivf_serialize) {
         index_->Load(binaryset);
         EXPECT_EQ(index_->Count(), nb);
         EXPECT_EQ(index_->Dimension(), dim);
-        auto new_idx = ChooseTodo();
-        auto result = new_idx->Search(query_dataset, conf);
+        auto result = index_->Search(query_dataset, conf);
         AssertAnns(result, nq, conf->k);
     }
 }
@@ -176,8 +159,7 @@ TEST_P(IVFTest, clone_test) {
     index_->Add(base_dataset, conf);
     EXPECT_EQ(index_->Count(), nb);
     EXPECT_EQ(index_->Dimension(), dim);
-    auto new_idx = ChooseTodo();
-    auto result = new_idx->Search(query_dataset, conf);
+    auto result = index_->Search(query_dataset, conf);
     AssertAnns(result, nq, conf->k);
     // PrintResult(result, nq, k);
 
@@ -210,12 +192,6 @@ TEST_P(IVFTest, clone_test) {
     //        }
     //    }
 
-    {
-        if (index_type == "IVFSQHybrid") {
-            return;
-        }
-    }
-
     {
         // copy from gpu to cpu
         std::vector<std::string> support_idx_vec{"GPUIVF", "GPUIVFSQ", "IVFSQHybrid"};
@@ -277,8 +253,7 @@ TEST_P(IVFTest, gpu_seal_test) {
     index_->Add(base_dataset, conf);
     EXPECT_EQ(index_->Count(), nb);
     EXPECT_EQ(index_->Dimension(), dim);
-    auto new_idx = ChooseTodo();
-    auto result = new_idx->Search(query_dataset, conf);
+    auto result = index_->Search(query_dataset, conf);
     AssertAnns(result, nq, conf->k);
 
     auto cpu_idx = knowhere::cloner::CopyGpuToCpu(index_, knowhere::Config());
diff --git a/core/src/scheduler/SchedInst.h b/core/src/scheduler/SchedInst.h
index 60033731ae..b9153d3bc3 100644
--- a/core/src/scheduler/SchedInst.h
+++ b/core/src/scheduler/SchedInst.h
@@ -94,6 +94,7 @@ class OptimizerInst {
             std::lock_guard<std::mutex> lock(mutex_);
             if (instance == nullptr) {
                 std::vector<PassPtr> pass_list;
+                pass_list.push_back(std::make_shared<LargeSQ8HPass>());
                 pass_list.push_back(std::make_shared<HybridPass>());
                 instance = std::make_shared<Optimizer>(pass_list);
             }
diff --git a/core/src/scheduler/optimizer/LargeSQ8HPass.cpp b/core/src/scheduler/optimizer/LargeSQ8HPass.cpp
index 8368a90000..0d5a81a7b6 100644
--- a/core/src/scheduler/optimizer/LargeSQ8HPass.cpp
+++ b/core/src/scheduler/optimizer/LargeSQ8HPass.cpp
@@ -26,48 +26,48 @@
 namespace milvus {
 namespace scheduler {
 
-// bool
-// LargeSQ8HPass::Run(const TaskPtr& task) {
-//    if (task->Type() != TaskType::SearchTask) {
-//        return false;
-//    }
-//
-//    auto search_task = std::static_pointer_cast<XSearchTask>(task);
-//    if (search_task->file_->engine_type_ != (int)engine::EngineType::FAISS_IVFSQ8H) {
-//        return false;
-//    }
-//
-//    auto search_job = std::static_pointer_cast<SearchJob>(search_task->job_.lock());
-//
-//    // TODO: future, Index::IVFSQ8H, if nq < threshold set cpu, else set gpu
-//    if (search_job->nq() < 100) {
-//        return false;
-//    }
-//
-//    std::vector<uint64_t> gpus = scheduler::get_gpu_pool();
-//    std::vector<int64_t> all_free_mem;
-//    for (auto& gpu : gpus) {
-//        auto cache = cache::GpuCacheMgr::GetInstance(gpu);
-//        auto free_mem = cache->CacheCapacity() - cache->CacheUsage();
-//        all_free_mem.push_back(free_mem);
-//    }
-//
-//    auto max_e = std::max_element(all_free_mem.begin(), all_free_mem.end());
-//    auto best_index = std::distance(all_free_mem.begin(), max_e);
-//    auto best_device_id = gpus[best_index];
-//
-//    ResourcePtr res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, best_device_id);
-//    if (not res_ptr) {
-//        SERVER_LOG_ERROR << "GpuResource " << best_device_id << " invalid.";
-//        // TODO: throw critical error and exit
-//        return false;
-//    }
-//
-//    auto label = std::make_shared<SpecResLabel>(std::weak_ptr<Resource>(res_ptr));
-//    task->label() = label;
-//
-//    return true;
-// }
+// Labels FAISS_IVFSQ8H search tasks with nq >= 100 to run on the GPU whose cache has the most free room.
+// Returns true when a resource label was attached to the task, false when the task is left untouched.
+bool
+LargeSQ8HPass::Run(const TaskPtr& task) {
+    if (task->Type() != TaskType::SearchTask) {
+        return false;
+    }
+
+    auto search_task = std::static_pointer_cast<XSearchTask>(task);
+    if (search_task->file_->engine_type_ != static_cast<int>(engine::EngineType::FAISS_IVFSQ8H)) {
+        return false;
+    }
+
+    auto search_job = std::static_pointer_cast<SearchJob>(search_task->job_.lock());
+    // lock() may come back empty, so the job is null-checked before nq() is read.
+    // TODO: future, Index::IVFSQ8H, if nq < threshold set cpu, else set gpu
+    if (search_job == nullptr || search_job->nq() < 100) {
+        return false;
+    }
+
+    // An empty pool would make max_element() return end() and gpus[0] read out of bounds.
+    std::vector<uint64_t> gpus = scheduler::get_gpu_pool();
+    if (gpus.empty()) {
+        return false;
+    }
+    std::vector<int64_t> all_free_mem;
+    for (auto& gpu : gpus) {
+        auto cache = cache::GpuCacheMgr::GetInstance(gpu);
+        all_free_mem.push_back(cache->CacheCapacity() - cache->CacheUsage());
+    }
+    auto max_e = std::max_element(all_free_mem.begin(), all_free_mem.end());
+    auto best_device_id = gpus[std::distance(all_free_mem.begin(), max_e)];
+
+    ResourcePtr res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, best_device_id);
+    if (not res_ptr) {
+        SERVER_LOG_ERROR << "GpuResource " << best_device_id << " invalid.";
+        // TODO: throw critical error and exit
+        return false;
+    }
+    task->label() = std::make_shared<SpecResLabel>(std::weak_ptr<Resource>(res_ptr));
+    return true;
+}
 
 }  // namespace scheduler
 }  // namespace milvus
diff --git a/core/src/scheduler/optimizer/LargeSQ8HPass.h b/core/src/scheduler/optimizer/LargeSQ8HPass.h
index 3335a37cc7..49e658002f 100644
--- a/core/src/scheduler/optimizer/LargeSQ8HPass.h
+++ b/core/src/scheduler/optimizer/LargeSQ8HPass.h
@@ -37,8 +37,8 @@ class LargeSQ8HPass : public Pass {
     LargeSQ8HPass() = default;
 
  public:
-    //    bool
-    //    Run(const TaskPtr& task) override;
+    bool
+    Run(const TaskPtr& task) override;
 };
 
 using LargeSQ8HPassPtr = std::shared_ptr<LargeSQ8HPass>;
diff --git a/core/src/server/DBWrapper.cpp b/core/src/server/DBWrapper.cpp
index a5b892ad47..34c8d38faf 100644
--- a/core/src/server/DBWrapper.cpp
+++ b/core/src/server/DBWrapper.cpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <faiss/utils.h>
+#include <faiss/utils/distances.h>
 #include <omp.h>
 #include <cmath>
 #include <string>
diff --git a/core/src/wrapper/VecIndex.h b/core/src/wrapper/VecIndex.h
index 05da9ccc03..1729d583ae 100644
--- a/core/src/wrapper/VecIndex.h
+++ b/core/src/wrapper/VecIndex.h
@@ -70,6 +70,7 @@ class VecIndex : public cache::DataObj {
     virtual VecIndexPtr
     CopyToCpu(const Config& cfg = Config()) = 0;
 
+    // TODO(linxj): Deprecated
     virtual VecIndexPtr
     Clone() = 0;
 
diff --git a/core/unittest/wrapper/test_wrapper.cpp b/core/unittest/wrapper/test_wrapper.cpp
index f112fc7e65..2f8fd6fafe 100644
--- a/core/unittest/wrapper/test_wrapper.cpp
+++ b/core/unittest/wrapper/test_wrapper.cpp
@@ -74,7 +74,7 @@ INSTANTIATE_TEST_CASE_P(WrapperParam, KnowhereWrapperTest,
                                             10,
                                             10),
                             std::make_tuple(milvus::engine::IndexType::FAISS_IVFSQ8_CPU, "Default", DIM, NB, 10, 10),
-//                            std::make_tuple(milvus::engine::IndexType::FAISS_IVFSQ8_GPU, "Default", DIM, NB, 10, 10),
+                            std::make_tuple(milvus::engine::IndexType::FAISS_IVFSQ8_GPU, "Default", DIM, NB, 10, 10),
                             std::make_tuple(milvus::engine::IndexType::FAISS_IVFSQ8_MIX, "Default", DIM, NB, 10, 10),
 //                            std::make_tuple(IndexType::NSG_MIX, "Default", 128, 250000, 10, 10),
 //                            std::make_tuple(IndexType::SPTAG_KDT_RNT_CPU, "Default", 128, 250000, 10, 10),