diff --git a/internal/core/src/common/Common.cpp b/internal/core/src/common/Common.cpp
index 78a86f5860..eaa15b1107 100644
--- a/internal/core/src/common/Common.cpp
+++ b/internal/core/src/common/Common.cpp
@@ -19,27 +19,27 @@
 namespace milvus {
 
-int64_t index_file_slice_size = DEFAULT_INDEX_FILE_SLICE_SIZE;
-int64_t thread_core_coefficient = DEFAULT_THREAD_CORE_COEFFICIENT;
-int cpu_num = DEFAULT_CPU_NUM;
+int64_t FILE_SLICE_SIZE = DEFAULT_INDEX_FILE_SLICE_SIZE;
+int64_t THREAD_CORE_COEFFICIENT = DEFAULT_THREAD_CORE_COEFFICIENT;
+int CPU_NUM = DEFAULT_CPU_NUM;
 
 void
 SetIndexSliceSize(const int64_t size) {
-    index_file_slice_size = size;
-    LOG_SEGCORE_DEBUG_ << "set config index slice size: "
-                       << index_file_slice_size;
+    FILE_SLICE_SIZE = size << 20;
+    LOG_SEGCORE_DEBUG_ << "set config index slice size (byte): "
+                       << FILE_SLICE_SIZE;
 }
 
 void
 SetThreadCoreCoefficient(const int64_t coefficient) {
-    thread_core_coefficient = coefficient;
+    THREAD_CORE_COEFFICIENT = coefficient;
     LOG_SEGCORE_DEBUG_ << "set thread pool core coefficient: "
-                       << thread_core_coefficient;
+                       << THREAD_CORE_COEFFICIENT;
 }
 
 void
 SetCpuNum(const int num) {
-    cpu_num = num;
+    CPU_NUM = num;
 }
 
 }  // namespace milvus
diff --git a/internal/core/src/common/Common.h b/internal/core/src/common/Common.h
index be1153f999..bf637aa1f3 100644
--- a/internal/core/src/common/Common.h
+++ b/internal/core/src/common/Common.h
@@ -21,9 +21,9 @@
 namespace milvus {
 
-extern int64_t index_file_slice_size;
-extern int64_t thread_core_coefficient;
-extern int cpu_num;
+extern int64_t FILE_SLICE_SIZE;
+extern int64_t THREAD_CORE_COEFFICIENT;
+extern int CPU_NUM;
 
 void
 SetIndexSliceSize(const int64_t size);
diff --git a/internal/core/src/common/Consts.h b/internal/core/src/common/Consts.h
index e8b5b569ed..ea07272ce3 100644
--- a/internal/core/src/common/Consts.h
+++ b/internal/core/src/common/Consts.h
@@ -39,10 +39,10 @@ const char INDEX_BUILD_ID_KEY[] = "indexBuildID";
 const char INDEX_ROOT_PATH[] = "index_files";
 const char RAWDATA_ROOT_PATH[] = "raw_datas";
 
-const int64_t DEFAULT_DISK_INDEX_MAX_MEMORY_LIMIT = 67108864;  // bytes
+const int64_t DEFAULT_FIELD_MAX_MEMORY_LIMIT = 67108864;  // bytes
 
 const int64_t DEFAULT_THREAD_CORE_COEFFICIENT = 50;
 
-const int64_t DEFAULT_INDEX_FILE_SLICE_SIZE = 4;  // megabytes
+const int64_t DEFAULT_INDEX_FILE_SLICE_SIZE = 4194304;  // bytes
 
 const int DEFAULT_CPU_NUM = 1;
diff --git a/internal/core/src/common/LoadInfo.h b/internal/core/src/common/LoadInfo.h
index 319eeeec30..372a5527ac 100644
--- a/internal/core/src/common/LoadInfo.h
+++ b/internal/core/src/common/LoadInfo.h
@@ -18,19 +18,24 @@
 #include
 #include
+#include
 
 #include "Types.h"
 #include "common/CDataType.h"
 
 // NOTE: field_id can be system field
 // NOTE: Refer to common/SystemProperty.cpp for details
-// TODO: use arrow to pass field data instead of proto
-struct LoadFieldDataInfo {
+struct FieldBinlogInfo {
     int64_t field_id;
-    // const void* blob = nullptr;
-    const milvus::DataArray* field_data;
-    int64_t row_count{-1};
-    const char* mmap_dir_path{nullptr};
+    int64_t row_count = -1;
+    std::vector<std::string> insert_files;
+};
+
+struct LoadFieldDataInfo {
+    std::map<int64_t, FieldBinlogInfo> field_infos;
+    // Set null to disable mmap,
+    // mmap file path will be {mmap_dir_path}/{segment_id}/{field_id}
+    std::string mmap_dir_path = "";
 };
 
 struct LoadDeletedRecordInfo {
diff --git a/internal/core/src/common/Slice.cpp b/internal/core/src/common/Slice.cpp
index 3ea398e0b8..362615c9ef 100644
--- a/internal/core/src/common/Slice.cpp
+++ b/internal/core/src/common/Slice.cpp
@@ -20,11 +20,10 @@
namespace milvus { -static const char* INDEX_FILE_SLICE_META = "SLICE_META"; -static const char* META = "meta"; -static const char* NAME = "name"; -static const char* SLICE_NUM = "slice_num"; -static const char* TOTAL_LEN = "total_len"; +std::string +GenSlicedFileName(const std::string& prefix, size_t slice_num) { + return prefix + "_" + std::to_string(slice_num); +} void Slice(const std::string& prefix, @@ -42,8 +41,7 @@ Slice(const std::string& prefix, auto size = static_cast(ri - i); auto slice_i = std::shared_ptr(new uint8_t[size]); memcpy(slice_i.get(), data_src->data.get() + i, size); - binarySet.Append( - prefix + "_" + std::to_string(slice_num), slice_i, ri - i); + binarySet.Append(GenSlicedFileName(prefix, slice_num), slice_i, ri - i); i = ri; } ret[NAME] = prefix; @@ -68,7 +66,7 @@ Assemble(BinarySet& binarySet) { auto p_data = std::shared_ptr(new uint8_t[total_len]); int64_t pos = 0; for (auto i = 0; i < slice_num; ++i) { - auto slice_i_sp = binarySet.Erase(prefix + "_" + std::to_string(i)); + auto slice_i_sp = binarySet.Erase(GenSlicedFileName(prefix, i)); memcpy(p_data.get() + pos, slice_i_sp->data.get(), static_cast(slice_i_sp->size)); @@ -90,17 +88,15 @@ Disassemble(BinarySet& binarySet) { } } - const int64_t slice_size_in_byte = index_file_slice_size << 20; std::vector slice_key_list; for (auto& kv : binarySet.binary_map_) { - if (kv.second->size > slice_size_in_byte) { + if (kv.second->size > FILE_SLICE_SIZE) { slice_key_list.push_back(kv.first); } } for (auto& key : slice_key_list) { Config slice_i; - Slice( - key, binarySet.Erase(key), slice_size_in_byte, binarySet, slice_i); + Slice(key, binarySet.Erase(key), FILE_SLICE_SIZE, binarySet, slice_i); meta_info[META].emplace_back(slice_i); } if (!slice_key_list.empty()) { diff --git a/internal/core/src/common/Slice.h b/internal/core/src/common/Slice.h index ef51e3fb51..d08b988343 100644 --- a/internal/core/src/common/Slice.h +++ b/internal/core/src/common/Slice.h @@ -20,6 +20,16 @@ namespace milvus { +// used for disassemble and assemble index data +const char INDEX_FILE_SLICE_META[] = "SLICE_META"; +const char META[] = "meta"; +const char NAME[] = "name"; +const char SLICE_NUM[] = "slice_num"; +const char TOTAL_LEN[] = "total_len"; + +std::string +GenSlicedFileName(const std::string& prefix, size_t slice_num); + void Assemble(BinarySet& binarySet); diff --git a/internal/core/src/common/Utils.h b/internal/core/src/common/Utils.h index 5f80678da2..3c79d77bf7 100644 --- a/internal/core/src/common/Utils.h +++ b/internal/core/src/common/Utils.h @@ -28,7 +28,6 @@ #include "common/FieldMeta.h" #include "common/LoadInfo.h" #include "common/Types.h" -#include "config/ConfigChunkManager.h" #include "exceptions/EasyAssert.h" #include "knowhere/dataset.h" #include "knowhere/expected.h" @@ -209,263 +208,24 @@ MatchKnowhereError(knowhere::Status status) { } } -inline size_t -GetDataSize(const FieldMeta& field, size_t row_count, const DataArray* data) { - auto data_type = field.get_data_type(); - if (datatype_is_variable(data_type)) { - switch (data_type) { - case DataType::VARCHAR: - case DataType::STRING: { - ssize_t size{}; - for (auto& data : FIELD_DATA(data, string)) { - size += data.size(); - } - return size; - } - case DataType::JSON: { - ssize_t size{}; - for (auto& data : FIELD_DATA(data, json)) { - size += data.size(); - } - return size; - } - default: - PanicInfo(fmt::format("not supported data type {}", - datatype_name(data_type))); - } - } - - return field.get_sizeof() * row_count; +inline std::vector +DISK_INDEX_LIST() { + 
static std::vector ret{ + knowhere::IndexEnum::INDEX_DISKANN, + }; + return ret; } -inline void* -FillField(DataType data_type, - size_t size, - const LoadFieldDataInfo& info, - void* dst) { - auto data = info.field_data; - switch (data_type) { - case DataType::BOOL: { - return memcpy(dst, FIELD_DATA(data, bool).data(), size); - } - case DataType::INT8: { - auto src_data = FIELD_DATA(data, int); - std::vector data_raw(src_data.size()); - std::copy_n(src_data.data(), src_data.size(), data_raw.data()); - return memcpy(dst, data_raw.data(), size); - } - case DataType::INT16: { - auto src_data = FIELD_DATA(data, int); - std::vector data_raw(src_data.size()); - std::copy_n(src_data.data(), src_data.size(), data_raw.data()); - return memcpy(dst, data_raw.data(), size); - } - case DataType::INT32: { - return memcpy(dst, FIELD_DATA(data, int).data(), size); - } - case DataType::INT64: { - return memcpy(dst, FIELD_DATA(data, long).data(), size); - } - case DataType::FLOAT: { - return memcpy(dst, FIELD_DATA(data, float).data(), size); - } - case DataType::DOUBLE: { - return memcpy(dst, FIELD_DATA(data, double).data(), size); - } - case DataType::VARCHAR: { - char* dest = reinterpret_cast(dst); - for (auto& data : FIELD_DATA(data, string)) { - memcpy(dest, data.data(), data.size()); - dest += data.size(); - } - return dst; - } - - case DataType::JSON: { - char* dest = reinterpret_cast(dst); - for (auto& data : FIELD_DATA(data, json)) { - memcpy(dest, data.data(), data.size()); - dest += data.size(); - } - return dst; - } - - case DataType::VECTOR_FLOAT: - return memcpy(dst, VEC_FIELD_DATA(data, float).data(), size); - - case DataType::VECTOR_BINARY: - return memcpy(dst, VEC_FIELD_DATA(data, binary), size); - - default: { - PanicInfo("unsupported"); - } - } +template +inline bool +is_in_list(const T& t, std::function()> list_func) { + auto l = list_func(); + return std::find(l.begin(), l.end(), t) != l.end(); } -inline ssize_t -WriteFieldData(int fd, DataType data_type, const DataArray* data, size_t size) { - switch (data_type) { - case DataType::BOOL: { - return write(fd, FIELD_DATA(data, bool).data(), size); - } - case DataType::INT8: { - auto src_data = FIELD_DATA(data, int); - std::vector data_raw(src_data.size()); - std::copy_n(src_data.data(), src_data.size(), data_raw.data()); - return write(fd, data_raw.data(), size); - } - case DataType::INT16: { - auto src_data = FIELD_DATA(data, int); - std::vector data_raw(src_data.size()); - std::copy_n(src_data.data(), src_data.size(), data_raw.data()); - return write(fd, data_raw.data(), size); - } - case DataType::INT32: { - return write(fd, FIELD_DATA(data, int).data(), size); - } - case DataType::INT64: { - return write(fd, FIELD_DATA(data, long).data(), size); - } - case DataType::FLOAT: { - return write(fd, FIELD_DATA(data, float).data(), size); - } - case DataType::DOUBLE: { - return write(fd, FIELD_DATA(data, double).data(), size); - } - case DataType::VARCHAR: { - ssize_t total_written{0}; - for (auto& str : FIELD_DATA(data, string)) { - ssize_t written = write(fd, str.data(), str.size()); - if (written < str.size()) { - break; - } - total_written += written; - } - return total_written; - } - case DataType::JSON: { - ssize_t total_written{0}; - for (auto& json : FIELD_DATA(data, json)) { - ssize_t written = write(fd, json.data(), json.size()); - if (written < json.size()) { - break; - } - total_written += written; - } - return total_written; - } - case DataType::VECTOR_FLOAT: - return write(fd, VEC_FIELD_DATA(data, float).data(), size); - - case 
DataType::VECTOR_BINARY: - return write(fd, VEC_FIELD_DATA(data, binary), size); - - default: { - PanicInfo("unsupported"); - } - } -} - -// CreateMap creates a memory mapping, -// if mmap enabled, this writes field data to disk and create a map to the file, -// otherwise this just alloc memory -inline void* -CreateMap(int64_t segment_id, - const FieldMeta& field_meta, - const LoadFieldDataInfo& info) { - static int mmap_flags = MAP_PRIVATE; -#ifdef MAP_POPULATE - // macOS doesn't support MAP_POPULATE - mmap_flags |= MAP_POPULATE; -#endif - - // simdjson requires a padding following the json data - size_t padding = field_meta.get_data_type() == DataType::JSON - ? simdjson::SIMDJSON_PADDING - : 0; - // Allocate memory - if (info.mmap_dir_path == nullptr) { - auto data_type = field_meta.get_data_type(); - auto data_size = - GetDataSize(field_meta, info.row_count, info.field_data); - if (data_size == 0) - return nullptr; - - // Use anon mapping so we are able to free these memory with munmap only - void* map = mmap(nullptr, - data_size + padding, - PROT_READ | PROT_WRITE, - mmap_flags | MAP_ANON, - -1, - 0); - AssertInfo( - map != MAP_FAILED, - fmt::format("failed to create anon map, err: {}", strerror(errno))); - FillField(data_type, data_size, info, map); - return map; - } - - auto filepath = std::filesystem::path(info.mmap_dir_path) / - std::to_string(segment_id) / std::to_string(info.field_id); - auto dir = filepath.parent_path(); - std::filesystem::create_directories(dir); - - int fd = - open(filepath.c_str(), O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR); - AssertInfo(fd != -1, - fmt::format("failed to create mmap file {}", filepath.c_str())); - - auto data_type = field_meta.get_data_type(); - size_t size = field_meta.get_sizeof() * info.row_count; - auto written = WriteFieldData(fd, data_type, info.field_data, size); - AssertInfo( - written == size || - written != -1 && datatype_is_variable(field_meta.get_data_type()), - fmt::format( - "failed to write data file {}, written {} but total {}, err: {}", - filepath.c_str(), - written, - size, - strerror(errno))); - int ok = fsync(fd); - AssertInfo(ok == 0, - fmt::format("failed to fsync mmap data file {}, err: {}", - filepath.c_str(), - strerror(errno))); - - // Empty field - if (written == 0) { - return nullptr; - } - - auto map = mmap(nullptr, written + padding, PROT_READ, mmap_flags, fd, 0); - AssertInfo(map != MAP_FAILED, - fmt::format("failed to create map for data file {}, err: {}", - filepath.c_str(), - strerror(errno))); - -#ifndef MAP_POPULATE - // Manually access the mapping to populate it - const size_t page_size = getpagesize(); - char* begin = (char*)map; - char* end = begin + written; - for (char* page = begin; page < end; page += page_size) { - char value = page[0]; - } -#endif - // unlink this data file so - // then it will be auto removed after we don't need it again - ok = unlink(filepath.c_str()); - AssertInfo(ok == 0, - fmt::format("failed to unlink mmap data file {}, err: {}", - filepath.c_str(), - strerror(errno))); - ok = close(fd); - AssertInfo(ok == 0, - fmt::format("failed to close data file {}, err: {}", - filepath.c_str(), - strerror(errno))); - return map; +inline bool +is_in_disk_list(const IndexType& index_type) { + return is_in_list(index_type, DISK_INDEX_LIST); } } // namespace milvus diff --git a/internal/core/src/common/init_c.cpp b/internal/core/src/common/init_c.cpp index a9ac5f5505..37b4e5ae7a 100644 --- a/internal/core/src/common/init_c.cpp +++ b/internal/core/src/common/init_c.cpp @@ -20,36 +20,24 
@@ #include "common/init_c.h" #include -#include "config/ConfigChunkManager.h" #include "common/Slice.h" #include "common/Common.h" #include "common/Tracer.h" #include "log/Log.h" -std::once_flag flag1, flag2, flag3, flag4; +std::once_flag flag1, flag2, flag3; std::once_flag traceFlag; -void -InitLocalRootPath(const char* root_path) { - std::string local_path_root(root_path); - std::call_once( - flag1, - [](std::string path) { - milvus::ChunkMangerConfig::SetLocalRootPath(path); - }, - local_path_root); -} - void InitIndexSliceSize(const int64_t size) { std::call_once( - flag2, [](int64_t size) { milvus::SetIndexSliceSize(size); }, size); + flag1, [](int64_t size) { milvus::SetIndexSliceSize(size); }, size); } void InitThreadCoreCoefficient(const int64_t value) { std::call_once( - flag3, + flag2, [](int64_t value) { milvus::SetThreadCoreCoefficient(value); }, value); } @@ -57,7 +45,7 @@ InitThreadCoreCoefficient(const int64_t value) { void InitCpuNum(const int value) { std::call_once( - flag4, [](int value) { milvus::SetCpuNum(value); }, value); + flag3, [](int value) { milvus::SetCpuNum(value); }, value); } void diff --git a/internal/core/src/common/init_c.h b/internal/core/src/common/init_c.h index 4ee9373429..2a01efee46 100644 --- a/internal/core/src/common/init_c.h +++ b/internal/core/src/common/init_c.h @@ -33,9 +33,6 @@ InitThreadCoreCoefficient(const int64_t); void InitCpuNum(const int); -void -InitLocalRootPath(const char*); - void InitTrace(CTraceConfig* config); diff --git a/internal/core/src/common/type_c.h b/internal/core/src/common/type_c.h index 425828258f..aeb96d7648 100644 --- a/internal/core/src/common/type_c.h +++ b/internal/core/src/common/type_c.h @@ -69,16 +69,6 @@ typedef struct CProto { int64_t proto_size; } CProto; -typedef struct CLoadFieldDataInfo { - int64_t field_id; - const uint8_t* blob; - uint64_t blob_size; - int64_t row_count; - // Set null to disable mmap, - // mmap file path will be {mmap_dir_path}/{segment_id}/{field_id} - const char* mmap_dir_path; -} CLoadFieldDataInfo; - typedef struct CLoadDeletedRecordInfo { void* timestamps; const uint8_t* primary_keys; @@ -91,7 +81,7 @@ typedef struct CStorageConfig { const char* bucket_name; const char* access_key_id; const char* access_key_value; - const char* remote_root_path; + const char* root_path; const char* storage_type; const char* iam_endpoint; bool useSSL; diff --git a/internal/core/src/config/CMakeLists.txt b/internal/core/src/config/CMakeLists.txt index 60f019ef73..a167af69b3 100644 --- a/internal/core/src/config/CMakeLists.txt +++ b/internal/core/src/config/CMakeLists.txt @@ -22,7 +22,6 @@ endif() set(CONFIG_SRC ConfigKnowhere.cpp - ConfigChunkManager.cpp ) add_library(milvus_config STATIC ${CONFIG_SRC}) diff --git a/internal/core/src/index/BoolIndex.h b/internal/core/src/index/BoolIndex.h index 10894e5c7d..3e18a127f0 100644 --- a/internal/core/src/index/BoolIndex.h +++ b/internal/core/src/index/BoolIndex.h @@ -22,12 +22,10 @@ namespace milvus::index { -//// TODO: optimize here. 
-class BoolIndex : public ScalarIndexSort<bool> {};
-using BoolIndexPtr = std::shared_ptr<BoolIndex>;
+using BoolIndexPtr = std::shared_ptr<ScalarIndexSort<bool>>;
 
 inline BoolIndexPtr
-CreateBoolIndex() {
-    return std::make_unique<BoolIndex>();
+CreateBoolIndex(storage::FileManagerImplPtr file_manager = nullptr) {
+    return std::make_unique<ScalarIndexSort<bool>>(file_manager);
 }
 }  // namespace milvus::index
diff --git a/internal/core/src/index/Index.h b/internal/core/src/index/Index.h
index 2976a83f3e..3d488f36b6 100644
--- a/internal/core/src/index/Index.h
+++ b/internal/core/src/index/Index.h
@@ -33,6 +33,9 @@ class IndexBase {
     virtual void
     Load(const BinarySet& binary_set, const Config& config = {}) = 0;
 
+    virtual void
+    Load(const Config& config = {}) = 0;
+
     virtual void
     BuildWithRawData(size_t n,
                      const void* values,
@@ -41,9 +44,15 @@ class IndexBase {
     virtual void
     BuildWithDataset(const DatasetPtr& dataset, const Config& config = {}) = 0;
 
+    virtual void
+    Build(const Config& config = {}) = 0;
+
     virtual int64_t
     Count() = 0;
 
+    virtual BinarySet
+    Upload(const Config& config = {}) = 0;
+
  protected:
     IndexType index_type_ = "";
 };
diff --git a/internal/core/src/index/IndexFactory-inl.h b/internal/core/src/index/IndexFactory-inl.h
index c4e5e7254e..f8c120fb23 100644
--- a/internal/core/src/index/IndexFactory-inl.h
+++ b/internal/core/src/index/IndexFactory-inl.h
@@ -23,8 +23,9 @@ namespace milvus::index {
 template <typename T>
 inline ScalarIndexPtr<T>
-IndexFactory::CreateScalarIndex(const IndexType& index_type) {
-    return CreateScalarIndexSort<T>();
+IndexFactory::CreateScalarIndex(const IndexType& index_type,
+                                storage::FileManagerImplPtr file_manager) {
+    return CreateScalarIndexSort<T>(file_manager);
 }
 
 // template <>
 // inline ScalarIndexPtr<bool>
 // IndexFactory::CreateScalarIndex(const IndexType& index_type) {
 
 template <>
 inline ScalarIndexPtr<std::string>
-IndexFactory::CreateScalarIndex(const IndexType& index_type) {
+IndexFactory::CreateScalarIndex(const IndexType& index_type,
+                                storage::FileManagerImplPtr file_manager) {
 #if defined(__linux__) || defined(__APPLE__)
-    return CreateStringIndexMarisa();
+    return CreateStringIndexMarisa(file_manager);
 #else
     throw std::runtime_error("unsupported platform");
 #endif
diff --git a/internal/core/src/index/IndexFactory.cpp b/internal/core/src/index/IndexFactory.cpp
index 0b593cdd5d..c4c6178e96 100644
--- a/internal/core/src/index/IndexFactory.cpp
+++ b/internal/core/src/index/IndexFactory.cpp
@@ -33,35 +33,36 @@ IndexFactory::CreateIndex(const CreateIndexInfo& create_index_info,
         return CreateVectorIndex(create_index_info, file_manager);
     }
 
-    return CreateScalarIndex(create_index_info);
+    return CreateScalarIndex(create_index_info, file_manager);
 }
 
 IndexBasePtr
-IndexFactory::CreateScalarIndex(const CreateIndexInfo& create_index_info) {
+IndexFactory::CreateScalarIndex(const CreateIndexInfo& create_index_info,
+                                storage::FileManagerImplPtr file_manager) {
     auto data_type = create_index_info.field_type;
     auto index_type = create_index_info.index_type;
 
     switch (data_type) {
         // create scalar index
         case DataType::BOOL:
-            return CreateScalarIndex<bool>(index_type);
+            return CreateScalarIndex<bool>(index_type, file_manager);
         case DataType::INT8:
-            return CreateScalarIndex<int8_t>(index_type);
+            return CreateScalarIndex<int8_t>(index_type, file_manager);
         case DataType::INT16:
-            return CreateScalarIndex<int16_t>(index_type);
+            return CreateScalarIndex<int16_t>(index_type, file_manager);
         case DataType::INT32:
-            return CreateScalarIndex<int32_t>(index_type);
+            return CreateScalarIndex<int32_t>(index_type, file_manager);
         case DataType::INT64:
-            return CreateScalarIndex<int64_t>(index_type);
+            return CreateScalarIndex<int64_t>(index_type, file_manager);
case DataType::FLOAT: - return CreateScalarIndex(index_type); + return CreateScalarIndex(index_type, file_manager); case DataType::DOUBLE: - return CreateScalarIndex(index_type); + return CreateScalarIndex(index_type, file_manager); // create string index case DataType::STRING: case DataType::VARCHAR: - return CreateScalarIndex(index_type); + return CreateScalarIndex(index_type, file_manager); default: throw std::invalid_argument( std::string("invalid data type to build index: ") + @@ -93,10 +94,12 @@ IndexFactory::CreateVectorIndex(const CreateIndexInfo& create_index_info, #endif if (is_in_nm_list(index_type)) { - return std::make_unique(index_type, metric_type); + return std::make_unique( + index_type, metric_type, file_manager); } // create mem index - return std::make_unique(index_type, metric_type); + return std::make_unique( + index_type, metric_type, file_manager); } } // namespace milvus::index diff --git a/internal/core/src/index/IndexFactory.h b/internal/core/src/index/IndexFactory.h index daa525f29c..ec1c9cfdf8 100644 --- a/internal/core/src/index/IndexFactory.h +++ b/internal/core/src/index/IndexFactory.h @@ -21,7 +21,6 @@ #include #include "common/type_c.h" -#include "config/ConfigChunkManager.h" #include "index/Index.h" #include "index/ScalarIndex.h" #include "index/VectorIndex.h" @@ -29,11 +28,6 @@ #include "storage/Types.h" #include "storage/FileManager.h" -#ifdef BUILD_DISK_ANN -#include "storage/LocalChunkManager.h" -#include "storage/MinioChunkManager.h" -#endif - namespace milvus::index { class IndexFactory { @@ -61,14 +55,16 @@ class IndexFactory { storage::FileManagerImplPtr file_manager); IndexBasePtr - CreateScalarIndex(const CreateIndexInfo& create_index_info); + CreateScalarIndex(const CreateIndexInfo& create_index_info, + storage::FileManagerImplPtr file_manager = nullptr); // IndexBasePtr // CreateIndex(DataType dtype, const IndexType& index_type); private: template ScalarIndexPtr - CreateScalarIndex(const IndexType& index_type); + CreateScalarIndex(const IndexType& index_type, + storage::FileManagerImplPtr file_manager = nullptr); }; } // namespace milvus::index diff --git a/internal/core/src/index/ScalarIndexSort-inl.h b/internal/core/src/index/ScalarIndexSort-inl.h index 267d0e4586..9338655dc8 100644 --- a/internal/core/src/index/ScalarIndexSort-inl.h +++ b/internal/core/src/index/ScalarIndexSort-inl.h @@ -24,22 +24,64 @@ #include "Meta.h" #include "common/Utils.h" #include "common/Slice.h" +#include "index/Utils.h" namespace milvus::index { template -inline ScalarIndexSort::ScalarIndexSort() : is_built_(false), data_() { -} - -template -inline ScalarIndexSort::ScalarIndexSort(const size_t n, const T* values) - : is_built_(false) { - ScalarIndexSort::BuildWithDataset(n, values); +inline ScalarIndexSort::ScalarIndexSort( + storage::FileManagerImplPtr file_manager) + : is_built_(false), data_() { + if (file_manager != nullptr) { + file_manager_ = std::dynamic_pointer_cast( + file_manager); + } } template inline void -ScalarIndexSort::Build(const size_t n, const T* values) { +ScalarIndexSort::Build(const Config& config) { + if (is_built_) + return; + auto insert_files = + GetValueFromConfig>(config, "insert_files"); + AssertInfo(insert_files.has_value(), + "insert file paths is empty when build index"); + auto field_datas = + file_manager_->CacheRawDataToMemory(insert_files.value()); + + int64_t total_num_rows = 0; + for (auto data : field_datas) { + total_num_rows += data->get_num_rows(); + } + if (total_num_rows == 0) { + // todo: throw an exception + throw 
std::invalid_argument( + "ScalarIndexSort cannot build null values!"); + } + + data_.reserve(total_num_rows); + int64_t offset = 0; + for (auto data : field_datas) { + auto slice_num = data->get_num_rows(); + for (size_t i = 0; i < slice_num; ++i) { + auto value = reinterpret_cast(data->RawValue(i)); + data_.emplace_back(IndexStructure(*value, offset)); + offset++; + } + } + + std::sort(data_.begin(), data_.end()); + idx_to_offsets_.resize(total_num_rows); + for (size_t i = 0; i < total_num_rows; ++i) { + idx_to_offsets_[data_[i].idx_] = i; + } + is_built_ = true; +} + +template +inline void +ScalarIndexSort::Build(size_t n, const T* values) { if (is_built_) return; if (n == 0) { @@ -82,11 +124,26 @@ ScalarIndexSort::Serialize(const Config& config) { return res_set; } +template +inline BinarySet +ScalarIndexSort::Upload(const Config& config) { + auto binary_set = Serialize(config); + file_manager_->AddFile(binary_set); + + auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize(); + BinarySet ret; + for (auto& file : remote_paths_to_size) { + ret.Append(file.first, nullptr, file.second); + } + + return ret; +} + template inline void -ScalarIndexSort::Load(const BinarySet& index_binary, const Config& config) { +ScalarIndexSort::LoadWithoutAssemble(const BinarySet& index_binary, + const Config& config) { size_t index_size; - milvus::Assemble(const_cast(index_binary)); auto index_length = index_binary.GetByName("index_length"); memcpy(&index_size, index_length->data.get(), (size_t)index_length->size); @@ -100,6 +157,34 @@ ScalarIndexSort::Load(const BinarySet& index_binary, const Config& config) { is_built_ = true; } +template +inline void +ScalarIndexSort::Load(const BinarySet& index_binary, const Config& config) { + milvus::Assemble(const_cast(index_binary)); + LoadWithoutAssemble(index_binary, config); +} + +template +inline void +ScalarIndexSort::Load(const Config& config) { + auto index_files = + GetValueFromConfig>(config, "index_files"); + AssertInfo(index_files.has_value(), + "index file paths is empty when load disk ann index"); + auto index_datas = file_manager_->LoadIndexToMemory(index_files.value()); + AssembleIndexDatas(index_datas); + BinarySet binary_set; + for (auto& [key, data] : index_datas) { + auto size = data->Size(); + auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction + auto buf = std::shared_ptr( + (uint8_t*)const_cast(data->Data()), deleter); + binary_set.Append(key, buf, size); + } + + LoadWithoutAssemble(binary_set, config); +} + template inline const TargetBitmap ScalarIndexSort::In(const size_t n, const T* values) { diff --git a/internal/core/src/index/ScalarIndexSort.h b/internal/core/src/index/ScalarIndexSort.h index 3e1fc6851c..9003c57dc7 100644 --- a/internal/core/src/index/ScalarIndexSort.h +++ b/internal/core/src/index/ScalarIndexSort.h @@ -21,16 +21,19 @@ #include #include #include +#include + #include "index/IndexStructure.h" #include "index/ScalarIndex.h" +#include "storage/MemFileManagerImpl.h" namespace milvus::index { template class ScalarIndexSort : public ScalarIndex { public: - ScalarIndexSort(); - ScalarIndexSort(size_t n, const T* values); + explicit ScalarIndexSort( + storage::FileManagerImplPtr file_manager = nullptr); BinarySet Serialize(const Config& config) override; @@ -38,6 +41,9 @@ class ScalarIndexSort : public ScalarIndex { void Load(const BinarySet& index_binary, const Config& config = {}) override; + void + Load(const Config& config = {}) override; + int64_t Count() override { return data_.size(); @@ -46,6 
+52,9 @@ class ScalarIndexSort : public ScalarIndex { void Build(size_t n, const T* values) override; + void + Build(const Config& config = {}) override; + const TargetBitmap In(size_t n, const T* values) override; @@ -69,6 +78,9 @@ class ScalarIndexSort : public ScalarIndex { return (int64_t)data_.size(); } + BinarySet + Upload(const Config& config = {}) override; + public: const std::vector>& GetData() { @@ -80,11 +92,15 @@ class ScalarIndexSort : public ScalarIndex { return is_built_; } + void + LoadWithoutAssemble(const BinarySet& binary_set, const Config& config); + private: bool is_built_; Config config_; std::vector idx_to_offsets_; // used to retrieve. std::vector> data_; + std::shared_ptr file_manager_; }; template @@ -97,7 +113,7 @@ using ScalarIndexSortPtr = std::unique_ptr>; namespace milvus::index { template inline ScalarIndexSortPtr -CreateScalarIndexSort() { - return std::make_unique>(); +CreateScalarIndexSort(storage::FileManagerImplPtr file_manager = nullptr) { + return std::make_unique>(file_manager); } } // namespace milvus::index diff --git a/internal/core/src/index/StringIndexMarisa.cpp b/internal/core/src/index/StringIndexMarisa.cpp index 8b68750bcf..660594aaae 100644 --- a/internal/core/src/index/StringIndexMarisa.cpp +++ b/internal/core/src/index/StringIndexMarisa.cpp @@ -31,11 +31,77 @@ namespace milvus::index { #if defined(__linux__) || defined(__APPLE__) +class UnistdException : public std::runtime_error { + public: + explicit UnistdException(const std::string& msg) : std::runtime_error(msg) { + } + + virtual ~UnistdException() { + } +}; + +StringIndexMarisa::StringIndexMarisa(storage::FileManagerImplPtr file_manager) { + if (file_manager != nullptr) { + file_manager_ = std::dynamic_pointer_cast( + file_manager); + } +} + int64_t StringIndexMarisa::Size() { return trie_.size(); } +bool +valid_str_id(size_t str_id) { + return str_id >= 0 && str_id != MARISA_INVALID_KEY_ID; +} + +void +StringIndexMarisa::Build(const Config& config) { + if (built_) { + throw std::runtime_error("index has been built"); + } + + auto insert_files = + GetValueFromConfig>(config, "insert_files"); + AssertInfo(insert_files.has_value(), + "insert file paths is empty when build index"); + auto field_datas = + file_manager_->CacheRawDataToMemory(insert_files.value()); + int64_t total_num_rows = 0; + + // fill key set. 
+ marisa::Keyset keyset; + for (auto data : field_datas) { + auto slice_num = data->get_num_rows(); + for (size_t i = 0; i < slice_num; ++i) { + keyset.push_back( + (*static_cast(data->RawValue(i))).c_str()); + } + total_num_rows += slice_num; + } + trie_.build(keyset); + + // fill str_ids_ + str_ids_.resize(total_num_rows); + int64_t offset = 0; + for (auto data : field_datas) { + auto slice_num = data->get_num_rows(); + for (size_t i = 0; i < slice_num; ++i) { + auto str_id = + lookup(*static_cast(data->RawValue(i))); + AssertInfo(valid_str_id(str_id), "invalid marisa key"); + str_ids_[offset++] = str_id; + } + } + + // fill str_ids_to_offsets_ + fill_offsets(); + + built_ = true; +} + void StringIndexMarisa::Build(size_t n, const std::string* values) { if (built_) { @@ -68,15 +134,17 @@ StringIndexMarisa::Serialize(const Config& config) { trie_.write(fd); auto size = get_file_size(fd); - auto buf = new uint8_t[size]; + auto index_data = std::shared_ptr(new uint8_t[size]); - while (read(fd, buf, size) != size) { - lseek(fd, 0, SEEK_SET); - } - std::shared_ptr index_data(buf); + lseek(fd, 0, SEEK_SET); + auto status = read(fd, index_data.get(), size); close(fd); remove(file.c_str()); + if (status != size) { + throw UnistdException("read index from fd error, errorCode is " + + std::to_string(status)); + } auto str_ids_len = str_ids_.size() * sizeof(size_t); std::shared_ptr str_ids(new uint8_t[str_ids_len]); @@ -86,15 +154,28 @@ StringIndexMarisa::Serialize(const Config& config) { res_set.Append(MARISA_TRIE_INDEX, index_data, size); res_set.Append(MARISA_STR_IDS, str_ids, str_ids_len); - milvus::Disassemble(res_set); + Disassemble(res_set); return res_set; } -void -StringIndexMarisa::Load(const BinarySet& set, const Config& config) { - milvus::Assemble(const_cast(set)); +BinarySet +StringIndexMarisa::Upload(const Config& config) { + auto binary_set = Serialize(config); + file_manager_->AddFile(binary_set); + auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize(); + BinarySet ret; + for (auto& file : remote_paths_to_size) { + ret.Append(file.first, nullptr, file.second); + } + + return ret; +} + +void +StringIndexMarisa::LoadWithoutAssemble(const BinarySet& set, + const Config& config) { auto uuid = boost::uuids::random_generator()(); auto uuid_string = boost::uuids::to_string(uuid); auto file = std::string("/tmp/") + uuid_string; @@ -105,8 +186,13 @@ StringIndexMarisa::Load(const BinarySet& set, const Config& config) { auto fd = open( file.c_str(), O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IXUSR); lseek(fd, 0, SEEK_SET); - while (write(fd, index->data.get(), len) != len) { - lseek(fd, 0, SEEK_SET); + + auto status = write(fd, index->data.get(), len); + if (status != len) { + close(fd); + remove(file.c_str()); + throw UnistdException("write index to fd error, errorCode is " + + std::to_string(status)); } lseek(fd, 0, SEEK_SET); @@ -122,9 +208,30 @@ StringIndexMarisa::Load(const BinarySet& set, const Config& config) { fill_offsets(); } -bool -valid_str_id(size_t str_id) { - return str_id >= 0 && str_id != MARISA_INVALID_KEY_ID; +void +StringIndexMarisa::Load(const BinarySet& set, const Config& config) { + milvus::Assemble(const_cast(set)); + LoadWithoutAssemble(set, config); +} + +void +StringIndexMarisa::Load(const Config& config) { + auto index_files = + GetValueFromConfig>(config, "index_files"); + AssertInfo(index_files.has_value(), + "index file paths is empty when load index"); + auto index_datas = file_manager_->LoadIndexToMemory(index_files.value()); + 
AssembleIndexDatas(index_datas); + BinarySet binary_set; + for (auto& [key, data] : index_datas) { + auto size = data->Size(); + auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction + auto buf = std::shared_ptr( + (uint8_t*)const_cast(data->Data()), deleter); + binary_set.Append(key, buf, size); + } + + LoadWithoutAssemble(binary_set, config); } const TargetBitmap @@ -248,7 +355,7 @@ StringIndexMarisa::fill_str_ids(size_t n, const std::string* values) { for (size_t i = 0; i < n; i++) { auto str = values[i]; auto str_id = lookup(str); - assert(valid_str_id(str_id)); + AssertInfo(valid_str_id(str_id), "invalid marisa key"); str_ids_[i] = str_id; } } diff --git a/internal/core/src/index/StringIndexMarisa.h b/internal/core/src/index/StringIndexMarisa.h index 3d0afedc5e..ca9ee3612e 100644 --- a/internal/core/src/index/StringIndexMarisa.h +++ b/internal/core/src/index/StringIndexMarisa.h @@ -24,12 +24,14 @@ #include #include #include +#include "storage/MemFileManagerImpl.h" namespace milvus::index { class StringIndexMarisa : public StringIndex { public: - StringIndexMarisa() = default; + explicit StringIndexMarisa( + storage::FileManagerImplPtr file_manager = nullptr); int64_t Size() override; @@ -40,6 +42,9 @@ class StringIndexMarisa : public StringIndex { void Load(const BinarySet& set, const Config& config = {}) override; + void + Load(const Config& config = {}) override; + int64_t Count() override { return str_ids_.size(); @@ -48,6 +53,9 @@ class StringIndexMarisa : public StringIndex { void Build(size_t n, const std::string* values) override; + void + Build(const Config& config = {}) override; + const TargetBitmap In(size_t n, const std::string* values) override; @@ -69,6 +77,9 @@ class StringIndexMarisa : public StringIndex { std::string Reverse_Lookup(size_t offset) const override; + BinarySet + Upload(const Config& config = {}) override; + private: void fill_str_ids(size_t n, const std::string* values); @@ -83,19 +94,23 @@ class StringIndexMarisa : public StringIndex { std::vector prefix_match(const std::string_view prefix); + void + LoadWithoutAssemble(const BinarySet& binary_set, const Config& config); + private: Config config_; marisa::Trie trie_; std::vector str_ids_; // used to retrieve. 
std::map> str_ids_to_offsets_; bool built_ = false; + std::shared_ptr file_manager_; }; using StringIndexMarisaPtr = std::unique_ptr; inline StringIndexPtr -CreateStringIndexMarisa() { - return std::make_unique(); +CreateStringIndexMarisa(storage::FileManagerImplPtr file_manager = nullptr) { + return std::make_unique(file_manager); } } // namespace milvus::index diff --git a/internal/core/src/index/Utils.cpp b/internal/core/src/index/Utils.cpp index 809a6bccf5..7e0da5ec33 100644 --- a/internal/core/src/index/Utils.cpp +++ b/internal/core/src/index/Utils.cpp @@ -25,6 +25,9 @@ #include #include "exceptions/EasyAssert.h" #include "knowhere/comp/index_param.h" +#include "common/Slice.h" +#include "storage/Util.h" + namespace milvus::index { size_t @@ -51,14 +54,6 @@ BIN_List() { return ret; } -std::vector -DISK_LIST() { - static std::vector ret{ - knowhere::IndexEnum::INDEX_DISKANN, - }; - return ret; -} - std::vector> unsupported_index_combinations() { static std::vector> ret{ @@ -78,11 +73,6 @@ is_in_nm_list(const IndexType& index_type) { return is_in_list(index_type, NM_List); } -bool -is_in_disk_list(const IndexType& index_type) { - return is_in_list(index_type, DISK_LIST); -} - bool is_unsupported(const IndexType& index_type, const MetricType& metric_type) { return is_in_list>( @@ -197,4 +187,36 @@ ParseConfigFromIndexParams( return config; } +void +AssembleIndexDatas(std::map& index_datas) { + if (index_datas.find(INDEX_FILE_SLICE_META) != index_datas.end()) { + auto slice_meta = index_datas.at(INDEX_FILE_SLICE_META); + Config meta_data = Config::parse(std::string( + static_cast(slice_meta->Data()), slice_meta->Size())); + + for (auto& item : meta_data[META]) { + std::string prefix = item[NAME]; + int slice_num = item[SLICE_NUM]; + auto total_len = static_cast(item[TOTAL_LEN]); + + auto new_field_data = + storage::CreateFieldData(DataType::INT8, 1, total_len); + + for (auto i = 0; i < slice_num; ++i) { + std::string file_name = GenSlicedFileName(prefix, i); + AssertInfo(index_datas.find(file_name) != index_datas.end(), + "lost index slice data"); + auto data = index_datas.at(file_name); + auto len = data->Size(); + new_field_data->FillFieldData(data->Data(), len); + index_datas.erase(file_name); + } + AssertInfo( + new_field_data->IsFull(), + "index len is inconsistent after disassemble and assemble"); + index_datas[prefix] = new_field_data; + } + } +} + } // namespace milvus::index diff --git a/internal/core/src/index/Utils.h b/internal/core/src/index/Utils.h index 160fec8359..12ac2fef13 100644 --- a/internal/core/src/index/Utils.h +++ b/internal/core/src/index/Utils.h @@ -29,6 +29,7 @@ #include "common/Types.h" #include "index/IndexInfo.h" #include "storage/Types.h" +#include "storage/FieldData.h" namespace milvus::index { @@ -44,22 +45,12 @@ BIN_List(); std::vector> unsupported_index_combinations(); -template -inline bool -is_in_list(const T& t, std::function()> list_func) { - auto l = list_func(); - return std::find(l.begin(), l.end(), t) != l.end(); -} - bool is_in_bin_list(const IndexType& index_type); bool is_in_nm_list(const IndexType& index_type); -bool -is_in_disk_list(const IndexType& index_type); - bool is_unsupported(const IndexType& index_type, const MetricType& metric_type); @@ -118,4 +109,7 @@ Config ParseConfigFromIndexParams( const std::map& index_params); +void +AssembleIndexDatas(std::map& index_datas); + } // namespace milvus::index diff --git a/internal/core/src/index/VectorDiskIndex.cpp b/internal/core/src/index/VectorDiskIndex.cpp index e17fcc87ff..cb93ebac19 
100644 --- a/internal/core/src/index/VectorDiskIndex.cpp +++ b/internal/core/src/index/VectorDiskIndex.cpp @@ -20,7 +20,7 @@ #include "config/ConfigKnowhere.h" #include "index/Meta.h" #include "index/Utils.h" -#include "storage/LocalChunkManager.h" +#include "storage/LocalChunkManagerSingleton.h" #include "storage/Util.h" #include "common/Consts.h" #include "common/RangeSearchHelper.h" @@ -42,17 +42,18 @@ VectorDiskAnnIndex::VectorDiskAnnIndex( : VectorIndex(index_type, metric_type) { file_manager_ = std::dynamic_pointer_cast(file_manager); - auto& local_chunk_manager = storage::LocalChunkManager::GetInstance(); + auto local_chunk_manager = + storage::LocalChunkManagerSingleton::GetInstance().GetChunkManager(); auto local_index_path_prefix = file_manager_->GetLocalIndexObjectPrefix(); // As we have guarded dup-load in QueryNode, // this assertion failed only if the Milvus rebooted in the same pod, // need to remove these files then re-load the segment - if (local_chunk_manager.Exist(local_index_path_prefix)) { - local_chunk_manager.RemoveDir(local_index_path_prefix); + if (local_chunk_manager->Exist(local_index_path_prefix)) { + local_chunk_manager->RemoveDir(local_index_path_prefix); } - local_chunk_manager.CreateDir(local_index_path_prefix); + local_chunk_manager->CreateDir(local_index_path_prefix); auto diskann_index_pack = knowhere::Pack(std::shared_ptr(file_manager)); index_ = knowhere::IndexFactory::Instance().Create(GetIndexType(), @@ -63,6 +64,12 @@ template void VectorDiskAnnIndex::Load(const BinarySet& binary_set /* not used */, const Config& config) { + Load(config); +} + +template +void +VectorDiskAnnIndex::Load(const Config& config) { knowhere::Json load_config = update_load_json(config); auto index_files = @@ -80,18 +87,65 @@ VectorDiskAnnIndex::Load(const BinarySet& binary_set /* not used */, SetDim(index_.Dim()); } +template +BinarySet +VectorDiskAnnIndex::Upload(const Config& config) { + auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize(); + BinarySet ret; + for (auto& file : remote_paths_to_size) { + ret.Append(file.first, nullptr, file.second); + } + + return ret; +} + +template +void +VectorDiskAnnIndex::Build(const Config& config) { + auto local_chunk_manager = + storage::LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + knowhere::Json build_config; + build_config.update(config); + + auto segment_id = file_manager_->GetFieldDataMeta().segment_id; + auto insert_files = + GetValueFromConfig>(config, "insert_files"); + AssertInfo(insert_files.has_value(), + "insert file paths is empty when build disk ann index"); + auto local_data_path = + file_manager_->CacheRawDataToDisk(insert_files.value()); + build_config[DISK_ANN_RAW_DATA_PATH] = local_data_path; + + auto local_index_path_prefix = file_manager_->GetLocalIndexObjectPrefix(); + build_config[DISK_ANN_PREFIX_PATH] = local_index_path_prefix; + + auto num_threads = GetValueFromConfig( + build_config, DISK_ANN_BUILD_THREAD_NUM); + AssertInfo(num_threads.has_value(), + "param " + std::string(DISK_ANN_BUILD_THREAD_NUM) + "is empty"); + build_config[DISK_ANN_THREADS_NUM] = std::atoi(num_threads.value().c_str()); + knowhere::DataSet* ds_ptr = nullptr; + build_config.erase("insert_files"); + index_.Build(*ds_ptr, build_config); + + local_chunk_manager->RemoveDir( + storage::GetSegmentRawDataPathPrefix(local_chunk_manager, segment_id)); +} + template void VectorDiskAnnIndex::BuildWithDataset(const DatasetPtr& dataset, const Config& config) { - auto& local_chunk_manager = 
storage::LocalChunkManager::GetInstance(); + auto local_chunk_manager = + storage::LocalChunkManagerSingleton::GetInstance().GetChunkManager(); knowhere::Json build_config; build_config.update(config); // set data path - auto segment_id = file_manager_->GetFileDataMeta().segment_id; - auto field_id = file_manager_->GetFileDataMeta().field_id; - auto local_data_path = - storage::GenFieldRawDataPathPrefix(segment_id, field_id) + "raw_data"; + auto segment_id = file_manager_->GetFieldDataMeta().segment_id; + auto field_id = file_manager_->GetFieldDataMeta().field_id; + auto local_data_path = storage::GenFieldRawDataPathPrefix( + local_chunk_manager, segment_id, field_id) + + "raw_data"; build_config[DISK_ANN_RAW_DATA_PATH] = local_data_path; auto local_index_path_prefix = file_manager_->GetLocalIndexObjectPrefix(); @@ -103,30 +157,31 @@ VectorDiskAnnIndex::BuildWithDataset(const DatasetPtr& dataset, "param " + std::string(DISK_ANN_BUILD_THREAD_NUM) + "is empty"); build_config[DISK_ANN_THREADS_NUM] = std::atoi(num_threads.value().c_str()); - if (!local_chunk_manager.Exist(local_data_path)) { - local_chunk_manager.CreateFile(local_data_path); + if (!local_chunk_manager->Exist(local_data_path)) { + local_chunk_manager->CreateFile(local_data_path); } int64_t offset = 0; auto num = uint32_t(milvus::GetDatasetRows(dataset)); - local_chunk_manager.Write(local_data_path, offset, &num, sizeof(num)); + local_chunk_manager->Write(local_data_path, offset, &num, sizeof(num)); offset += sizeof(num); auto dim = uint32_t(milvus::GetDatasetDim(dataset)); - local_chunk_manager.Write(local_data_path, offset, &dim, sizeof(dim)); + local_chunk_manager->Write(local_data_path, offset, &dim, sizeof(dim)); offset += sizeof(dim); auto data_size = num * dim * sizeof(float); auto raw_data = const_cast(milvus::GetDatasetTensor(dataset)); - local_chunk_manager.Write(local_data_path, offset, raw_data, data_size); + local_chunk_manager->Write(local_data_path, offset, raw_data, data_size); knowhere::DataSet* ds_ptr = nullptr; auto stat = index_.Build(*ds_ptr, build_config); if (stat != knowhere::Status::success) PanicCodeInfo(ErrorCodeEnum::BuildIndexError, "failed to build index, " + MatchKnowhereError(stat)); - local_chunk_manager.RemoveDir( - storage::GetSegmentRawDataPathPrefix(segment_id)); + local_chunk_manager->RemoveDir( + storage::GetSegmentRawDataPathPrefix(local_chunk_manager, segment_id)); + // TODO :: // SetDim(index_->Dim()); } @@ -263,9 +318,11 @@ VectorDiskAnnIndex::GetVector(const DatasetPtr dataset) const { template void VectorDiskAnnIndex::CleanLocalData() { - auto& local_chunk_manager = storage::LocalChunkManager::GetInstance(); - local_chunk_manager.RemoveDir(file_manager_->GetLocalIndexObjectPrefix()); - local_chunk_manager.RemoveDir(file_manager_->GetLocalRawDataObjectPrefix()); + auto local_chunk_manager = + storage::LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + local_chunk_manager->RemoveDir(file_manager_->GetLocalIndexObjectPrefix()); + local_chunk_manager->RemoveDir( + file_manager_->GetLocalRawDataObjectPrefix()); } template diff --git a/internal/core/src/index/VectorDiskIndex.h b/internal/core/src/index/VectorDiskIndex.h index 3a6e3650e5..8bd3aac480 100644 --- a/internal/core/src/index/VectorDiskIndex.h +++ b/internal/core/src/index/VectorDiskIndex.h @@ -33,7 +33,7 @@ class VectorDiskAnnIndex : public VectorIndex { const MetricType& metric_type, storage::FileManagerImplPtr file_manager); BinarySet - Serialize(const Config& config) override { + Serialize(const Config& config) 
override { // deprecated auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize(); BinarySet binary_set; for (auto& file : remote_paths_to_size) { @@ -43,6 +43,9 @@ class VectorDiskAnnIndex : public VectorIndex { return binary_set; } + BinarySet + Upload(const Config& config = {}) override; + int64_t Count() override { return index_.Count(); @@ -52,10 +55,16 @@ class VectorDiskAnnIndex : public VectorIndex { Load(const BinarySet& binary_set /* not used */, const Config& config = {}) override; + void + Load(const Config& config = {}) override; + void BuildWithDataset(const DatasetPtr& dataset, const Config& config = {}) override; + void + Build(const Config& config = {}) override; + std::unique_ptr Query(const DatasetPtr dataset, const SearchInfo& search_info, diff --git a/internal/core/src/index/VectorMemIndex.cpp b/internal/core/src/index/VectorMemIndex.cpp index c9f5964209..0dce6002f3 100644 --- a/internal/core/src/index/VectorMemIndex.cpp +++ b/internal/core/src/index/VectorMemIndex.cpp @@ -33,14 +33,32 @@ namespace milvus::index { VectorMemIndex::VectorMemIndex(const IndexType& index_type, - const MetricType& metric_type) + const MetricType& metric_type, + storage::FileManagerImplPtr file_manager) : VectorIndex(index_type, metric_type) { AssertInfo(!is_unsupported(index_type, metric_type), index_type + " doesn't support metric: " + metric_type); - + if (file_manager != nullptr) { + file_manager_ = std::dynamic_pointer_cast( + file_manager); + } index_ = knowhere::IndexFactory::Instance().Create(GetIndexType()); } +BinarySet +VectorMemIndex::Upload(const Config& config) { + auto binary_set = Serialize(config); + file_manager_->AddFile(binary_set); + + auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize(); + BinarySet ret; + for (auto& file : remote_paths_to_size) { + ret.Append(file.first, nullptr, file.second); + } + + return ret; +} + BinarySet VectorMemIndex::Serialize(const Config& config) { knowhere::BinarySet ret; @@ -48,14 +66,14 @@ VectorMemIndex::Serialize(const Config& config) { if (stat != knowhere::Status::success) PanicCodeInfo(ErrorCodeEnum::UnexpectedError, "failed to serialize index, " + MatchKnowhereError(stat)); - milvus::Disassemble(ret); + Disassemble(ret); return ret; } void -VectorMemIndex::Load(const BinarySet& binary_set, const Config& config) { - milvus::Assemble(const_cast(binary_set)); +VectorMemIndex::LoadWithoutAssemble(const BinarySet& binary_set, + const Config& config) { auto stat = index_.Deserialize(binary_set); if (stat != knowhere::Status::success) PanicCodeInfo( @@ -64,6 +82,31 @@ VectorMemIndex::Load(const BinarySet& binary_set, const Config& config) { SetDim(index_.Dim()); } +void +VectorMemIndex::Load(const BinarySet& binary_set, const Config& config) { + milvus::Assemble(const_cast(binary_set)); + LoadWithoutAssemble(binary_set, config); +} + +void +VectorMemIndex::Load(const Config& config) { + auto index_files = + GetValueFromConfig>(config, "index_files"); + AssertInfo(index_files.has_value(), + "index file paths is empty when load index"); + auto index_datas = file_manager_->LoadIndexToMemory(index_files.value()); + AssembleIndexDatas(index_datas); + BinarySet binary_set; + for (auto& [key, data] : index_datas) { + auto size = data->Size(); + auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction + auto buf = std::shared_ptr( + (uint8_t*)const_cast(data->Data()), deleter); + binary_set.Append(key, buf, size); + } + LoadWithoutAssemble(binary_set, config); +} + void VectorMemIndex::BuildWithDataset(const 
DatasetPtr& dataset, const Config& config) { @@ -81,6 +124,43 @@ VectorMemIndex::BuildWithDataset(const DatasetPtr& dataset, SetDim(index_.Dim()); } +void +VectorMemIndex::Build(const Config& config) { + auto insert_files = + GetValueFromConfig>(config, "insert_files"); + AssertInfo(insert_files.has_value(), + "insert file paths is empty when build disk ann index"); + auto field_datas = + file_manager_->CacheRawDataToMemory(insert_files.value()); + + int64_t total_size = 0; + int64_t total_num_rows = 0; + int64_t dim = 0; + for (auto data : field_datas) { + total_size += data->Size(); + total_num_rows += data->get_num_rows(); + AssertInfo(dim == 0 || dim == data->get_dim(), + "inconsistent dim value between field datas!"); + dim = data->get_dim(); + } + + auto buf = std::shared_ptr(new uint8_t[total_size]); + int64_t offset = 0; + for (auto data : field_datas) { + std::memcpy(buf.get() + offset, data->Data(), data->Size()); + offset += data->Size(); + data.reset(); + } + field_datas.clear(); + + Config build_config; + build_config.update(config); + build_config.erase("insert_files"); + + auto dataset = GenDataset(total_num_rows, dim, buf.get()); + BuildWithDataset(dataset, build_config); +} + void VectorMemIndex::AddWithDataset(const DatasetPtr& dataset, const Config& config) { diff --git a/internal/core/src/index/VectorMemIndex.h b/internal/core/src/index/VectorMemIndex.h index e0b0ea4141..acf082bae2 100644 --- a/internal/core/src/index/VectorMemIndex.h +++ b/internal/core/src/index/VectorMemIndex.h @@ -23,13 +23,15 @@ #include #include "knowhere/factory.h" #include "index/VectorIndex.h" +#include "storage/MemFileManagerImpl.h" namespace milvus::index { class VectorMemIndex : public VectorIndex { public: explicit VectorMemIndex(const IndexType& index_type, - const MetricType& metric_type); + const MetricType& metric_type, + storage::FileManagerImplPtr file_manager = nullptr); BinarySet Serialize(const Config& config) override; @@ -37,10 +39,16 @@ class VectorMemIndex : public VectorIndex { void Load(const BinarySet& binary_set, const Config& config = {}) override; + void + Load(const Config& config = {}) override; + void BuildWithDataset(const DatasetPtr& dataset, const Config& config = {}) override; + void + Build(const Config& config = {}) override; + void AddWithDataset(const DatasetPtr& dataset, const Config& config) override; @@ -60,9 +68,17 @@ class VectorMemIndex : public VectorIndex { const std::vector GetVector(const DatasetPtr dataset) const override; + BinarySet + Upload(const Config& config = {}) override; + + protected: + virtual void + LoadWithoutAssemble(const BinarySet& binary_set, const Config& config); + protected: Config config_; knowhere::Index index_; + std::shared_ptr file_manager_; }; using VectorMemIndexPtr = std::unique_ptr; diff --git a/internal/core/src/index/VectorMemNMIndex.cpp b/internal/core/src/index/VectorMemNMIndex.cpp index 6d165cd039..e7890b5d52 100644 --- a/internal/core/src/index/VectorMemNMIndex.cpp +++ b/internal/core/src/index/VectorMemNMIndex.cpp @@ -38,7 +38,7 @@ VectorMemNMIndex::Serialize(const Config& config) { auto raw_data = std::shared_ptr( static_cast(raw_data_.data()), deleter); ret.Append(RAW_DATA, raw_data, raw_data_.size()); - milvus::Disassemble(ret); + Disassemble(ret); return ret; } @@ -52,6 +52,17 @@ VectorMemNMIndex::BuildWithDataset(const DatasetPtr& dataset, rc.ElapseFromBegin("Done"); } +void +VectorMemNMIndex::LoadWithoutAssemble(const BinarySet& binary_set, + const Config& config) { + 
VectorMemIndex::LoadWithoutAssemble(binary_set, config); + if (binary_set.Contains(RAW_DATA)) { + std::call_once(raw_data_loaded_, [&]() { + LOG_SEGCORE_INFO_ << "NM index load raw data done!"; + }); + } +} + void VectorMemNMIndex::AddWithDataset(const DatasetPtr& /*dataset*/, const Config& /*config*/) { diff --git a/internal/core/src/index/VectorMemNMIndex.h b/internal/core/src/index/VectorMemNMIndex.h index b986feedd1..56ff474e09 100644 --- a/internal/core/src/index/VectorMemNMIndex.h +++ b/internal/core/src/index/VectorMemNMIndex.h @@ -28,9 +28,11 @@ namespace milvus::index { class VectorMemNMIndex : public VectorMemIndex { public: - explicit VectorMemNMIndex(const IndexType& index_type, - const MetricType& metric_type) - : VectorMemIndex(index_type, metric_type) { + explicit VectorMemNMIndex( + const IndexType& index_type, + const MetricType& metric_type, + storage::FileManagerImplPtr file_manager = nullptr) + : VectorMemIndex(index_type, metric_type, file_manager) { AssertInfo(is_in_nm_list(index_type), "not valid nm index type"); } @@ -52,6 +54,10 @@ class VectorMemNMIndex : public VectorMemIndex { const SearchInfo& search_info, const BitsetView& bitset) override; + void + LoadWithoutAssemble(const BinarySet& binary_set, + const Config& config) override; + private: void store_raw_data(const DatasetPtr& dataset); diff --git a/internal/core/src/indexbuilder/IndexCreatorBase.h b/internal/core/src/indexbuilder/IndexCreatorBase.h index 15ce6e8a5c..b6a2ac44b2 100644 --- a/internal/core/src/indexbuilder/IndexCreatorBase.h +++ b/internal/core/src/indexbuilder/IndexCreatorBase.h @@ -13,6 +13,7 @@ #include #include "common/Types.h" +#include "storage/FileManager.h" namespace milvus::indexbuilder { class IndexCreatorBase { @@ -22,12 +23,18 @@ class IndexCreatorBase { virtual void Build(const milvus::DatasetPtr& dataset) = 0; + virtual void + Build() = 0; + virtual milvus::BinarySet Serialize() = 0; // used for test. 
virtual void Load(const milvus::BinarySet&) = 0; + + virtual BinarySet + Upload() = 0; }; using IndexCreatorBasePtr = std::unique_ptr; diff --git a/internal/core/src/indexbuilder/IndexFactory.h b/internal/core/src/indexbuilder/IndexFactory.h index 132b229279..3f2538d1df 100644 --- a/internal/core/src/indexbuilder/IndexFactory.h +++ b/internal/core/src/indexbuilder/IndexFactory.h @@ -13,13 +13,15 @@ #include #include +#include +#include + #include "indexbuilder/IndexCreatorBase.h" #include "indexbuilder/ScalarIndexCreator.h" #include "indexbuilder/VecIndexCreator.h" #include "indexbuilder/type_c.h" #include "storage/Types.h" -#include -#include +#include "storage/FileManager.h" namespace milvus::indexbuilder { @@ -40,15 +42,13 @@ class IndexFactory { } IndexCreatorBasePtr - CreateIndex(CDataType dtype, - const char* type_params, - const char* index_params, - const storage::StorageConfig& storage_config) { - auto real_dtype = DataType(dtype); - auto invalid_dtype_msg = std::string("invalid data type: ") + - std::to_string(int(real_dtype)); + CreateIndex(DataType type, + Config& config, + storage::FileManagerImplPtr file_manager) { + auto invalid_dtype_msg = + std::string("invalid data type: ") + std::to_string(int(type)); - switch (real_dtype) { + switch (type) { case DataType::BOOL: case DataType::INT8: case DataType::INT16: @@ -58,12 +58,12 @@ class IndexFactory { case DataType::DOUBLE: case DataType::VARCHAR: case DataType::STRING: - return CreateScalarIndex(real_dtype, type_params, index_params); + return CreateScalarIndex(type, config, file_manager); case DataType::VECTOR_FLOAT: case DataType::VECTOR_BINARY: return std::make_unique( - real_dtype, type_params, index_params, storage_config); + type, config, file_manager); default: throw std::invalid_argument(invalid_dtype_msg); } diff --git a/internal/core/src/indexbuilder/ScalarIndexCreator.cpp b/internal/core/src/indexbuilder/ScalarIndexCreator.cpp index 6f3f5d329c..7d4eefc7f9 100644 --- a/internal/core/src/indexbuilder/ScalarIndexCreator.cpp +++ b/internal/core/src/indexbuilder/ScalarIndexCreator.cpp @@ -21,30 +21,14 @@ namespace milvus::indexbuilder { ScalarIndexCreator::ScalarIndexCreator(DataType dtype, - const char* type_params, - const char* index_params) - : dtype_(dtype) { - // TODO: move parse-related logic to a common interface. 
- proto::indexcgo::TypeParams type_params_; - proto::indexcgo::IndexParams index_params_; - milvus::index::ParseFromString(type_params_, std::string(type_params)); - milvus::index::ParseFromString(index_params_, std::string(index_params)); - - for (auto i = 0; i < type_params_.params_size(); ++i) { - const auto& param = type_params_.params(i); - config_[param.key()] = param.value(); - } - - for (auto i = 0; i < index_params_.params_size(); ++i) { - const auto& param = index_params_.params(i); - config_[param.key()] = param.value(); - } - + Config& config, + storage::FileManagerImplPtr file_manager) + : dtype_(dtype), config_(config) { milvus::index::CreateIndexInfo index_info; index_info.field_type = dtype_; index_info.index_type = index_type(); - index_ = - index::IndexFactory::GetInstance().CreateIndex(index_info, nullptr); + index_ = index::IndexFactory::GetInstance().CreateIndex(index_info, + file_manager); } void @@ -54,6 +38,11 @@ ScalarIndexCreator::Build(const milvus::DatasetPtr& dataset) { index_->BuildWithRawData(size, data); } +void +ScalarIndexCreator::Build() { + index_->Build(config_); +} + milvus::BinarySet ScalarIndexCreator::Serialize() { return index_->Serialize(config_); @@ -70,4 +59,9 @@ ScalarIndexCreator::index_type() { return "sort"; } +BinarySet +ScalarIndexCreator::Upload() { + return index_->Upload(); +} + } // namespace milvus::indexbuilder diff --git a/internal/core/src/indexbuilder/ScalarIndexCreator.h b/internal/core/src/indexbuilder/ScalarIndexCreator.h index 9d4feb246c..9d6fe9214c 100644 --- a/internal/core/src/indexbuilder/ScalarIndexCreator.h +++ b/internal/core/src/indexbuilder/ScalarIndexCreator.h @@ -23,18 +23,24 @@ namespace milvus::indexbuilder { class ScalarIndexCreator : public IndexCreatorBase { public: ScalarIndexCreator(DataType data_type, - const char* type_params, - const char* index_params); + Config& config, + storage::FileManagerImplPtr file_manager); void Build(const milvus::DatasetPtr& dataset) override; + void + Build() override; + milvus::BinarySet Serialize() override; void Load(const milvus::BinarySet&) override; + BinarySet + Upload() override; + private: std::string index_type(); @@ -49,10 +55,9 @@ using ScalarIndexCreatorPtr = std::unique_ptr; inline ScalarIndexCreatorPtr CreateScalarIndex(DataType dtype, - const char* type_params, - const char* index_params) { - return std::make_unique( - dtype, type_params, index_params); + Config& config, + storage::FileManagerImplPtr file_manager) { + return std::make_unique(dtype, config, file_manager); } } // namespace milvus::indexbuilder diff --git a/internal/core/src/indexbuilder/VecIndexCreator.cpp b/internal/core/src/indexbuilder/VecIndexCreator.cpp index 18c2eace60..962ff7f0b9 100644 --- a/internal/core/src/indexbuilder/VecIndexCreator.cpp +++ b/internal/core/src/indexbuilder/VecIndexCreator.cpp @@ -17,50 +17,17 @@ #include "index/IndexFactory.h" #include "pb/index_cgo_msg.pb.h" -#ifdef BUILD_DISK_ANN -#include "storage/DiskFileManagerImpl.h" -#endif - namespace milvus::indexbuilder { VecIndexCreator::VecIndexCreator(DataType data_type, - const char* serialized_type_params, - const char* serialized_index_params, - const storage::StorageConfig& storage_config) - : data_type_(data_type) { - proto::indexcgo::TypeParams type_params_; - proto::indexcgo::IndexParams index_params_; - milvus::index::ParseFromString(type_params_, - std::string(serialized_type_params)); - milvus::index::ParseFromString(index_params_, - std::string(serialized_index_params)); - - for (auto i = 0; i < 
type_params_.params_size(); ++i) { - const auto& param = type_params_.params(i); - config_[param.key()] = param.value(); - } - - for (auto i = 0; i < index_params_.params_size(); ++i) { - const auto& param = index_params_.params(i); - config_[param.key()] = param.value(); - } - + Config& config, + storage::FileManagerImplPtr file_manager) + : data_type_(data_type), config_(config) { index::CreateIndexInfo index_info; index_info.field_type = data_type_; index_info.index_type = index::GetIndexTypeFromConfig(config_); index_info.metric_type = index::GetMetricTypeFromConfig(config_); - std::shared_ptr file_manager = nullptr; -#ifdef BUILD_DISK_ANN - if (index::is_in_disk_list(index_info.index_type)) { - // For now, only support diskann index - file_manager = std::make_shared( - index::GetFieldDataMetaFromConfig(config_), - index::GetIndexMetaFromConfig(config_), - storage_config); - } -#endif - index_ = index::IndexFactory::GetInstance().CreateIndex(index_info, file_manager); AssertInfo(index_ != nullptr, @@ -77,6 +44,11 @@ VecIndexCreator::Build(const milvus::DatasetPtr& dataset) { index_->BuildWithDataset(dataset, config_); } +void +VecIndexCreator::Build() { + index_->Build(config_); +} + milvus::BinarySet VecIndexCreator::Serialize() { return index_->Serialize(config_); @@ -95,6 +67,11 @@ VecIndexCreator::Query(const milvus::DatasetPtr& dataset, return vector_index->Query(dataset, search_info, bitset); } +BinarySet +VecIndexCreator::Upload() { + return index_->Upload(); +} + void VecIndexCreator::CleanLocalData() { auto vector_index = dynamic_cast(index_.get()); diff --git a/internal/core/src/indexbuilder/VecIndexCreator.h b/internal/core/src/indexbuilder/VecIndexCreator.h index a78a3f5f49..13aef5f477 100644 --- a/internal/core/src/indexbuilder/VecIndexCreator.h +++ b/internal/core/src/indexbuilder/VecIndexCreator.h @@ -27,13 +27,15 @@ namespace milvus::indexbuilder { class VecIndexCreator : public IndexCreatorBase { public: explicit VecIndexCreator(DataType data_type, - const char* serialized_type_params, - const char* serialized_index_params, - const storage::StorageConfig& storage_config); + Config& config, + storage::FileManagerImplPtr file_manager); void Build(const milvus::DatasetPtr& dataset) override; + void + Build() override; + milvus::BinarySet Serialize() override; @@ -48,6 +50,9 @@ class VecIndexCreator : public IndexCreatorBase { const SearchInfo& search_info, const BitsetView& bitset); + BinarySet + Upload() override; + public: void CleanLocalData(); diff --git a/internal/core/src/indexbuilder/index_c.cpp b/internal/core/src/indexbuilder/index_c.cpp index 9342863eb4..23c0459d02 100644 --- a/internal/core/src/indexbuilder/index_c.cpp +++ b/internal/core/src/indexbuilder/index_c.cpp @@ -21,41 +21,40 @@ #include "indexbuilder/IndexFactory.h" #include "common/type_c.h" #include "storage/Types.h" +#include "indexbuilder/types.h" +#include "index/Utils.h" +#include "pb/index_cgo_msg.pb.h" +#include "storage/Util.h" CStatus CreateIndex(enum CDataType dtype, const char* serialized_type_params, const char* serialized_index_params, - CIndex* res_index, - CStorageConfig c_storage_config) { + CIndex* res_index) { auto status = CStatus(); try { AssertInfo(res_index, "failed to create index, passed index was null"); - std::string address(c_storage_config.address); - std::string bucket_name(c_storage_config.bucket_name); - std::string access_key(c_storage_config.access_key_id); - std::string access_value(c_storage_config.access_key_value); - std::string 
remote_root_path(c_storage_config.remote_root_path); - std::string storage_type(c_storage_config.storage_type); - std::string iam_endpoint(c_storage_config.iam_endpoint); - auto storage_config = - milvus::storage::StorageConfig{address, - bucket_name, - access_key, - access_value, - remote_root_path, - storage_type, - iam_endpoint, - c_storage_config.useSSL, - c_storage_config.useIAM}; + milvus::proto::indexcgo::TypeParams type_params; + milvus::proto::indexcgo::IndexParams index_params; + milvus::index::ParseFromString(type_params, serialized_type_params); + milvus::index::ParseFromString(index_params, serialized_index_params); + milvus::Config config; + for (auto i = 0; i < type_params.params_size(); ++i) { + const auto& param = type_params.params(i); + config[param.key()] = param.value(); + } + + for (auto i = 0; i < index_params.params_size(); ++i) { + const auto& param = index_params.params(i); + config[param.key()] = param.value(); + } + + auto& index_factory = milvus::indexbuilder::IndexFactory::GetInstance(); auto index = - milvus::indexbuilder::IndexFactory::GetInstance().CreateIndex( - dtype, - serialized_type_params, - serialized_index_params, - storage_config); + index_factory.CreateIndex(milvus::DataType(dtype), config, nullptr); + *res_index = index.release(); status.error_code = Success; status.error_msg = ""; @@ -66,6 +65,65 @@ CreateIndex(enum CDataType dtype, return status; } +CStatus +CreateIndexV2(CIndex* res_index, CBuildIndexInfo c_build_index_info) { + try { + auto build_index_info = (BuildIndexInfo*)c_build_index_info; + auto field_type = build_index_info->field_type; + + milvus::index::CreateIndexInfo index_info; + index_info.field_type = build_index_info->field_type; + + auto& config = build_index_info->config; + config["insert_files"] = build_index_info->insert_files; + + // get index type + auto index_type = milvus::index::GetValueFromConfig( + config, "index_type"); + AssertInfo(index_type.has_value(), "index type is empty"); + index_info.index_type = index_type.value(); + + // get metric type + if (milvus::datatype_is_vector(field_type)) { + auto metric_type = milvus::index::GetValueFromConfig( + config, "metric_type"); + AssertInfo(metric_type.has_value(), "metric type is empty"); + index_info.metric_type = metric_type.value(); + } + + // init file manager + milvus::storage::FieldDataMeta field_meta{ + build_index_info->collection_id, + build_index_info->partition_id, + build_index_info->segment_id, + build_index_info->field_id}; + milvus::storage::IndexMeta index_meta{build_index_info->segment_id, + build_index_info->field_id, + build_index_info->index_build_id, + build_index_info->index_version}; + auto chunk_manager = milvus::storage::CreateChunkManager( + build_index_info->storage_config); + auto file_manager = milvus::storage::CreateFileManager( + index_info.index_type, field_meta, index_meta, chunk_manager); + AssertInfo(file_manager != nullptr, "create file manager failed!"); + + auto index = + milvus::indexbuilder::IndexFactory::GetInstance().CreateIndex( + build_index_info->field_type, config, file_manager); + index->Build(); + *res_index = index.release(); + auto status = CStatus(); + status.error_code = Success; + status.error_msg = ""; + return status; + } catch (std::exception& e) { + auto status = CStatus(); + status.error_code = UnexpectedError; + status.error_msg = strdup(e.what()); + return status; + } +} + CStatus DeleteIndex(CIndex index) { auto status = CStatus(); @@ -219,3 +277,187 @@ CleanLocalData(CIndex index) { } return status; } + 
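Note: the new build path is exposed to Go through the CBuildIndexInfo helpers added below; the caller accumulates the storage config, serialized params, field/index meta and binlog paths, then hands the aggregate to CreateIndexV2 and uploads the result. A minimal caller-side sketch under assumed inputs: the ids, the protobuf-serialized param buffers and the file list are placeholders, and FloatVector is assumed to be the CDataType enumerator for float vectors from common/type_c.h.

#include <cassert>
#include <string>
#include <vector>

#include "common/type_c.h"
#include "indexbuilder/index_c.h"

// Sketch of the intended calling sequence (normally driven from Go via cgo).
// Error handling is reduced to asserts on the returned CStatus.
static void
CheckOk(const CStatus& status) {
    assert(status.error_code == Success);
}

CBinarySet
BuildAndUploadIndex(CStorageConfig storage_config,
                    const uint8_t* type_params, uint64_t type_params_len,
                    const uint8_t* index_params, uint64_t index_params_len,
                    const std::vector<std::string>& insert_files) {
    CBuildIndexInfo info = nullptr;
    CheckOk(NewBuildIndexInfo(&info, storage_config));
    CheckOk(AppendBuildTypeParam(info, type_params, type_params_len));
    CheckOk(AppendBuildIndexParam(info, index_params, index_params_len));
    // collection/partition/segment/field ids below are examples only
    CheckOk(AppendFieldMetaInfo(info, 1, 2, 3, 100, FloatVector));
    CheckOk(AppendIndexMetaInfo(info, /*index_id*/ 10, /*build_id*/ 11, /*version*/ 1));
    for (const auto& file : insert_files) {
        CheckOk(AppendInsertFilePath(info, file.c_str()));
    }

    CIndex index = nullptr;
    CheckOk(CreateIndexV2(&index, info));  // builds straight from the insert binlogs

    CBinarySet binary_set = nullptr;
    CheckOk(SerializeIndexAndUpLoad(index, &binary_set));  // writes index files to remote storage

    CheckOk(DeleteIndex(index));
    DeleteBuildIndexInfo(info);
    return binary_set;
}
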
+CStatus +NewBuildIndexInfo(CBuildIndexInfo* c_build_index_info, + CStorageConfig c_storage_config) { + try { + auto build_index_info = std::make_unique(); + auto& storage_config = build_index_info->storage_config; + storage_config.address = std::string(c_storage_config.address); + storage_config.bucket_name = std::string(c_storage_config.bucket_name); + storage_config.access_key_id = + std::string(c_storage_config.access_key_id); + storage_config.access_key_value = + std::string(c_storage_config.access_key_value); + storage_config.root_path = std::string(c_storage_config.root_path); + storage_config.storage_type = + std::string(c_storage_config.storage_type); + storage_config.iam_endpoint = + std::string(c_storage_config.iam_endpoint); + storage_config.useSSL = c_storage_config.useSSL; + storage_config.useIAM = c_storage_config.useIAM; + + *c_build_index_info = build_index_info.release(); + auto status = CStatus(); + status.error_code = Success; + status.error_msg = ""; + return status; + } catch (std::exception& e) { + auto status = CStatus(); + status.error_code = UnexpectedError; + status.error_msg = strdup(e.what()); + return status; + } +} + +void +DeleteBuildIndexInfo(CBuildIndexInfo c_build_index_info) { + auto info = (BuildIndexInfo*)c_build_index_info; + delete info; +} + +CStatus +AppendBuildIndexParam(CBuildIndexInfo c_build_index_info, + const uint8_t* serialized_index_params, + const uint64_t len) { + try { + auto build_index_info = (BuildIndexInfo*)c_build_index_info; + auto index_params = + std::make_unique(); + auto res = index_params->ParseFromArray(serialized_index_params, len); + AssertInfo(res, "Unmarshall index params failed"); + for (auto i = 0; i < index_params->params_size(); ++i) { + const auto& param = index_params->params(i); + build_index_info->config[param.key()] = param.value(); + } + + auto status = CStatus(); + status.error_code = Success; + status.error_msg = ""; + return status; + } catch (std::exception& e) { + auto status = CStatus(); + status.error_code = UnexpectedError; + status.error_msg = strdup(e.what()); + return status; + } +} + +CStatus +AppendBuildTypeParam(CBuildIndexInfo c_build_index_info, + const uint8_t* serialized_type_params, + const uint64_t len) { + try { + auto build_index_info = (BuildIndexInfo*)c_build_index_info; + auto type_params = + std::make_unique(); + auto res = type_params->ParseFromArray(serialized_type_params, len); + AssertInfo(res, "Unmarshall index build type params failed"); + for (auto i = 0; i < type_params->params_size(); ++i) { + const auto& param = type_params->params(i); + build_index_info->config[param.key()] = param.value(); + } + + auto status = CStatus(); + status.error_code = Success; + status.error_msg = ""; + return status; + } catch (std::exception& e) { + auto status = CStatus(); + status.error_code = UnexpectedError; + status.error_msg = strdup(e.what()); + return status; + } +} + +CStatus +AppendFieldMetaInfo(CBuildIndexInfo c_build_index_info, + int64_t collection_id, + int64_t partition_id, + int64_t segment_id, + int64_t field_id, + enum CDataType field_type) { + try { + auto build_index_info = (BuildIndexInfo*)c_build_index_info; + build_index_info->collection_id = collection_id; + build_index_info->partition_id = partition_id; + build_index_info->segment_id = segment_id; + build_index_info->field_id = field_id; + build_index_info->field_type = milvus::DataType(field_type); + + auto status = CStatus(); + status.error_code = Success; + status.error_msg = ""; + return status; + } catch 
(std::exception& e) { + auto status = CStatus(); + status.error_code = UnexpectedError; + status.error_msg = strdup(e.what()); + return status; + } +} + +CStatus +AppendIndexMetaInfo(CBuildIndexInfo c_build_index_info, + int64_t index_id, + int64_t build_id, + int64_t version) { + try { + auto build_index_info = (BuildIndexInfo*)c_build_index_info; + build_index_info->index_id = index_id; + build_index_info->index_build_id = build_id; + build_index_info->index_version = version; + + auto status = CStatus(); + status.error_code = Success; + status.error_msg = ""; + return status; + } catch (std::exception& e) { + auto status = CStatus(); + status.error_code = UnexpectedError; + status.error_msg = strdup(e.what()); + return status; + } +} + +CStatus +AppendInsertFilePath(CBuildIndexInfo c_build_index_info, + const char* c_file_path) { + try { + auto build_index_info = (BuildIndexInfo*)c_build_index_info; + std::string insert_file_path(c_file_path); + build_index_info->insert_files.emplace_back(insert_file_path); + + auto status = CStatus(); + status.error_code = Success; + status.error_msg = ""; + return status; + } catch (std::exception& e) { + auto status = CStatus(); + status.error_code = UnexpectedError; + status.error_msg = strdup(e.what()); + return status; + } +} + +CStatus +SerializeIndexAndUpLoad(CIndex index, CBinarySet* c_binary_set) { + auto status = CStatus(); + try { + AssertInfo( + index, + "failed to serialize index to binary set, passed index was null"); + auto real_index = + reinterpret_cast(index); + auto binary = + std::make_unique(real_index->Upload()); + *c_binary_set = binary.release(); + status.error_code = Success; + status.error_msg = ""; + } catch (std::exception& e) { + status.error_code = UnexpectedError; + status.error_msg = strdup(e.what()); + } + return status; +} diff --git a/internal/core/src/indexbuilder/index_c.h b/internal/core/src/indexbuilder/index_c.h index ac3554b60b..c5ca784c7b 100644 --- a/internal/core/src/indexbuilder/index_c.h +++ b/internal/core/src/indexbuilder/index_c.h @@ -24,8 +24,7 @@ CStatus CreateIndex(enum CDataType dtype, const char* serialized_type_params, const char* serialized_index_params, - CIndex* res_index, - CStorageConfig storage_config); + CIndex* res_index); CStatus DeleteIndex(CIndex index); @@ -53,6 +52,46 @@ LoadIndexFromBinarySet(CIndex index, CBinarySet c_binary_set); CStatus CleanLocalData(CIndex index); +CStatus +NewBuildIndexInfo(CBuildIndexInfo* c_build_index_info, + CStorageConfig c_storage_config); + +void +DeleteBuildIndexInfo(CBuildIndexInfo c_build_index_info); + +CStatus +AppendBuildIndexParam(CBuildIndexInfo c_build_index_info, + const uint8_t* serialized_type_params, + const uint64_t len); + +CStatus +AppendBuildTypeParam(CBuildIndexInfo c_build_index_info, + const uint8_t* serialized_type_params, + const uint64_t len); + +CStatus +AppendFieldMetaInfo(CBuildIndexInfo c_build_index_info, + int64_t collection_id, + int64_t partition_id, + int64_t segment_id, + int64_t field_id, + enum CDataType field_type); + +CStatus +AppendIndexMetaInfo(CBuildIndexInfo c_build_index_info, + int64_t index_id, + int64_t build_id, + int64_t version); + +CStatus +AppendInsertFilePath(CBuildIndexInfo c_build_index_info, const char* file_path); + +CStatus +CreateIndexV2(CIndex* res_index, CBuildIndexInfo c_build_index_info); + +CStatus +SerializeIndexAndUpLoad(CIndex index, CBinarySet* c_binary_set); + #ifdef __cplusplus }; #endif diff --git a/internal/core/src/indexbuilder/type_c.h b/internal/core/src/indexbuilder/type_c.h index 
61001a8775..f0d07fb51d 100644 --- a/internal/core/src/indexbuilder/type_c.h +++ b/internal/core/src/indexbuilder/type_c.h @@ -15,3 +15,4 @@ typedef void* CIndex; typedef void* CIndexQueryResult; +typedef void* CBuildIndexInfo; diff --git a/internal/core/src/storage/FieldDataFactory.h b/internal/core/src/indexbuilder/types.h similarity index 57% rename from internal/core/src/storage/FieldDataFactory.h rename to internal/core/src/indexbuilder/types.h index ec462b8834..2301d0ae78 100644 --- a/internal/core/src/storage/FieldDataFactory.h +++ b/internal/core/src/indexbuilder/types.h @@ -14,35 +14,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once - +#include #include +#include +#include "common/Types.h" +#include "index/Index.h" +#include "storage/Types.h" -#include "storage/FieldData.h" - -namespace milvus::storage { - -class FieldDataFactory { - private: - FieldDataFactory() = default; - FieldDataFactory(const FieldDataFactory&) = delete; - FieldDataFactory - operator=(const FieldDataFactory&) = delete; - - public: - static FieldDataFactory& - GetInstance() { - static FieldDataFactory inst; - return inst; - } - - std::string - GetName() const { - return "FieldDataFactory"; - } - - FieldDataPtr - CreateFieldData(const DataType& type, const int64_t dim = 1); -}; - -} // namespace milvus::storage +struct BuildIndexInfo { + int64_t collection_id; + int64_t partition_id; + int64_t segment_id; + int64_t field_id; + milvus::DataType field_type; + int64_t index_id; + int64_t index_build_id; + int64_t index_version; + std::vector insert_files; + milvus::storage::StorageConfig storage_config; + milvus::Config config; +}; \ No newline at end of file diff --git a/internal/core/src/common/Column.h b/internal/core/src/mmap/Column.h similarity index 72% rename from internal/core/src/common/Column.h rename to internal/core/src/mmap/Column.h index ff6d99c687..f0e0ff76d7 100644 --- a/internal/core/src/common/Column.h +++ b/internal/core/src/mmap/Column.h @@ -1,14 +1,18 @@ -// Copyright (C) 2019-2020 Zilliz. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // -// Unless required by applicable law or agreed to in writing, software distributed under the License -// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -// or implied. See the License for the specific language governing permissions and limitations under the License - +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
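Note: the mmap-based column types that follow consume a FieldDataInfo (added in mmap/Types.h further down) instead of a protobuf DataArray. A rough sketch of building a fixed-width column through that path, assuming CreateFieldData/FillFieldData behave as they are used elsewhere in this patch; the field is taken to be INT64 and the row values are placeholders.

#include <memory>
#include <string>
#include <vector>

#include "common/FieldMeta.h"
#include "mmap/Column.h"
#include "mmap/Types.h"
#include "storage/Util.h"

// Illustrative only: build an in-memory INT64 column from one FieldData chunk.
milvus::Column
MakeInt64Column(int64_t segment_id,
                int64_t field_id,
                const milvus::FieldMeta& field_meta) {
    std::vector<int64_t> rows = {1, 2, 3, 4};
    auto field_data =
        milvus::storage::CreateFieldData(milvus::DataType::INT64, /*dim=*/1);
    field_data->FillFieldData(rows.data(), rows.size());

    milvus::FieldDataInfo info;
    info.field_id = field_id;
    info.row_count = static_cast<int64_t>(rows.size());
    info.datas = {field_data};
    info.mmap_dir_path = "";  // empty: back the column with an anonymous mapping

    return milvus::Column(segment_id, field_meta, info);
}
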
#pragma once #include @@ -21,17 +25,9 @@ #include #include -#include "common/FieldMeta.h" -#include "common/LoadInfo.h" -#include "common/Span.h" -#include "common/Types.h" -#include "common/Utils.h" -#include "exceptions/EasyAssert.h" -#include "fmt/core.h" -#include "log/Log.h" -#include "nlohmann/json.hpp" +#include "mmap/Utils.h" -namespace milvus::segcore { +namespace milvus { struct Entry { char* data; @@ -79,7 +75,7 @@ class Column : public ColumnBase { public: Column(int64_t segment_id, const FieldMeta& field_meta, - const LoadFieldDataInfo& info) { + const FieldDataInfo& info) { data_ = static_cast(CreateMap(segment_id, field_meta, info)); size_ = field_meta.get_sizeof() * info.row_count; row_count_ = info.row_count; @@ -109,20 +105,13 @@ class VariableColumn : public ColumnBase { VariableColumn(int64_t segment_id, const FieldMeta& field_meta, - const LoadFieldDataInfo& info) { - auto begin = FIELD_DATA(info.field_data, string).begin(); - auto end = FIELD_DATA(info.field_data, string).end(); - if constexpr (std::is_same_v) { - begin = FIELD_DATA(info.field_data, json).begin(); - end = FIELD_DATA(info.field_data, json).end(); - } - - size_ = 0; + const FieldDataInfo& info) { indices_.reserve(info.row_count); - while (begin != end) { - indices_.push_back(size_); - size_ += begin->length(); - begin++; + for (auto data : info.datas) { + for (ssize_t idx = 0; idx < data->get_num_rows(); ++idx) { + indices_.emplace_back(size_); + size_ += data->Size(idx); + } } data_ = static_cast(CreateMap(segment_id, field_meta, info)); @@ -177,4 +166,4 @@ class VariableColumn : public ColumnBase { // Compatible with current Span type std::vector views_{}; }; -} // namespace milvus::segcore +} // namespace milvus diff --git a/internal/core/src/config/ConfigChunkManager.h b/internal/core/src/mmap/Types.h similarity index 76% rename from internal/core/src/config/ConfigChunkManager.h rename to internal/core/src/mmap/Types.h index f0cc57b858..7f540481ed 100644 --- a/internal/core/src/config/ConfigChunkManager.h +++ b/internal/core/src/mmap/Types.h @@ -13,17 +13,19 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #pragma once +#include #include +#include +#include "storage/FieldData.h" -namespace milvus::ChunkMangerConfig { +namespace milvus { -void -SetLocalRootPath(const std::string_view path_prefix); - -std::string -GetLocalRootPath(); - -} // namespace milvus::ChunkMangerConfig +struct FieldDataInfo { + int64_t field_id; + int64_t row_count; + std::vector datas; + std::string mmap_dir_path; +}; +} // namespace milvus diff --git a/internal/core/src/mmap/Utils.h b/internal/core/src/mmap/Utils.h new file mode 100644 index 0000000000..3d34ee495e --- /dev/null +++ b/internal/core/src/mmap/Utils.h @@ -0,0 +1,232 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/FieldMeta.h" +#include "mmap/Types.h" +#include "storage/Util.h" + +namespace milvus { + +inline size_t +GetDataSize(const std::vector& datas) { + size_t total_size{0}; + for (auto data : datas) { + total_size += data->Size(); + } + + return total_size; +} + +inline void* +FillField(DataType data_type, const storage::FieldDataPtr data, void* dst) { + char* dest = reinterpret_cast(dst); + if (datatype_is_variable(data_type)) { + switch (data_type) { + case DataType::STRING: + case DataType::VARCHAR: { + for (ssize_t i = 0; i < data->get_num_rows(); ++i) { + auto str = + static_cast(data->RawValue(i)); + memcpy(dest, str->data(), str->size()); + dest += str->size(); + } + break; + } + case DataType::JSON: { + for (ssize_t i = 0; i < data->get_num_rows(); ++i) { + auto padded_string = + static_cast(data->RawValue(i))->data(); + memcpy(dest, padded_string.data(), padded_string.size()); + dest += padded_string.size(); + } + break; + } + default: + PanicInfo(fmt::format("not supported data type {}", + datatype_name(data_type))); + } + } else { + memcpy(dst, data->Data(), data->Size()); + dest += data->Size(); + } + + return dest; +} + +inline ssize_t +WriteFieldData(int fd, DataType data_type, const storage::FieldDataPtr data) { + ssize_t total_written{0}; + if (datatype_is_variable(data_type)) { + switch (data_type) { + case DataType::VARCHAR: + case DataType::STRING: { + for (ssize_t i = 0; i < data->get_num_rows(); ++i) { + auto str = + static_cast(data->RawValue(i)); + ssize_t written = write(fd, str->data(), str->size()); + if (written < str->size()) { + break; + } + total_written += written; + } + break; + } + case DataType::JSON: { + for (ssize_t i = 0; i < data->get_num_rows(); ++i) { + auto padded_string = + static_cast(data->RawValue(i))->data(); + ssize_t written = + write(fd, padded_string.data(), padded_string.size()); + if (written < padded_string.size()) { + break; + } + total_written += written; + } + break; + } + default: + PanicInfo(fmt::format("not supported data type {}", + datatype_name(data_type))); + } + } else { + total_written += write(fd, data->Data(), data->Size()); + } + + return total_written; +} + +// CreateMap creates a memory mapping, +// if mmap enabled, this writes field data to disk and create a map to the file, +// otherwise this just alloc memory +inline void* +CreateMap(int64_t segment_id, + const FieldMeta& field_meta, + const FieldDataInfo& info) { + static int mmap_flags = MAP_PRIVATE; +#ifdef MAP_POPULATE + // macOS doesn't support MAP_POPULATE + mmap_flags |= MAP_POPULATE; +#endif + + // simdjson requires a padding following the json data + size_t padding = field_meta.get_data_type() == DataType::JSON + ? 
simdjson::SIMDJSON_PADDING + : 0; + auto data_size = GetDataSize(info.datas); + // Allocate memory + if (info.mmap_dir_path.empty()) { + auto data_type = field_meta.get_data_type(); + if (data_size == 0) + return nullptr; + + // Use anon mapping so we are able to free these memory with munmap only + void* map = mmap(nullptr, + data_size + padding, + PROT_READ | PROT_WRITE, + mmap_flags | MAP_ANON, + -1, + 0); + AssertInfo( + map != MAP_FAILED, + fmt::format("failed to create anon map, err: {}", strerror(errno))); + auto dst = map; + for (auto data : info.datas) { + dst = FillField(data_type, data, dst); + } + return map; + } + + auto filepath = std::filesystem::path(info.mmap_dir_path) / + std::to_string(segment_id) / std::to_string(info.field_id); + auto dir = filepath.parent_path(); + std::filesystem::create_directories(dir); + + int fd = + open(filepath.c_str(), O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR); + AssertInfo(fd != -1, + fmt::format("failed to create mmap file {}", filepath.c_str())); + + auto data_type = field_meta.get_data_type(); + ssize_t total_written{0}; + for (auto data : info.datas) { + auto written = WriteFieldData(fd, data_type, data); + if (written != data->Size()) { + break; + } + total_written += written; + } + AssertInfo( + total_written == data_size || + total_written != -1 && + datatype_is_variable(field_meta.get_data_type()), + fmt::format( + "failed to write data file {}, written {} but total {}, err: {}", + filepath.c_str(), + total_written, + data_size, + strerror(errno))); + int ok = fsync(fd); + AssertInfo(ok == 0, + fmt::format("failed to fsync mmap data file {}, err: {}", + filepath.c_str(), + strerror(errno))); + + // Empty field + if (total_written == 0) { + return nullptr; + } + + auto map = + mmap(nullptr, total_written + padding, PROT_READ, mmap_flags, fd, 0); + AssertInfo(map != MAP_FAILED, + fmt::format("failed to create map for data file {}, err: {}", + filepath.c_str(), + strerror(errno))); + +#ifndef MAP_POPULATE + // Manually access the mapping to populate it + const size_t page_size = getpagesize(); + char* begin = (char*)map; + char* end = begin + total_written; + for (char* page = begin; page < end; page += page_size) { + char value = page[0]; + } +#endif + // unlink this data file so + // then it will be auto removed after we don't need it again + ok = unlink(filepath.c_str()); + AssertInfo(ok == 0, + fmt::format("failed to unlink mmap data file {}, err: {}", + filepath.c_str(), + strerror(errno))); + ok = close(fd); + AssertInfo(ok == 0, + fmt::format("failed to close data file {}, err: {}", + filepath.c_str(), + strerror(errno))); + return map; +} +} // namespace milvus diff --git a/internal/core/src/segcore/CMakeLists.txt b/internal/core/src/segcore/CMakeLists.txt index d32773f58a..af04c11d6c 100644 --- a/internal/core/src/segcore/CMakeLists.txt +++ b/internal/core/src/segcore/CMakeLists.txt @@ -30,6 +30,7 @@ set(SEGCORE_FILES plan_c.cpp reduce_c.cpp load_index_c.cpp + load_field_data_c.cpp SegmentInterface.cpp SegcoreConfig.cpp IndexConfigGenerator.cpp diff --git a/internal/core/src/segcore/ConcurrentVector.cpp b/internal/core/src/segcore/ConcurrentVector.cpp index 9cdb714c7e..9ddbd1c085 100644 --- a/internal/core/src/segcore/ConcurrentVector.cpp +++ b/internal/core/src/segcore/ConcurrentVector.cpp @@ -91,80 +91,4 @@ VectorBase::set_data_raw(ssize_t element_offset, } } -void -VectorBase::fill_chunk_data(ssize_t element_count, - const DataArray* data, - const FieldMeta& field_meta) { - if (field_meta.is_vector()) { - if 
(field_meta.get_data_type() == DataType::VECTOR_FLOAT) { - return fill_chunk_data(VEC_FIELD_DATA(data, float).data(), - element_count); - } else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) { - return fill_chunk_data(VEC_FIELD_DATA(data, binary), element_count); - } else { - PanicInfo("unsupported"); - } - } - - switch (field_meta.get_data_type()) { - case DataType::BOOL: { - return fill_chunk_data(FIELD_DATA(data, bool).data(), - element_count); - } - case DataType::INT8: { - auto& src_data = FIELD_DATA(data, int); - std::vector data_raw(src_data.size()); - std::copy_n(src_data.data(), src_data.size(), data_raw.data()); - return fill_chunk_data(data_raw.data(), element_count); - } - case DataType::INT16: { - auto& src_data = FIELD_DATA(data, int); - std::vector data_raw(src_data.size()); - std::copy_n(src_data.data(), src_data.size(), data_raw.data()); - return fill_chunk_data(data_raw.data(), element_count); - } - case DataType::INT32: { - return fill_chunk_data(FIELD_DATA(data, int).data(), element_count); - } - case DataType::INT64: { - return fill_chunk_data(FIELD_DATA(data, long).data(), - element_count); - } - case DataType::FLOAT: { - return fill_chunk_data(FIELD_DATA(data, float).data(), - element_count); - } - case DataType::DOUBLE: { - return fill_chunk_data(FIELD_DATA(data, double).data(), - element_count); - } - case DataType::VARCHAR: { - auto vec = static_cast*>(this); - auto count = FIELD_DATA(data, string).size(); - vec->grow_on_demand(count); - auto& chunk = vec->get_chunk(0); - - size_t index = 0; - for (auto& str : FIELD_DATA(data, string)) { - chunk[index++] = str; - } - return; - } - case DataType::JSON: { - auto vec = static_cast*>(this); - auto count = FIELD_DATA(data, json).size(); - vec->grow_on_demand(count); - auto& chunk = vec->get_chunk(0); - - size_t index = 0; - for (auto& str : FIELD_DATA(data, json)) { - chunk[index++] = Json(simdjson::padded_string(str)); - } - return; - } - default: { - PanicInfo("unsupported"); - } - } -} } // namespace milvus::segcore diff --git a/internal/core/src/segcore/ConcurrentVector.h b/internal/core/src/segcore/ConcurrentVector.h index 857e4b5f99..d4af000cc7 100644 --- a/internal/core/src/segcore/ConcurrentVector.h +++ b/internal/core/src/segcore/ConcurrentVector.h @@ -31,6 +31,7 @@ #include "common/Types.h" #include "common/Utils.h" #include "exceptions/EasyAssert.h" +#include "storage/FieldData.h" namespace milvus::segcore { @@ -100,6 +101,10 @@ class VectorBase { const void* source, ssize_t element_count) = 0; + virtual void + set_data_raw(ssize_t element_offset, + const std::vector& data) = 0; + void set_data_raw(ssize_t element_offset, ssize_t element_count, @@ -107,12 +112,7 @@ class VectorBase { const FieldMeta& field_meta); virtual void - fill_chunk_data(const void* source, ssize_t element_count) = 0; - - void - fill_chunk_data(ssize_t element_count, - const DataArray* data, - const FieldMeta& field_meta); + fill_chunk_data(const std::vector& data) = 0; virtual SpanBase get_span_base(int64_t chunk_id) const = 0; @@ -196,13 +196,32 @@ class ConcurrentVectorImpl : public VectorBase { } void - fill_chunk_data(const void* source, ssize_t element_count) override { - if (element_count == 0) { - return; - } + fill_chunk_data(const std::vector& datas) + override { // used only for sealed segment AssertInfo(chunks_.size() == 0, "no empty concurrent vector"); + + int64_t element_count = 0; + for (auto& field_data : datas) { + element_count += field_data->get_num_rows(); + } chunks_.emplace_to_at_least(1, Dim * 
element_count); - set_data(0, static_cast(source), element_count); + int64_t offset = 0; + for (auto& field_data : datas) { + auto num_rows = field_data->get_num_rows(); + set_data( + offset, static_cast(field_data->Data()), num_rows); + offset += num_rows; + } + } + + void + set_data_raw(ssize_t element_offset, + const std::vector& datas) override { + for (auto& field_data : datas) { + auto num_rows = field_data->get_num_rows(); + set_data_raw(element_offset, field_data->Data(), num_rows); + element_offset += num_rows; + } } void diff --git a/internal/core/src/segcore/FieldIndexing.h b/internal/core/src/segcore/FieldIndexing.h index c20f674e97..10752f432f 100644 --- a/internal/core/src/segcore/FieldIndexing.h +++ b/internal/core/src/segcore/FieldIndexing.h @@ -276,6 +276,27 @@ class IndexingRecord { } } + // concurrent, reentrant + template + void + AppendingIndex(int64_t reserved_offset, + int64_t size, + FieldId fieldId, + const storage::FieldDataPtr data, + const InsertRecord& record) { + if (is_in(fieldId)) { + auto& indexing = field_indexings_.at(fieldId); + if (indexing->get_field_meta().is_vector() && + indexing->get_field_meta().get_data_type() == + DataType::VECTOR_FLOAT && + reserved_offset + size >= indexing->get_build_threshold()) { + auto vec_base = record.get_field_data_base(fieldId); + indexing->AppendSegmentIndex( + reserved_offset, size, vec_base, data->Data()); + } + } + } + void GetDataFromIndex(FieldId fieldId, const int64_t* seg_offsets, diff --git a/internal/core/src/segcore/InsertRecord.h b/internal/core/src/segcore/InsertRecord.h index b1725166a8..7bb1934b28 100644 --- a/internal/core/src/segcore/InsertRecord.h +++ b/internal/core/src/segcore/InsertRecord.h @@ -247,6 +247,37 @@ struct InsertRecord { return res_offsets; } + void + insert_pks(const std::vector& field_datas) { + std::lock_guard lck(shared_mutex_); + int64_t offset = 0; + for (auto& data : field_datas) { + int64_t row_count = data->get_num_rows(); + auto data_type = data->get_data_type(); + switch (data_type) { + case DataType::INT64: { + for (int i = 0; i < row_count; ++i) { + pk2offset_->insert( + *static_cast(data->RawValue(i)), + offset++); + } + break; + } + case DataType::VARCHAR: { + for (int i = 0; i < row_count; ++i) { + pk2offset_->insert( + *static_cast(data->RawValue(i)), + offset++); + } + break; + } + default: { + PanicInfo("unsupported primary key data type"); + } + } + } + } + std::vector search_pk(const PkType& pk, int64_t insert_barrier) const { std::shared_lock lck(shared_mutex_); diff --git a/internal/core/src/segcore/SegmentGrowingImpl.cpp b/internal/core/src/segcore/SegmentGrowingImpl.cpp index c4a769151e..5f031c7627 100644 --- a/internal/core/src/segcore/SegmentGrowingImpl.cpp +++ b/internal/core/src/segcore/SegmentGrowingImpl.cpp @@ -23,6 +23,8 @@ #include "query/SearchOnSealed.h" #include "segcore/SegmentGrowingImpl.h" #include "segcore/Utils.h" +#include "storage/RemoteChunkManagerSingleton.h" +#include "storage/Util.h" namespace milvus::segcore { @@ -112,6 +114,77 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset, reserved_offset + size); } +void +SegmentGrowingImpl::LoadFieldData(const LoadFieldDataInfo& infos) { + // schema don't include system field + AssertInfo(infos.field_infos.size() == schema_->size() + 2, + "lost some field data when load for growing segment"); + AssertInfo(infos.field_infos.find(TimestampFieldID.get()) != + infos.field_infos.end(), + "timestamps field data should be included"); + AssertInfo( + infos.field_infos.find(RowFieldID.get()) != 
infos.field_infos.end(), + "rowID field data should be included"); + auto primary_field_id = + schema_->get_primary_field_id().value_or(FieldId(-1)); + AssertInfo(primary_field_id.get() != INVALID_FIELD_ID, "Primary key is -1"); + AssertInfo(infos.field_infos.find(primary_field_id.get()) != + infos.field_infos.end(), + "primary field data should be included"); + + int64_t num_rows = 0; + for (auto& field : infos.field_infos) { + num_rows = field.second.row_count; + break; + } + auto reserved_offset = PreInsert(num_rows); + for (auto& [id, info] : infos.field_infos) { + auto field_id = FieldId(id); + auto insert_files = info.insert_files; + auto field_datas = LoadFieldDatasFromRemote(insert_files); + AssertInfo( + num_rows == storage::GetTotalNumRowsForFieldDatas(field_datas), + "inconsistent num row between multi fields"); + + if (field_id == TimestampFieldID) { + // step 2: sort timestamp + // query node already guarantees that the timestamp is ordered, avoid field data copy in c++ + + // step 3: fill into Segment.ConcurrentVector + insert_record_.timestamps_.set_data_raw(reserved_offset, + field_datas); + continue; + } + + if (field_id == RowFieldID) { + insert_record_.row_ids_.set_data_raw(reserved_offset, field_datas); + continue; + } + + if (!indexing_record_.SyncDataWithIndex(field_id)) { + insert_record_.get_field_data_base(field_id)->set_data_raw( + reserved_offset, field_datas); + } + if (segcore_config_.get_enable_growing_segment_index()) { + auto offset = reserved_offset; + for (auto data : field_datas) { + auto row_count = data->get_num_rows(); + indexing_record_.AppendingIndex( + offset, row_count, field_id, data, insert_record_); + offset += row_count; + } + } + + if (field_id == primary_field_id) { + insert_record_.insert_pks(field_datas); + } + } + + // step 5: update small indexes + insert_record_.ack_responder_.AddSegment(reserved_offset, + reserved_offset + num_rows); +} + Status SegmentGrowingImpl::Delete(int64_t reserved_begin, int64_t size, diff --git a/internal/core/src/segcore/SegmentGrowingImpl.h b/internal/core/src/segcore/SegmentGrowingImpl.h index 2ed4b22ab4..ddaa357816 100644 --- a/internal/core/src/segcore/SegmentGrowingImpl.h +++ b/internal/core/src/segcore/SegmentGrowingImpl.h @@ -62,6 +62,9 @@ class SegmentGrowingImpl : public SegmentGrowing { void LoadDeletedRecord(const LoadDeletedRecordInfo& info) override; + void + LoadFieldData(const LoadFieldDataInfo& info) override; + std::string debug() const override; diff --git a/internal/core/src/segcore/SegmentInterface.h b/internal/core/src/segcore/SegmentInterface.h index da3dd0e1da..f7885daaf1 100644 --- a/internal/core/src/segcore/SegmentInterface.h +++ b/internal/core/src/segcore/SegmentInterface.h @@ -83,6 +83,9 @@ class SegmentInterface { virtual void LoadDeletedRecord(const LoadDeletedRecordInfo& info) = 0; + virtual void + LoadFieldData(const LoadFieldDataInfo& info) = 0; + virtual int64_t get_segment_id() const = 0; diff --git a/internal/core/src/segcore/SegmentSealed.h b/internal/core/src/segcore/SegmentSealed.h index 70c831704e..8239bfb597 100644 --- a/internal/core/src/segcore/SegmentSealed.h +++ b/internal/core/src/segcore/SegmentSealed.h @@ -18,6 +18,7 @@ #include "pb/segcore.pb.h" #include "segcore/SegmentInterface.h" #include "segcore/Types.h" +#include "mmap/Column.h" namespace milvus::segcore { @@ -28,11 +29,11 @@ class SegmentSealed : public SegmentInternalInterface { virtual void LoadSegmentMeta(const milvus::proto::segcore::LoadSegmentMeta& meta) = 0; virtual void - LoadFieldData(const 
LoadFieldDataInfo& info) = 0; - virtual void DropIndex(const FieldId field_id) = 0; virtual void DropFieldData(const FieldId field_id) = 0; + virtual void + LoadFieldData(FieldId field_id, const FieldDataInfo& data_info) = 0; SegmentType type() const override { diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index 60fb81a427..97ccae7894 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -21,7 +21,7 @@ #include "Utils.h" #include "Types.h" -#include "common/Column.h" +#include "mmap/Column.h" #include "common/Consts.h" #include "common/FieldMeta.h" #include "common/Types.h" @@ -29,7 +29,7 @@ #include "query/ScalarIndex.h" #include "query/SearchBruteForce.h" #include "query/SearchOnSealed.h" -#include "index/Utils.h" +#include "storage/Util.h" namespace milvus::segcore { @@ -166,52 +166,73 @@ SegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) { } void -SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) { +SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& load_info) { // print(info); // NOTE: lock only when data is ready to avoid starvation - AssertInfo(info.row_count > 0, "The row count of field data is 0"); - auto field_id = FieldId(info.field_id); - AssertInfo(info.field_data != nullptr, "Field info blob is null"); - auto size = info.row_count; + // only one field for now, parallel load field data in golang + for (auto& [id, info] : load_info.field_infos) { + AssertInfo(info.row_count > 0, "The row count of field data is 0"); + auto field_id = FieldId(id); + auto insert_files = info.insert_files; + auto field_datas = LoadFieldDatasFromRemote(insert_files); + int64_t num_rows = storage::GetTotalNumRowsForFieldDatas(field_datas); + AssertInfo(num_rows == info.row_count, + "inconsistent field data row count with meta"); + auto field_data_info = FieldDataInfo{ + field_id.get(), num_rows, field_datas, load_info.mmap_dir_path}; + LoadFieldData(field_id, field_data_info); + } +} + +void +SegmentSealedImpl::LoadFieldData(FieldId field_id, + const FieldDataInfo& data_info) { + auto num_rows = data_info.row_count; if (row_count_opt_.has_value()) { - AssertInfo( - row_count_opt_.value() == size, - fmt::format( - "field {} has different row count {} to other column's {}", - field_id.get(), - size, - row_count_opt_.value())); + AssertInfo(row_count_opt_.value() == num_rows, + "field (" + std::to_string(field_id.get()) + + ") data has different row count (" + + std::to_string(num_rows) + + ") than other column's row count (" + + std::to_string(row_count_opt_.value()) + ")"); } if (SystemProperty::Instance().IsSystem(field_id)) { auto system_field_type = SystemProperty::Instance().GetSystemFieldType(field_id); if (system_field_type == SystemFieldType::Timestamp) { - auto timestamps = reinterpret_cast( - FIELD_DATA(info.field_data, long).data()); + std::vector timestamps(num_rows); + int64_t offset = 0; + for (auto& data : data_info.datas) { + int64_t row_count = data->get_num_rows(); + std::copy_n(static_cast(data->Data()), + row_count, + timestamps.data() + offset); + offset += row_count; + } TimestampIndex index; - auto min_slice_length = size < 4096 ? 1 : 4096; - auto meta = GenerateFakeSlices(timestamps, size, min_slice_length); + auto min_slice_length = num_rows < 4096 ? 
1 : 4096; + auto meta = GenerateFakeSlices( + timestamps.data(), num_rows, min_slice_length); index.set_length_meta(std::move(meta)); - index.build_with(timestamps, size); + // todo ::opt to avoid copy timestamps from field data + index.build_with(timestamps.data(), num_rows); // use special index std::unique_lock lck(mutex_); AssertInfo(insert_record_.timestamps_.empty(), "already exists"); - insert_record_.timestamps_.fill_chunk_data(timestamps, size); + insert_record_.timestamps_.fill_chunk_data(data_info.datas); insert_record_.timestamp_index_ = std::move(index); AssertInfo(insert_record_.timestamps_.num_chunk() == 1, "num chunk not equal to 1 for sealed segment"); } else { AssertInfo(system_field_type == SystemFieldType::RowId, "System field type of id column is not RowId"); - auto row_ids = reinterpret_cast( - FIELD_DATA(info.field_data, long).data()); // write data under lock std::unique_lock lck(mutex_); AssertInfo(insert_record_.row_ids_.empty(), "already exists"); - insert_record_.row_ids_.fill_chunk_data(row_ids, size); + insert_record_.row_ids_.fill_chunk_data(data_info.datas); AssertInfo(insert_record_.row_ids_.num_chunk() == 1, "num chunk not equal to 1 for sealed segment"); } @@ -220,36 +241,33 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) { // prepare data auto& field_meta = (*schema_)[field_id]; auto data_type = field_meta.get_data_type(); - AssertInfo(data_type == DataType(info.field_data->type()), - "field type of load data is inconsistent with the schema"); // Don't allow raw data and index exist at the same time AssertInfo(!get_bit(index_ready_bitset_, field_id), "field data can't be loaded when indexing exists"); - size_t size = 0; if (datatype_is_variable(data_type)) { std::unique_ptr column{}; switch (data_type) { case milvus::DataType::STRING: case milvus::DataType::VARCHAR: { column = std::make_unique>( - get_segment_id(), field_meta, info); + get_segment_id(), field_meta, data_info); break; } case milvus::DataType::JSON: { column = std::make_unique>( - get_segment_id(), field_meta, info); + get_segment_id(), field_meta, data_info); } default: { } } - size = column->size(); + std::unique_lock lck(mutex_); variable_fields_.emplace(field_id, std::move(column)); } else { - auto column = Column(get_segment_id(), field_meta, info); - size = column.size(); + auto column = Column(get_segment_id(), field_meta, data_info); + std::unique_lock lck(mutex_); fixed_fields_.emplace(field_id, std::move(column)); } @@ -258,19 +276,15 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) { if (schema_->get_primary_field_id() == field_id) { AssertInfo(field_id.get() != -1, "Primary key is -1"); AssertInfo(insert_record_.empty_pks(), "already exists"); - std::vector pks(info.row_count); - ParsePksFromFieldData(pks, *info.field_data); - - for (int i = 0; i < info.row_count; ++i) { - insert_record_.insert_pk(pks[i], i); - } + insert_record_.insert_pks(data_info.datas); insert_record_.seal_pks(); } + std::unique_lock lck(mutex_); set_bit(field_data_ready_bitset_, field_id, true); } std::unique_lock lck(mutex_); - update_row_count(info.row_count); + update_row_count(num_rows); } void diff --git a/internal/core/src/segcore/SegmentSealedImpl.h b/internal/core/src/segcore/SegmentSealedImpl.h index 947b10db27..9fdd70d6f8 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.h +++ b/internal/core/src/segcore/SegmentSealedImpl.h @@ -28,7 +28,7 @@ #include "SealedIndexingRecord.h" #include "SegmentSealed.h" #include "TimestampIndex.h" -#include 
"common/Column.h" +#include "mmap/Column.h" #include "index/ScalarIndex.h" #include "sys/mman.h" @@ -55,6 +55,8 @@ class SegmentSealedImpl : public SegmentSealed { HasIndex(FieldId field_id) const override; bool HasFieldData(FieldId field_id) const override; + void + LoadFieldData(FieldId field_id, const FieldDataInfo& data_info) override; int64_t get_segment_id() const override { diff --git a/internal/core/src/segcore/Types.h b/internal/core/src/segcore/Types.h index 15dee7aeee..f3c85d67c2 100644 --- a/internal/core/src/segcore/Types.h +++ b/internal/core/src/segcore/Types.h @@ -41,7 +41,6 @@ struct LoadIndexInfo { std::map index_params; std::vector index_files; index::IndexBasePtr index; - storage::StorageConfig storage_config; }; } // namespace milvus::segcore diff --git a/internal/core/src/segcore/Utils.cpp b/internal/core/src/segcore/Utils.cpp index 41ec590c0f..77e63d0da7 100644 --- a/internal/core/src/segcore/Utils.cpp +++ b/internal/core/src/segcore/Utils.cpp @@ -12,8 +12,11 @@ #include "segcore/Utils.h" #include -#include "common/Utils.h" #include "index/ScalarIndex.h" +#include "storage/RemoteChunkManagerSingleton.h" +#include "common/Common.h" +#include "storage/Util.h" +#include "mmap/Utils.h" namespace milvus::segcore { @@ -37,6 +40,37 @@ ParsePksFromFieldData(std::vector& pks, const DataArray& data) { } } +void +ParsePksFromFieldData(DataType data_type, + std::vector& pks, + const std::vector& datas) { + int64_t offset = 0; + + for (auto& field_data : datas) { + AssertInfo(data_type == field_data->get_data_type(), + "inconsistent data type when parse pk from field data"); + int64_t row_count = field_data->get_num_rows(); + switch (data_type) { + case DataType::INT64: { + std::copy_n(static_cast(field_data->Data()), + row_count, + pks.data() + offset); + break; + } + case DataType::VARCHAR: { + std::copy_n(static_cast(field_data->Data()), + row_count, + pks.data() + offset); + break; + } + default: { + PanicInfo("unsupported"); + } + } + offset += row_count; + } +} + void ParsePksFromIDs(std::vector& pks, DataType data_type, @@ -509,5 +543,47 @@ ReverseDataFromIndex(const index::IndexBase* index, return data_array; } +// init segcore storage config first, and create default remote chunk manager +// segcore use default remote chunk manager to load data from minio/s3 +std::vector +LoadFieldDatasFromRemote(std::vector& remote_files) { + auto rcm = storage::RemoteChunkManagerSingleton::GetInstance() + .GetRemoteChunkManager(); + std::sort(remote_files.begin(), + remote_files.end(), + [](const std::string& a, const std::string& b) { + return std::stol(a.substr(a.find_last_of("/") + 1)) < + std::stol(b.substr(b.find_last_of("/") + 1)); + }); + + auto parallel_degree = + uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE); + std::vector batch_files; + std::vector field_datas; + + auto FetchRawData = [&]() { + auto raw_datas = GetObjectData(rcm.get(), batch_files); + for (auto& data : raw_datas) { + field_datas.emplace_back(data); + } + }; + + for (auto& file : remote_files) { + if (batch_files.size() >= parallel_degree) { + FetchRawData(); + batch_files.clear(); + } + + batch_files.emplace_back(file); + } + + if (batch_files.size() > 0) { + FetchRawData(); + } + + AssertInfo(field_datas.size() == remote_files.size(), + "inconsistent file num and raw data num!"); + return field_datas; +} } // namespace milvus::segcore diff --git a/internal/core/src/segcore/Utils.h b/internal/core/src/segcore/Utils.h index 0e1a8ccd2e..578cb3f1d7 100644 --- a/internal/core/src/segcore/Utils.h +++ 
b/internal/core/src/segcore/Utils.h @@ -28,6 +28,11 @@ namespace milvus::segcore { void ParsePksFromFieldData(std::vector& pks, const DataArray& data); +void +ParsePksFromFieldData(DataType data_type, + std::vector& pks, + const std::vector& datas); + void ParsePksFromIDs(std::vector& pks, DataType data_type, @@ -141,4 +146,7 @@ ReverseDataFromIndex(const index::IndexBase* index, int64_t count, const FieldMeta& field_meta); +std::vector +LoadFieldDatasFromRemote(std::vector& remote_files); + } // namespace milvus::segcore diff --git a/internal/core/src/segcore/load_field_data_c.cpp b/internal/core/src/segcore/load_field_data_c.cpp new file mode 100644 index 0000000000..882fe1cee5 --- /dev/null +++ b/internal/core/src/segcore/load_field_data_c.cpp @@ -0,0 +1,83 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "common/CGoHelper.h" +#include "common/LoadInfo.h" +#include "segcore/load_field_data_c.h" + +CStatus +NewLoadFieldDataInfo(CLoadFieldDataInfo* c_load_field_data_info) { + try { + auto load_field_data_info = std::make_unique(); + *c_load_field_data_info = load_field_data_info.release(); + return milvus::SuccessCStatus(); + } catch (std::exception& e) { + return milvus::FailureCStatus(UnexpectedError, e.what()); + } +} + +void +DeleteLoadFieldDataInfo(CLoadFieldDataInfo c_load_field_data_info) { + auto info = (LoadFieldDataInfo*)c_load_field_data_info; + delete info; +} + +CStatus +AppendLoadFieldInfo(CLoadFieldDataInfo c_load_field_data_info, + int64_t field_id, + int64_t row_count) { + try { + auto load_field_data_info = (LoadFieldDataInfo*)c_load_field_data_info; + auto iter = load_field_data_info->field_infos.find(field_id); + if (iter != load_field_data_info->field_infos.end()) { + throw std::runtime_error("append same field info multi times"); + } + FieldBinlogInfo binlog_info; + binlog_info.field_id = field_id; + binlog_info.row_count = row_count; + load_field_data_info->field_infos[field_id] = binlog_info; + return milvus::SuccessCStatus(); + } catch (std::exception& e) { + return milvus::FailureCStatus(UnexpectedError, e.what()); + } +} + +CStatus +AppendLoadFieldDataPath(CLoadFieldDataInfo c_load_field_data_info, + int64_t field_id, + const char* c_file_path) { + try { + auto load_field_data_info = (LoadFieldDataInfo*)c_load_field_data_info; + auto iter = load_field_data_info->field_infos.find(field_id); + std::string file_path(c_file_path); + if (iter == load_field_data_info->field_infos.end()) { + throw std::runtime_error("please append field info first"); + } + + load_field_data_info->field_infos[field_id].insert_files.emplace_back( + file_path); + return milvus::SuccessCStatus(); + } catch (std::exception& e) { + return milvus::FailureCStatus(UnexpectedError, e.what()); + } +} + +void +AppendMMapDirPath(CLoadFieldDataInfo 
c_load_field_data_info, + const char* c_dir_path) { + auto load_field_data_info = (LoadFieldDataInfo*)c_load_field_data_info; + load_field_data_info->mmap_dir_path = std::string(c_dir_path); +} diff --git a/internal/core/src/segcore/load_field_data_c.h b/internal/core/src/segcore/load_field_data_c.h new file mode 100644 index 0000000000..5133da222c --- /dev/null +++ b/internal/core/src/segcore/load_field_data_c.h @@ -0,0 +1,50 @@ + +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include "common/type_c.h" + +typedef void* CLoadFieldDataInfo; + +CStatus +NewLoadFieldDataInfo(CLoadFieldDataInfo* c_load_field_data_info); + +void +DeleteLoadFieldDataInfo(CLoadFieldDataInfo c_load_field_data_info); + +CStatus +AppendLoadFieldInfo(CLoadFieldDataInfo c_load_field_data_info, + int64_t field_id, + int64_t row_count); + +CStatus +AppendLoadFieldDataPath(CLoadFieldDataInfo c_load_field_data_info, + int64_t field_id, + const char* file_path); + +void +AppendMMapDirPath(CLoadFieldDataInfo c_load_field_data_info, + const char* dir_path); + +#ifdef __cplusplus +} +#endif diff --git a/internal/core/src/segcore/load_index_c.cpp b/internal/core/src/segcore/load_index_c.cpp index 3778189f1d..e39bcd20e0 100644 --- a/internal/core/src/segcore/load_index_c.cpp +++ b/internal/core/src/segcore/load_index_c.cpp @@ -11,36 +11,20 @@ #include "segcore/load_index_c.h" -#include "common/CDataType.h" #include "common/FieldMeta.h" -#include "common/Utils.h" #include "index/IndexFactory.h" #include "index/Meta.h" #include "index/Utils.h" #include "segcore/Types.h" #include "storage/Util.h" +#include "storage/RemoteChunkManagerSingleton.h" +#include "storage/LocalChunkManagerSingleton.h" CStatus -NewLoadIndexInfo(CLoadIndexInfo* c_load_index_info, - CStorageConfig c_storage_config) { +NewLoadIndexInfo(CLoadIndexInfo* c_load_index_info) { try { auto load_index_info = std::make_unique(); - auto& storage_config = load_index_info->storage_config; - storage_config.address = std::string(c_storage_config.address); - storage_config.bucket_name = std::string(c_storage_config.bucket_name); - storage_config.access_key_id = - std::string(c_storage_config.access_key_id); - storage_config.access_key_value = - std::string(c_storage_config.access_key_value); - storage_config.remote_root_path = - std::string(c_storage_config.remote_root_path); - storage_config.storage_type = - std::string(c_storage_config.storage_type); - storage_config.iam_endpoint = - std::string(c_storage_config.iam_endpoint); - storage_config.useSSL = c_storage_config.useSSL; - storage_config.useIAM = c_storage_config.useIAM; *c_load_index_info = load_index_info.release(); auto status = CStatus(); @@ -143,11 +127,15 @@ appendVecIndex(CLoadIndexInfo c_load_index_info, 
CBinarySet c_binary_set) { load_index_info->field_id, load_index_info->index_build_id, load_index_info->index_version}; + auto remote_chunk_manager = + milvus::storage::RemoteChunkManagerSingleton::GetInstance() + .GetRemoteChunkManager(); auto file_manager = milvus::storage::CreateFileManager(index_info.index_type, field_meta, index_meta, - load_index_info->storage_config); + remote_chunk_manager); + AssertInfo(file_manager != nullptr, "create file manager failed!"); auto config = milvus::index::ParseConfigFromIndexParams( load_index_info->index_params); @@ -212,6 +200,69 @@ AppendIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) { return appendScalarIndex(c_load_index_info, c_binary_set); } +CStatus +AppendIndexV2(CLoadIndexInfo c_load_index_info) { + try { + auto load_index_info = + (milvus::segcore::LoadIndexInfo*)c_load_index_info; + auto& index_params = load_index_info->index_params; + auto field_type = load_index_info->field_type; + + milvus::index::CreateIndexInfo index_info; + index_info.field_type = load_index_info->field_type; + + // get index type + AssertInfo(index_params.find("index_type") != index_params.end(), + "index type is empty"); + index_info.index_type = index_params.at("index_type"); + + // get metric type + if (milvus::datatype_is_vector(field_type)) { + AssertInfo(index_params.find("metric_type") != index_params.end(), + "metric type is empty for vector index"); + index_info.metric_type = index_params.at("metric_type"); + } + + // init file manager + milvus::storage::FieldDataMeta field_meta{ + load_index_info->collection_id, + load_index_info->partition_id, + load_index_info->segment_id, + load_index_info->field_id}; + milvus::storage::IndexMeta index_meta{load_index_info->segment_id, + load_index_info->field_id, + load_index_info->index_build_id, + load_index_info->index_version}; + auto remote_chunk_manager = + milvus::storage::RemoteChunkManagerSingleton::GetInstance() + .GetRemoteChunkManager(); + auto file_manager = + milvus::storage::CreateFileManager(index_info.index_type, + field_meta, + index_meta, + remote_chunk_manager); + AssertInfo(file_manager != nullptr, "create file manager failed!"); + + auto config = milvus::index::ParseConfigFromIndexParams( + load_index_info->index_params); + config["index_files"] = load_index_info->index_files; + + load_index_info->index = + milvus::index::IndexFactory::GetInstance().CreateIndex( + index_info, file_manager); + load_index_info->index->Load(config); + auto status = CStatus(); + status.error_code = Success; + status.error_msg = ""; + return status; + } catch (std::exception& e) { + auto status = CStatus(); + status.error_code = UnexpectedError; + status.error_msg = strdup(e.what()); + return status; + } +} + CStatus AppendIndexFilePath(CLoadIndexInfo c_load_index_info, const char* c_file_path) { try { @@ -261,12 +312,14 @@ CleanLoadedIndex(CLoadIndexInfo c_load_index_info) { try { auto load_index_info = (milvus::segcore::LoadIndexInfo*)c_load_index_info; - auto index_file_path_prefix = milvus::storage::GenLocalIndexPathPrefix( - load_index_info->index_build_id, load_index_info->index_version); -#ifdef BUILD_DISK_ANN - milvus::storage::LocalChunkManager::GetInstance().RemoveDir( - index_file_path_prefix); -#endif + auto local_chunk_manager = + milvus::storage::LocalChunkManagerSingleton::GetInstance() + .GetChunkManager(); + auto index_file_path_prefix = + milvus::storage::GenIndexPathPrefix(local_chunk_manager, + load_index_info->index_build_id, + load_index_info->index_version); + 
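Note on AppendIndexV2: the only keys it reads out of index_params directly are "index_type" (always) and "metric_type" (vector fields only), and it forwards the remote object keys through config["index_files"]. A hedged sketch of the state the caller is assumed to have placed into LoadIndexInfo beforehand; the existing Append* helpers that normally populate these fields are not shown in this patch, and all values below are illustrative.

load_index_info->field_type = milvus::DataType::VECTOR_FLOAT;
load_index_info->index_params["index_type"] = "IVF_FLAT";   // required
load_index_info->index_params["metric_type"] = "L2";        // required for vector fields only
load_index_info->index_files = {"index_data_0", "index_data_1"};  // remote keys, placeholders
// collection/partition/segment/field ids plus index_build_id and index_version must also be
// filled in so the file manager can resolve the remote index paths.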
local_chunk_manager->RemoveDir(index_file_path_prefix); auto status = CStatus(); status.error_code = Success; status.error_msg = ""; diff --git a/internal/core/src/segcore/load_index_c.h b/internal/core/src/segcore/load_index_c.h index 58ff7687df..224a08dbbd 100644 --- a/internal/core/src/segcore/load_index_c.h +++ b/internal/core/src/segcore/load_index_c.h @@ -24,8 +24,7 @@ extern "C" { typedef void* CLoadIndexInfo; CStatus -NewLoadIndexInfo(CLoadIndexInfo* c_load_index_info, - CStorageConfig c_storage_config); +NewLoadIndexInfo(CLoadIndexInfo* c_load_index_info); void DeleteLoadIndexInfo(CLoadIndexInfo c_load_index_info); @@ -55,6 +54,9 @@ AppendIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set); CStatus AppendIndexFilePath(CLoadIndexInfo c_load_index_info, const char* file_path); +CStatus +AppendIndexV2(CLoadIndexInfo c_load_index_info); + CStatus CleanLoadedIndex(CLoadIndexInfo c_load_index_info); diff --git a/internal/core/src/segcore/segment_c.cpp b/internal/core/src/segcore/segment_c.cpp index ff85dca509..3c3a13b12e 100644 --- a/internal/core/src/segcore/segment_c.cpp +++ b/internal/core/src/segcore/segment_c.cpp @@ -17,12 +17,12 @@ #include "common/Tracer.h" #include "common/type_c.h" #include "google/protobuf/text_format.h" -#include "index/IndexInfo.h" #include "log/Log.h" #include "segcore/Collection.h" #include "segcore/SegmentGrowingImpl.h" #include "segcore/SegmentSealedImpl.h" -#include "segcore/SegcoreConfig.h" +#include "storage/Util.h" +#include "mmap/Types.h" ////////////////////////////// common interfaces ////////////////////////////// CSegmentInterface @@ -228,22 +228,51 @@ Delete(CSegmentInterface c_segment, ////////////////////////////// interfaces for sealed segment ////////////////////////////// CStatus LoadFieldData(CSegmentInterface c_segment, - CLoadFieldDataInfo load_field_data_info) { + CLoadFieldDataInfo c_load_field_data_info) { + try { + auto segment = + reinterpret_cast(c_segment); + AssertInfo(segment != nullptr, "segment conversion failed"); + auto load_info = (LoadFieldDataInfo*)c_load_field_data_info; + segment->LoadFieldData(*load_info); + return milvus::SuccessCStatus(); + } catch (std::exception& e) { + return milvus::FailureCStatus(UnexpectedError, e.what()); + } +} + +// just for test +CStatus +LoadFieldRawData(CSegmentInterface c_segment, + int64_t field_id, + const void* data, + int64_t row_count) { try { auto segment_interface = reinterpret_cast(c_segment); auto segment = dynamic_cast(segment_interface); AssertInfo(segment != nullptr, "segment conversion failed"); - auto field_data = std::make_unique(); - auto suc = field_data->ParseFromArray(load_field_data_info.blob, - load_field_data_info.blob_size); - AssertInfo(suc, "unmarshal field data string failed"); - auto load_info = LoadFieldDataInfo{load_field_data_info.field_id, - field_data.get(), - load_field_data_info.row_count, - load_field_data_info.mmap_dir_path}; - segment->LoadFieldData(load_info); + milvus::DataType data_type; + int64_t dim = 1; + if (milvus::SystemProperty::Instance().IsSystem( + milvus::FieldId(field_id))) { + data_type = milvus::DataType::INT64; + } else { + auto field_meta = segment->get_schema()[milvus::FieldId(field_id)]; + data_type = field_meta.get_data_type(); + + if (milvus::datatype_is_vector(data_type)) { + dim = field_meta.get_dim(); + } + } + auto field_data = milvus::storage::CreateFieldData(data_type, dim); + field_data->FillFieldData(data, row_count); + auto field_data_info = milvus::FieldDataInfo{ + field_id, + row_count, + 
std::vector{field_data}}; + segment->LoadFieldData(milvus::FieldId(field_id), field_data_info); return milvus::SuccessCStatus(); } catch (std::exception& e) { return milvus::FailureCStatus(UnexpectedError, e.what()); diff --git a/internal/core/src/segcore/segment_c.h b/internal/core/src/segcore/segment_c.h index 97c4be382c..ec1348e1bb 100644 --- a/internal/core/src/segcore/segment_c.h +++ b/internal/core/src/segcore/segment_c.h @@ -22,6 +22,7 @@ extern "C" { #include "common/type_c.h" #include "segcore/plan_c.h" #include "segcore/load_index_c.h" +#include "segcore/load_field_data_c.h" typedef void* CSegmentInterface; typedef void* CSearchResult; @@ -88,6 +89,12 @@ CStatus LoadFieldData(CSegmentInterface c_segment, CLoadFieldDataInfo load_field_data_info); +CStatus +LoadFieldRawData(CSegmentInterface c_segment, + int64_t field_id, + const void* data, + int64_t row_count); + CStatus LoadDeletedRecord(CSegmentInterface c_segment, CLoadDeletedRecordInfo deleted_record_info); diff --git a/internal/core/src/storage/AliyunCredentialsProvider.cpp b/internal/core/src/storage/AliyunCredentialsProvider.cpp index 8736ffa4e8..f492c96db8 100644 --- a/internal/core/src/storage/AliyunCredentialsProvider.cpp +++ b/internal/core/src/storage/AliyunCredentialsProvider.cpp @@ -1,3 +1,14 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License + /** * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. * SPDX-License-Identifier: Apache-2.0. diff --git a/internal/core/src/storage/AliyunCredentialsProvider.h b/internal/core/src/storage/AliyunCredentialsProvider.h index 86b41fd28d..16f378c02e 100644 --- a/internal/core/src/storage/AliyunCredentialsProvider.h +++ b/internal/core/src/storage/AliyunCredentialsProvider.h @@ -1,3 +1,14 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License + /** * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. * SPDX-License-Identifier: Apache-2.0. diff --git a/internal/core/src/storage/AliyunSTSClient.cpp b/internal/core/src/storage/AliyunSTSClient.cpp index 0c29cd542b..cf54f9d4c9 100644 --- a/internal/core/src/storage/AliyunSTSClient.cpp +++ b/internal/core/src/storage/AliyunSTSClient.cpp @@ -1,3 +1,14 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. 
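LoadFieldRawData is marked test-only above; a sketch of how a unit test might call it, assuming a sealed segment handle and treating field id 101 as a hypothetical INT64 field.

std::vector<int64_t> values(1000, 0);  // placeholder rows
auto status = LoadFieldRawData(segment, /*field_id=*/101, values.data(),
                               static_cast<int64_t>(values.size()));
// status.error_code should be Success; for a vector field the same call expects
// row_count rows of dim elements laid out contiguously in `data`.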
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License + /** * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. * SPDX-License-Identifier: Apache-2.0. diff --git a/internal/core/src/storage/AliyunSTSClient.h b/internal/core/src/storage/AliyunSTSClient.h index 6cd343816b..942d4e7d77 100644 --- a/internal/core/src/storage/AliyunSTSClient.h +++ b/internal/core/src/storage/AliyunSTSClient.h @@ -1,3 +1,14 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License + /** * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. * SPDX-License-Identifier: Apache-2.0. diff --git a/internal/core/src/storage/BinlogReader.cpp b/internal/core/src/storage/BinlogReader.cpp index efc86f4652..fae1a65a90 100644 --- a/internal/core/src/storage/BinlogReader.cpp +++ b/internal/core/src/storage/BinlogReader.cpp @@ -37,8 +37,8 @@ BinlogReader::Read(int64_t nbytes) { Status(SERVER_UNEXPECTED_ERROR, "out range of binlog data"), nullptr); } - auto res = std::shared_ptr(new uint8_t[nbytes]); - std::memcpy(res.get(), data_.get() + tell_, nbytes); + auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction + auto res = std::shared_ptr(data_.get() + tell_, deleter); tell_ += nbytes; return std::make_pair(Status(SERVER_SUCCESS, ""), res); } diff --git a/internal/core/src/storage/BinlogReader.h b/internal/core/src/storage/BinlogReader.h index 99c36f0dd4..480362a6af 100644 --- a/internal/core/src/storage/BinlogReader.h +++ b/internal/core/src/storage/BinlogReader.h @@ -31,12 +31,6 @@ class BinlogReader { : data_(binlog_data), size_(length), tell_(0) { } - explicit BinlogReader(const uint8_t* binlog_data, int64_t length) - : size_(length), tell_(0) { - data_ = std::shared_ptr(new uint8_t[length]); - std::memcpy(data_.get(), binlog_data, length); - } - Status Read(int64_t nbytes, void* out); diff --git a/internal/core/src/storage/CMakeLists.txt b/internal/core/src/storage/CMakeLists.txt index 968d3c4ca6..30a8893a92 100644 --- a/internal/core/src/storage/CMakeLists.txt +++ b/internal/core/src/storage/CMakeLists.txt @@ -31,22 +31,17 @@ set(STORAGE_FILES PayloadReader.cpp PayloadWriter.cpp BinlogReader.cpp - FieldDataFactory.cpp - IndexData.cpp + IndexData.cpp InsertData.cpp Event.cpp ThreadPool.cpp - storage_c.cpp) - -if(BUILD_DISK_ANN STREQUAL "ON") - set(STORAGE_FILES - ${STORAGE_FILES} - LocalChunkManager.cpp - MinioChunkManager.cpp - AliyunSTSClient.cpp - AliyunCredentialsProvider.cpp - DiskFileManagerImpl.cpp) -endif() + storage_c.cpp + MinioChunkManager.cpp + AliyunSTSClient.cpp + AliyunCredentialsProvider.cpp + MemFileManagerImpl.cpp + LocalChunkManager.cpp + DiskFileManagerImpl.cpp) 
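Worth flagging on BinlogReader::Read: the no-op deleter makes the returned shared_ptr non-owning, so the slice is only valid while the reader's data_ buffer stays alive. If the slice should keep the buffer alive instead, the shared_ptr aliasing constructor gives the same zero-copy behaviour without that caveat; a sketch, assuming data_ is a std::shared_ptr<uint8_t[]>:

// shares ownership with data_ but points at the current read position
auto res = std::shared_ptr<uint8_t[]>(data_, data_.get() + tell_);
tell_ += nbytes;
return std::make_pair(Status(SERVER_SUCCESS, ""), res);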
add_library(milvus_storage SHARED ${STORAGE_FILES}) diff --git a/internal/core/src/storage/ChunkManager.h b/internal/core/src/storage/ChunkManager.h index 77b177d561..c80fef4c68 100644 --- a/internal/core/src/storage/ChunkManager.h +++ b/internal/core/src/storage/ChunkManager.h @@ -20,6 +20,7 @@ #include #include #include +#include namespace milvus::storage { @@ -112,23 +113,23 @@ class ChunkManager { */ virtual std::string GetName() const = 0; -}; -/** - * @brief RemoteChunkManager is responsible for read and write Remote file - * that inherited from ChunkManager. - */ - -class RemoteChunkManager : public ChunkManager { - public: - virtual ~RemoteChunkManager() { - } + /** + * @brief Get the Root Path + * @return std::string + */ virtual std::string - GetName() const { - return "RemoteChunkManager"; - } + GetRootPath() const = 0; }; -using RemoteChunkManagerPtr = std::unique_ptr; +using ChunkManagerPtr = std::shared_ptr; + +enum ChunkManagerType : int8_t { + None_CM = 0, + Local = 1, + Minio = 2, +}; + +extern std::map ChunkManagerType_Map; } // namespace milvus::storage diff --git a/internal/core/src/storage/DataCodec.cpp b/internal/core/src/storage/DataCodec.cpp index ad5c8c67c0..923e8ef57e 100644 --- a/internal/core/src/storage/DataCodec.cpp +++ b/internal/core/src/storage/DataCodec.cpp @@ -40,7 +40,7 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) { switch (header.event_type_) { case EventType::InsertEvent: { auto event_data_length = - header.event_length_ - header.next_position_; + header.event_length_ - GetEventHeaderSize(header); auto insert_event_data = InsertEventData(reader, event_data_length, data_type); auto insert_data = @@ -52,11 +52,26 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) { } case EventType::IndexFileEvent: { auto event_data_length = - header.event_length_ - header.next_position_; + header.event_length_ - GetEventHeaderSize(header); auto index_event_data = IndexEventData(reader, event_data_length, data_type); - auto index_data = - std::make_unique(index_event_data.field_data); + auto field_data = index_event_data.field_data; + // for compatible with golang indexcode.Serialize, which set dataType to String + if (data_type == DataType::STRING) { + AssertInfo(field_data->get_data_type() == DataType::STRING, + "wrong index type in index binlog file"); + AssertInfo( + field_data->get_num_rows() == 1, + "wrong length of string num in old index binlog file"); + auto new_field_data = CreateFieldData(DataType::INT8); + new_field_data->FillFieldData( + (*static_cast(field_data->RawValue(0))) + .c_str(), + field_data->Size()); + field_data = new_field_data; + } + + auto index_data = std::make_unique(field_data); index_data->SetFieldDataMeta(data_meta); IndexMeta index_meta; index_meta.segment_id = data_meta.segment_id; diff --git a/internal/core/src/storage/DiskFileManagerImpl.cpp b/internal/core/src/storage/DiskFileManagerImpl.cpp index 4cabd2a9ff..a6acc6e8e7 100644 --- a/internal/core/src/storage/DiskFileManagerImpl.cpp +++ b/internal/core/src/storage/DiskFileManagerImpl.cpp @@ -22,59 +22,28 @@ #include "common/Common.h" #include "common/Slice.h" #include "log/Log.h" -#include "config/ConfigKnowhere.h" + #include "storage/DiskFileManagerImpl.h" -#include "storage/LocalChunkManager.h" -#include "storage/MinioChunkManager.h" +#include "storage/LocalChunkManagerSingleton.h" #include "storage/Exception.h" -#include "storage/FieldData.h" #include "storage/IndexData.h" -#include "storage/ThreadPool.h" #include "storage/Util.h" -#include "storage/FieldDataFactory.h" - 
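On the DataCodec change: the payload length is now derived from the event header itself rather than from next_position_, which is an absolute offset into the file. A short illustration of the framing; the 17-byte header size is purely illustrative.

// on-disk event layout: [ header | event-specific data ]
// header.event_length_ == serialized header size + payload size, so on the read side:
auto header_size       = GetEventHeaderSize(header);          // e.g. 17 bytes (illustrative)
auto event_data_length = header.event_length_ - header_size;  // bytes handed to the event parser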
-#define FILEMANAGER_TRY try { -#define FILEMANAGER_CATCH \ - } \ - catch (LocalChunkManagerException & e) { \ - LOG_SEGCORE_ERROR_ << "LocalChunkManagerException:" << e.what(); \ - return false; \ - } \ - catch (MinioException & e) { \ - LOG_SEGCORE_ERROR_ << "milvus::storage::MinioException:" << e.what(); \ - return false; \ - } \ - catch (DiskANNFileManagerException & e) { \ - LOG_SEGCORE_ERROR_ << "milvus::storage::DiskANNFileManagerException:" \ - << e.what(); \ - return false; \ - } \ - catch (ArrowException & e) { \ - LOG_SEGCORE_ERROR_ << "milvus::storage::ArrowException:" << e.what(); \ - return false; \ - } \ - catch (std::exception & e) { \ - LOG_SEGCORE_ERROR_ << "Exception:" << e.what(); \ - return false; -#define FILEMANAGER_END } - -using ReadLock = std::shared_lock; -using WriteLock = std::lock_guard; +#include "storage/ThreadPool.h" namespace milvus::storage { -DiskFileManagerImpl::DiskFileManagerImpl(const FieldDataMeta& field_meta, +DiskFileManagerImpl::DiskFileManagerImpl(const FieldDataMeta& field_mata, IndexMeta index_meta, - const StorageConfig& storage_config) - : field_meta_(field_meta), index_meta_(std::move(index_meta)) { - remote_root_path_ = storage_config.remote_root_path; - rcm_ = std::make_unique(storage_config); + ChunkManagerPtr remote_chunk_manager) + : FileManagerImpl(field_mata, index_meta) { + rcm_ = remote_chunk_manager; } DiskFileManagerImpl::~DiskFileManagerImpl() { - auto& local_chunk_manager = LocalChunkManager::GetInstance(); - local_chunk_manager.RemoveDir( - GetLocalIndexPathPrefixWithBuildID(index_meta_.build_id)); + auto local_chunk_manager = + LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + local_chunk_manager->RemoveDir(GetIndexPathPrefixWithBuildID( + local_chunk_manager, index_meta_.build_id)); } bool @@ -82,38 +51,19 @@ DiskFileManagerImpl::LoadFile(const std::string& file) noexcept { return true; } -std::pair -EncodeAndUploadIndexSlice(RemoteChunkManager* remote_chunk_manager, - const std::string& file, - int64_t offset, - int64_t batch_size, - const IndexMeta& index_meta, - FieldDataMeta field_meta, - std::string object_key) { - auto& local_chunk_manager = LocalChunkManager::GetInstance(); - auto buf = std::unique_ptr(new uint8_t[batch_size]); - local_chunk_manager.Read(file, offset, buf.get(), batch_size); - - auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - DataType::INT8); - field_data->FillFieldData(buf.get(), batch_size); - auto indexData = std::make_shared(field_data); - indexData->set_index_meta(index_meta); - indexData->SetFieldDataMeta(field_meta); - auto serialized_index_data = indexData->serialize_to_remote_file(); - auto serialized_index_size = serialized_index_data.size(); - remote_chunk_manager->Write( - object_key, serialized_index_data.data(), serialized_index_size); - return std::make_pair(std::move(object_key), serialized_index_size); +std::string +DiskFileManagerImpl::GetRemoteIndexPath(const std::string& file_name, + int64_t slice_num) const { + auto remote_prefix = GetRemoteIndexObjectPrefix(); + return remote_prefix + "/" + file_name + "_" + std::to_string(slice_num); } bool DiskFileManagerImpl::AddFile(const std::string& file) noexcept { - auto& local_chunk_manager = LocalChunkManager::GetInstance(); - auto& pool = ThreadPool::GetInstance(); + auto local_chunk_manager = + LocalChunkManagerSingleton::GetInstance().GetChunkManager(); FILEMANAGER_TRY - if (!local_chunk_manager.Exist(file)) { + if (!local_chunk_manager->Exist(file)) { LOG_SEGCORE_ERROR_ << "local 
file: " << file << " does not exist "; return false; } @@ -122,15 +72,15 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept { local_paths_.emplace_back(file); auto fileName = GetFileName(file); - auto fileSize = local_chunk_manager.Size(file); + auto fileSize = local_chunk_manager->Size(file); std::vector batch_remote_files; std::vector remote_file_sizes; std::vector local_file_offsets; int slice_num = 0; - auto parallel_degree = uint64_t(DEFAULT_DISK_INDEX_MAX_MEMORY_LIMIT / - (index_file_slice_size << 20)); + auto parallel_degree = + uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE); for (int64_t offset = 0; offset < fileSize; slice_num++) { if (batch_remote_files.size() >= parallel_degree) { AddBatchIndexFiles(file, @@ -142,10 +92,9 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept { local_file_offsets.clear(); } - auto batch_size = - std::min(index_file_slice_size << 20, int64_t(fileSize) - offset); + auto batch_size = std::min(FILE_SLICE_SIZE, int64_t(fileSize) - offset); batch_remote_files.emplace_back( - GenerateRemoteIndexFile(fileName, slice_num)); + GetRemoteIndexPath(fileName, slice_num)); remote_file_sizes.emplace_back(batch_size); local_file_offsets.emplace_back(offset); offset += batch_size; @@ -166,35 +115,57 @@ DiskFileManagerImpl::AddBatchIndexFiles( const std::vector& local_file_offsets, const std::vector& remote_files, const std::vector& remote_file_sizes) { + auto local_chunk_manager = + LocalChunkManagerSingleton::GetInstance().GetChunkManager(); auto& pool = ThreadPool::GetInstance(); - std::vector>> futures; + auto LoadIndexFromDisk = [&]( + const std::string& file, + const int64_t offset, + const int64_t data_size) -> std::shared_ptr { + auto buf = std::shared_ptr(new uint8_t[data_size]); + local_chunk_manager->Read(file, offset, buf.get(), data_size); + return buf; + }; + + std::vector>> futures; AssertInfo(local_file_offsets.size() == remote_files.size(), "inconsistent size of offset slices with file slices"); AssertInfo(remote_files.size() == remote_file_sizes.size(), "inconsistent size of file slices with size slices"); for (int64_t i = 0; i < remote_files.size(); ++i) { - futures.push_back(pool.Submit(EncodeAndUploadIndexSlice, - rcm_.get(), + futures.push_back(pool.Submit(LoadIndexFromDisk, local_file_name, local_file_offsets[i], - remote_file_sizes[i], - index_meta_, - field_meta_, - remote_files[i])); + remote_file_sizes[i])); } + // hold index data util upload index file done + std::vector> index_datas; + std::vector data_slices; for (auto& future : futures) { auto res = future.get(); - remote_paths_to_size_[res.first] = res.second; + index_datas.emplace_back(res); + data_slices.emplace_back(res.get()); + } + + auto res = PutIndexData(rcm_.get(), + data_slices, + remote_file_sizes, + remote_files, + field_meta_, + index_meta_); + for (auto iter = res.begin(); iter != res.end(); ++iter) { + remote_paths_to_size_[iter->first] = iter->second; } } void DiskFileManagerImpl::CacheIndexToDisk( const std::vector& remote_files) { - auto& local_chunk_manager = LocalChunkManager::GetInstance(); + auto local_chunk_manager = + LocalChunkManagerSingleton::GetInstance().GetChunkManager(); std::map> index_slices; for (auto& file_path : remote_files) { @@ -209,7 +180,7 @@ DiskFileManagerImpl::CacheIndexToDisk( auto EstimateParallelDegree = [&](const std::string& file) -> uint64_t { auto fileSize = rcm_->Size(file); - return uint64_t(DEFAULT_DISK_INDEX_MAX_MEMORY_LIMIT / fileSize); + return uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / 
fileSize); }; for (auto& slices : index_slices) { @@ -217,7 +188,7 @@ DiskFileManagerImpl::CacheIndexToDisk( auto local_index_file_name = GetLocalIndexObjectPrefix() + prefix.substr(prefix.find_last_of('/') + 1); - local_chunk_manager.CreateFile(local_index_file_name); + local_chunk_manager->CreateFile(local_index_file_name); int64_t offset = 0; std::vector batch_remote_files; uint64_t max_parallel_degree = INT_MAX; @@ -245,72 +216,125 @@ DiskFileManagerImpl::CacheIndexToDisk( } } -std::unique_ptr -DownloadAndDecodeRemoteIndexfile(RemoteChunkManager* remote_chunk_manager, - const std::string& file) { - auto fileSize = remote_chunk_manager->Size(file); - auto buf = std::shared_ptr(new uint8_t[fileSize]); - remote_chunk_manager->Read(file, buf.get(), fileSize); - - return DeserializeFileData(buf, fileSize); -} - uint64_t DiskFileManagerImpl::CacheBatchIndexFilesToDisk( const std::vector& remote_files, const std::string& local_file_name, uint64_t local_file_init_offfset) { - auto& local_chunk_manager = LocalChunkManager::GetInstance(); - auto& pool = ThreadPool::GetInstance(); + auto local_chunk_manager = + LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + auto index_datas = GetObjectData(rcm_.get(), remote_files); int batch_size = remote_files.size(); - - std::vector>> futures; - for (int i = 0; i < batch_size; ++i) { - futures.push_back(pool.Submit( - DownloadAndDecodeRemoteIndexfile, rcm_.get(), remote_files[i])); - } + AssertInfo(index_datas.size() == batch_size, + "inconsistent file num and index data num!"); uint64_t offset = local_file_init_offfset; for (int i = 0; i < batch_size; ++i) { - auto res = futures[i].get(); - auto index_data = res->GetFieldData(); + auto index_data = index_datas[i]; auto index_size = index_data->Size(); - local_chunk_manager.Write( - local_file_name, - offset, - reinterpret_cast(const_cast(index_data->Data())), - index_size); + auto uint8_data = + reinterpret_cast(const_cast(index_data->Data())); + local_chunk_manager->Write( + local_file_name, offset, uint8_data, index_size); offset += index_size; } - return offset; } +std::string +DiskFileManagerImpl::CacheRawDataToDisk(std::vector remote_files) { + std::sort(remote_files.begin(), + remote_files.end(), + [](const std::string& a, const std::string& b) { + return std::stol(a.substr(a.find_last_of("/") + 1)) < + std::stol(b.substr(b.find_last_of("/") + 1)); + }); + + auto segment_id = GetFieldDataMeta().segment_id; + auto field_id = GetFieldDataMeta().field_id; + + auto local_chunk_manager = + LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + auto local_data_path = storage::GenFieldRawDataPathPrefix( + local_chunk_manager, segment_id, field_id) + + "raw_data"; + local_chunk_manager->CreateFile(local_data_path); + + // get batch raw data from s3 and write batch data to disk file + // TODO: load and write of different batches at the same time + std::vector batch_files; + + // file format + // num_rows(uint32) | dim(uint32) | index_data ([]uint8_t) + uint32_t num_rows = 0; + uint32_t dim = 0; + int64_t write_offset = sizeof(num_rows) + sizeof(dim); + + auto FetchRawData = [&]() { + auto field_datas = GetObjectData(rcm_.get(), batch_files); + int batch_size = batch_files.size(); + for (int i = 0; i < batch_size; ++i) { + auto field_data = field_datas[i]; + num_rows += uint32_t(field_data->get_num_rows()); + AssertInfo(dim == 0 || dim == field_data->get_dim(), + "inconsistent dim value in multi binlogs!"); + dim = field_data->get_dim(); + + auto data_size = field_data->get_num_rows() * 
dim * sizeof(float); + local_chunk_manager->Write(local_data_path, + write_offset, + const_cast(field_data->Data()), + data_size); + write_offset += data_size; + } + }; + + auto parallel_degree = + uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE); + for (auto& file : remote_files) { + if (batch_files.size() >= parallel_degree) { + FetchRawData(); + batch_files.clear(); + } + + batch_files.emplace_back(file); + } + + if (batch_files.size() > 0) { + FetchRawData(); + } + + // write num_rows and dim value to file header + write_offset = 0; + local_chunk_manager->Write( + local_data_path, write_offset, &num_rows, sizeof(num_rows)); + write_offset += sizeof(num_rows); + local_chunk_manager->Write( + local_data_path, write_offset, &dim, sizeof(dim)); + + return local_data_path; +} + std::string DiskFileManagerImpl::GetFileName(const std::string& localfile) { boost::filesystem::path localPath(localfile); return localPath.filename().string(); } -std::string -DiskFileManagerImpl::GetRemoteIndexObjectPrefix() const { - return remote_root_path_ + "/" + std::string(INDEX_ROOT_PATH) + "/" + - std::to_string(index_meta_.build_id) + "/" + - std::to_string(index_meta_.index_version) + "/" + - std::to_string(field_meta_.partition_id) + "/" + - std::to_string(field_meta_.segment_id); -} - std::string DiskFileManagerImpl::GetLocalIndexObjectPrefix() { - return GenLocalIndexPathPrefix(index_meta_.build_id, - index_meta_.index_version); + auto local_chunk_manager = + LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + return GenIndexPathPrefix( + local_chunk_manager, index_meta_.build_id, index_meta_.index_version); } std::string DiskFileManagerImpl::GetLocalRawDataObjectPrefix() { - return GenFieldRawDataPathPrefix(field_meta_.segment_id, - field_meta_.field_id); + auto local_chunk_manager = + LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + return GenFieldRawDataPathPrefix( + local_chunk_manager, field_meta_.segment_id, field_meta_.field_id); } bool @@ -322,9 +346,10 @@ DiskFileManagerImpl::RemoveFile(const std::string& file) noexcept { std::optional DiskFileManagerImpl::IsExisted(const std::string& file) noexcept { bool isExist = false; - auto& local_chunk_manager = LocalChunkManager::GetInstance(); + auto local_chunk_manager = + LocalChunkManagerSingleton::GetInstance().GetChunkManager(); try { - isExist = local_chunk_manager.Exist(file); + isExist = local_chunk_manager->Exist(file); } catch (LocalChunkManagerException& e) { // LOG_SEGCORE_DEBUG_ << "LocalChunkManagerException:" // << e.what(); diff --git a/internal/core/src/storage/DiskFileManagerImpl.h b/internal/core/src/storage/DiskFileManagerImpl.h index 85c16687ac..4d79e8acc7 100644 --- a/internal/core/src/storage/DiskFileManagerImpl.h +++ b/internal/core/src/storage/DiskFileManagerImpl.h @@ -24,8 +24,7 @@ #include "storage/IndexData.h" #include "storage/FileManager.h" -#include "storage/LocalChunkManager.h" -#include "storage/MinioChunkManager.h" +#include "storage/ChunkManager.h" #include "common/Consts.h" @@ -33,9 +32,9 @@ namespace milvus::storage { class DiskFileManagerImpl : public FileManagerImpl { public: - explicit DiskFileManagerImpl(const FieldDataMeta& field_meta, + explicit DiskFileManagerImpl(const FieldDataMeta& field_mata, IndexMeta index_meta, - const StorageConfig& storage_config); + ChunkManagerPtr remote_chunk_manager); virtual ~DiskFileManagerImpl(); @@ -57,9 +56,6 @@ class DiskFileManagerImpl : public FileManagerImpl { return "DiskFileManagerImpl"; } - std::string - GetRemoteIndexObjectPrefix() 
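CacheRawDataToDisk writes a header of num_rows (uint32) and dim (uint32) followed by the raw vectors, and sizes the payload with sizeof(float), so the cached file currently assumes float-vector binlogs. A consumer-side sketch of reading it back, illustrative only:

uint32_t num_rows = 0, dim = 0;
local_chunk_manager->Read(local_data_path, 0, &num_rows, sizeof(num_rows));
local_chunk_manager->Read(local_data_path, sizeof(num_rows), &dim, sizeof(dim));
std::vector<float> vectors(size_t(num_rows) * dim);
local_chunk_manager->Read(local_data_path,
                          sizeof(num_rows) + sizeof(dim),
                          vectors.data(),
                          vectors.size() * sizeof(float));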
const; - std::string GetLocalIndexObjectPrefix(); @@ -76,13 +72,6 @@ class DiskFileManagerImpl : public FileManagerImpl { return local_paths_; } - std::string - GenerateRemoteIndexFile(const std::string& file_name, - int64_t slice_num) const { - return GetRemoteIndexObjectPrefix() + "/" + file_name + "_" + - std::to_string(slice_num); - } - void CacheIndexToDisk(const std::vector& remote_files); @@ -97,15 +86,8 @@ class DiskFileManagerImpl : public FileManagerImpl { const std::vector& remote_files, const std::vector& remote_file_sizes); - FieldDataMeta - GetFileDataMeta() const { - return field_meta_; - } - - IndexMeta - GetIndexMeta() const { - return index_meta_; - } + std::string + CacheRawDataToDisk(std::vector remote_files); private: int64_t @@ -116,21 +98,15 @@ class DiskFileManagerImpl : public FileManagerImpl { std::string GetFileName(const std::string& localfile); + std::string + GetRemoteIndexPath(const std::string& file_name, int64_t slice_num) const; + private: - // collection meta - FieldDataMeta field_meta_; - - // index meta - IndexMeta index_meta_; - // local file path (abs path) std::vector local_paths_; // remote file path std::map remote_paths_to_size_; - - RemoteChunkManagerPtr rcm_; - std::string remote_root_path_; }; using DiskANNFileManagerImplPtr = std::shared_ptr; diff --git a/internal/core/src/storage/Event.cpp b/internal/core/src/storage/Event.cpp index f269d03c0e..708481778c 100644 --- a/internal/core/src/storage/Event.cpp +++ b/internal/core/src/storage/Event.cpp @@ -15,10 +15,8 @@ // limitations under the License. #include "storage/Event.h" -#include "storage/Util.h" #include "storage/PayloadReader.h" #include "storage/PayloadWriter.h" -#include "storage/FieldDataFactory.h" #include "exceptions/EasyAssert.h" #include "utils/Json.h" #include "common/Consts.h" @@ -219,19 +217,42 @@ BaseEventData::Serialize() { } else { payload_writer = std::make_unique(data_type); } - if (datatype_is_string(data_type)) { - for (size_t offset = 0; offset < field_data->get_num_rows(); ++offset) { - payload_writer->add_one_string_payload( - reinterpret_cast(field_data->RawValue(offset)), - field_data->get_element_size(offset)); + switch (data_type) { + case DataType::VARCHAR: + case DataType::STRING: { + for (size_t offset = 0; offset < field_data->get_num_rows(); + ++offset) { + auto str = static_cast( + field_data->RawValue(offset)); + payload_writer->add_one_string_payload(str->c_str(), + str->size()); + } + break; + } + case DataType::ARRAY: + case DataType::JSON: { + for (size_t offset = 0; offset < field_data->get_num_rows(); + ++offset) { + auto string_view = + static_cast(field_data->RawValue(offset)) + ->data(); + payload_writer->add_one_binary_payload( + reinterpret_cast( + std::string(string_view).c_str()), + string_view.size()); + } + break; + } + default: { + auto payload = + Payload{data_type, + static_cast(field_data->Data()), + field_data->get_num_rows(), + field_data->get_dim()}; + payload_writer->add_payload(payload); } - } else { - auto payload = Payload{data_type, - static_cast(field_data->Data()), - field_data->get_num_rows(), - field_data->get_dim()}; - payload_writer->add_payload(payload); } + payload_writer->finish(); auto payload_buffer = payload_writer->get_payload_buffer(); auto len = @@ -250,7 +271,7 @@ BaseEventData::Serialize() { BaseEvent::BaseEvent(BinlogReaderPtr reader, DataType data_type) { event_header = EventHeader(reader); auto event_data_length = - event_header.event_length_ - event_header.next_position_; + event_header.event_length_ - 
GetEventHeaderSize(event_header); event_data = BaseEventData(reader, event_data_length, data_type); } @@ -259,8 +280,8 @@ BaseEvent::Serialize() { auto data = event_data.Serialize(); int data_size = data.size(); - event_header.next_position_ = GetEventHeaderSize(event_header); - event_header.event_length_ = event_header.next_position_ + data_size; + event_header.event_length_ = GetEventHeaderSize(event_header) + data_size; + event_header.next_position_ = event_header.event_length_ + event_offset; auto header = event_header.Serialize(); int header_size = header.size(); @@ -281,12 +302,11 @@ DescriptorEvent::DescriptorEvent(BinlogReaderPtr reader) { std::vector DescriptorEvent::Serialize() { + event_header.event_type_ = EventType::DescriptorEvent; auto data = event_data.Serialize(); int data_size = data.size(); - event_header.event_type_ = EventType::DescriptorEvent; - event_header.next_position_ = GetEventHeaderSize(event_header); - event_header.event_length_ = event_header.next_position_ + data_size; + event_header.event_length_ = GetEventHeaderSize(event_header) + data_size; auto header = event_header.Serialize(); int header_size = header.size(); @@ -298,6 +318,8 @@ DescriptorEvent::Serialize() { memcpy(res.data() + offset, header.data(), header_size); offset += header_size; memcpy(res.data() + offset, data.data(), data_size); + offset += data_size; + event_header.next_position_ = offset; return res; } diff --git a/internal/core/src/storage/Event.h b/internal/core/src/storage/Event.h index ecb144a169..826da5cfaf 100644 --- a/internal/core/src/storage/Event.h +++ b/internal/core/src/storage/Event.h @@ -99,6 +99,7 @@ struct DescriptorEvent { struct BaseEvent { EventHeader event_header; BaseEventData event_data; + int64_t event_offset; BaseEvent() = default; explicit BaseEvent(BinlogReaderPtr reader, DataType data_type); diff --git a/internal/core/src/storage/FieldData.cpp b/internal/core/src/storage/FieldData.cpp index a3c481bb11..a08c05fb48 100644 --- a/internal/core/src/storage/FieldData.cpp +++ b/internal/core/src/storage/FieldData.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
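On the Event serialization changes: event_length_ now covers exactly header plus data, and next_position_ becomes an absolute end offset because the payload event is told where it starts in the blob (event_offset is set to the size of the already-serialized descriptor event, see IndexData/InsertData below). A worked example with illustrative sizes:

// blob produced by serialize_to_remote_file():  [ descriptor event ][ insert/index event ]
// suppose the descriptor event serializes to 96 bytes, the payload event header to 17 bytes,
// and the payload data to 1000 bytes (all sizes illustrative):
//   payload.event_header.event_length_  = 17 + 1000 = 1017
//   payload.event_offset                = 96
//   payload.event_header.next_position_ = 1017 + 96  = 1113  // end offset of the event in the blob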
#include "storage/FieldData.h" +#include "common/Json.h" namespace milvus::storage { @@ -22,14 +23,28 @@ template void FieldDataImpl::FillFieldData(const void* source, ssize_t element_count) { - AssertInfo(element_count % dim_ == 0, "invalid element count"); if (element_count == 0) { return; } - AssertInfo(field_data_.size() == 0, "no empty field vector"); - field_data_.resize(element_count); - std::copy_n( - static_cast(source), element_count, field_data_.data()); + + std::lock_guard lck(tell_mutex_); + if (tell_ + element_count > get_num_rows()) { + resize_field_data(tell_ + element_count); + } + std::copy_n(static_cast(source), + element_count * dim_, + field_data_.data() + tell_ * dim_); + tell_ += element_count; +} + +template +std::pair +GetDataInfoFromArray(const std::shared_ptr array) { + AssertInfo(array->type()->id() == ArrayDataType, "inconsistent data type"); + auto typed_array = std::dynamic_pointer_cast(array); + auto element_count = array->length(); + + return std::make_pair(typed_array->raw_values(), element_count); } template @@ -37,7 +52,7 @@ void FieldDataImpl::FillFieldData( const std::shared_ptr array) { AssertInfo(array != nullptr, "null arrow array"); - auto element_count = array->length() * dim_; + auto element_count = array->length(); if (element_count == 0) { return; } @@ -54,46 +69,40 @@ FieldDataImpl::FillFieldData( return FillFieldData(values.data(), element_count); } case DataType::INT8: { - AssertInfo(array->type()->id() == arrow::Type::type::INT8, - "inconsistent data type"); - auto int8_array = - std::dynamic_pointer_cast(array); - return FillFieldData(int8_array->raw_values(), element_count); + auto array_info = + GetDataInfoFromArray( + array); + return FillFieldData(array_info.first, array_info.second); } case DataType::INT16: { - AssertInfo(array->type()->id() == arrow::Type::type::INT16, - "inconsistent data type"); - auto int16_array = - std::dynamic_pointer_cast(array); - return FillFieldData(int16_array->raw_values(), element_count); + auto array_info = + GetDataInfoFromArray(array); + return FillFieldData(array_info.first, array_info.second); } case DataType::INT32: { - AssertInfo(array->type()->id() == arrow::Type::type::INT32, - "inconsistent data type"); - auto int32_array = - std::dynamic_pointer_cast(array); - return FillFieldData(int32_array->raw_values(), element_count); + auto array_info = + GetDataInfoFromArray(array); + return FillFieldData(array_info.first, array_info.second); } case DataType::INT64: { - AssertInfo(array->type()->id() == arrow::Type::type::INT64, - "inconsistent data type"); - auto int64_array = - std::dynamic_pointer_cast(array); - return FillFieldData(int64_array->raw_values(), element_count); + auto array_info = + GetDataInfoFromArray(array); + return FillFieldData(array_info.first, array_info.second); } case DataType::FLOAT: { - AssertInfo(array->type()->id() == arrow::Type::type::FLOAT, - "inconsistent data type"); - auto float_array = - std::dynamic_pointer_cast(array); - return FillFieldData(float_array->raw_values(), element_count); + auto array_info = + GetDataInfoFromArray(array); + return FillFieldData(array_info.first, array_info.second); } case DataType::DOUBLE: { - AssertInfo(array->type()->id() == arrow::Type::type::DOUBLE, - "inconsistent data type"); - auto double_array = - std::dynamic_pointer_cast(array); - return FillFieldData(double_array->raw_values(), element_count); + auto array_info = + GetDataInfoFromArray(array); + return FillFieldData(array_info.first, array_info.second); } case 
DataType::STRING: case DataType::VARCHAR: { @@ -107,21 +116,25 @@ FieldDataImpl::FillFieldData( } return FillFieldData(values.data(), element_count); } - case DataType::VECTOR_FLOAT: { - AssertInfo( - array->type()->id() == arrow::Type::type::FIXED_SIZE_BINARY, - "inconsistent data type"); - auto vector_array = - std::dynamic_pointer_cast(array); - return FillFieldData(vector_array->raw_values(), element_count); + case DataType::JSON: { + AssertInfo(array->type()->id() == arrow::Type::type::BINARY, + "inconsistent data type"); + auto json_array = + std::dynamic_pointer_cast(array); + std::vector values(element_count); + for (size_t index = 0; index < element_count; ++index) { + values[index] = + Json(simdjson::padded_string(json_array->GetString(index))); + } + return FillFieldData(values.data(), element_count); } + case DataType::VECTOR_FLOAT: case DataType::VECTOR_BINARY: { - AssertInfo( - array->type()->id() == arrow::Type::type::FIXED_SIZE_BINARY, - "inconsistent data type"); - auto vector_array = - std::dynamic_pointer_cast(array); - return FillFieldData(vector_array->raw_values(), element_count); + auto array_info = + GetDataInfoFromArray( + array); + return FillFieldData(array_info.first, array_info.second); } default: { throw NotSupportedDataTypeException(GetName() + "::FillFieldData" + @@ -141,9 +154,10 @@ template class FieldDataImpl; template class FieldDataImpl; template class FieldDataImpl; template class FieldDataImpl; +template class FieldDataImpl; // vector data template class FieldDataImpl; template class FieldDataImpl; -} // namespace milvus::storage +} // namespace milvus::storage \ No newline at end of file diff --git a/internal/core/src/storage/FieldData.h b/internal/core/src/storage/FieldData.h index 20c404a61e..d65fcd5db3 100644 --- a/internal/core/src/storage/FieldData.h +++ b/internal/core/src/storage/FieldData.h @@ -27,8 +27,9 @@ template class FieldData : public FieldDataImpl { public: static_assert(IsScalar || std::is_same_v); - explicit FieldData(DataType data_type) - : FieldDataImpl::FieldDataImpl(1, data_type) { + explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0) + : FieldDataImpl::FieldDataImpl( + 1, data_type, buffered_num_rows) { } }; @@ -36,23 +37,39 @@ template <> class FieldData : public FieldDataStringImpl { public: static_assert(IsScalar || std::is_same_v); - explicit FieldData(DataType data_type) : FieldDataStringImpl(data_type) { + explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0) + : FieldDataStringImpl(data_type, buffered_num_rows) { + } +}; + +template <> +class FieldData : public FieldDataJsonImpl { + public: + static_assert(IsScalar || std::is_same_v); + explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0) + : FieldDataJsonImpl(data_type, buffered_num_rows) { } }; template <> class FieldData : public FieldDataImpl { public: - explicit FieldData(int64_t dim, DataType data_type) - : FieldDataImpl::FieldDataImpl(dim, data_type) { + explicit FieldData(int64_t dim, + DataType data_type, + int64_t buffered_num_rows = 0) + : FieldDataImpl::FieldDataImpl( + dim, data_type, buffered_num_rows) { } }; template <> class FieldData : public FieldDataImpl { public: - explicit FieldData(int64_t dim, DataType data_type) - : binary_dim_(dim), FieldDataImpl(dim / 8, data_type) { + explicit FieldData(int64_t dim, + DataType data_type, + int64_t buffered_num_rows = 0) + : binary_dim_(dim), + FieldDataImpl(dim / 8, data_type, buffered_num_rows) { Assert(dim % 8 == 0); } @@ -66,4 +83,5 @@ class FieldData : 
public FieldDataImpl { }; using FieldDataPtr = std::shared_ptr; -} // namespace milvus::storage + +} // namespace milvus::storage \ No newline at end of file diff --git a/internal/core/src/storage/FieldDataFactory.cpp b/internal/core/src/storage/FieldDataFactory.cpp deleted file mode 100644 index 676b057274..0000000000 --- a/internal/core/src/storage/FieldDataFactory.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the LF AI & Data foundation under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "storage/FieldDataFactory.h" -#include "storage/Exception.h" - -namespace milvus::storage { - -FieldDataPtr -FieldDataFactory::CreateFieldData(const DataType& type, const int64_t dim) { - switch (type) { - case DataType::BOOL: - return std::make_shared>(type); - case DataType::INT8: - return std::make_shared>(type); - case DataType::INT16: - return std::make_shared>(type); - case DataType::INT32: - return std::make_shared>(type); - case DataType::INT64: - return std::make_shared>(type); - case DataType::FLOAT: - return std::make_shared>(type); - case DataType::DOUBLE: - return std::make_shared>(type); - case DataType::STRING: - case DataType::VARCHAR: - return std::make_shared>(type); - case DataType::VECTOR_FLOAT: - return std::make_shared>(dim, type); - case DataType::VECTOR_BINARY: - return std::make_shared>(dim, type); - default: - throw NotSupportedDataTypeException( - GetName() + "::CreateFieldData" + " not support data type " + - datatype_name(type)); - } -} - -} // namespace milvus::storage diff --git a/internal/core/src/storage/FieldDataInterface.h b/internal/core/src/storage/FieldDataInterface.h index 4928e604cf..08a1007ecd 100644 --- a/internal/core/src/storage/FieldDataInterface.h +++ b/internal/core/src/storage/FieldDataInterface.h @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "arrow/api.h" #include "common/FieldMeta.h" @@ -53,16 +55,19 @@ class FieldDataBase { virtual int64_t Size() const = 0; + virtual int64_t + Size(ssize_t index) const = 0; + + virtual bool + IsFull() const = 0; + public: - virtual int + virtual int64_t get_num_rows() const = 0; virtual int64_t get_dim() const = 0; - virtual int64_t - get_element_size(ssize_t offset) const = 0; - DataType get_data_type() const { return data_type_; @@ -86,8 +91,14 @@ class FieldDataImpl : public FieldDataBase { operator=(const FieldDataImpl&) = delete; public: - explicit FieldDataImpl(ssize_t dim, DataType data_type) - : FieldDataBase(data_type), dim_(is_scalar ? 1 : dim) { + explicit FieldDataImpl(ssize_t dim, + DataType data_type, + int64_t buffered_num_rows = 0) + : FieldDataBase(data_type), + dim_(is_scalar ? 
1 : dim), + num_rows_(buffered_num_rows), + tell_(0) { + field_data_.resize(num_rows_ * dim_); } void @@ -108,20 +119,54 @@ class FieldDataImpl : public FieldDataBase { const void* RawValue(ssize_t offset) const override { + AssertInfo(offset < get_num_rows(), + "field data subscript out of range"); + AssertInfo(offset < get_tell(), + "subscript position don't has valid value"); return &field_data_[offset]; } int64_t Size() const override { - return sizeof(Type) * field_data_.size(); + return sizeof(Type) * get_tell() * dim_; + } + + int64_t + Size(ssize_t offset) const override { + AssertInfo(offset < get_num_rows(), + "field data subscript out of range"); + AssertInfo(offset < get_tell(), + "subscript position don't has valid value"); + return sizeof(Type) * dim_; + } + + bool + IsFull() const override { + auto buffered_num_rows = get_num_rows(); + auto filled_num_rows = get_tell(); + return buffered_num_rows == filled_num_rows; } public: - int + int64_t get_num_rows() const override { - auto len = field_data_.size(); - AssertInfo(len % dim_ == 0, "field data size not aligned"); - return len / dim_; + std::shared_lock lck(num_rows_mutex_); + return num_rows_; + } + + void + resize_field_data(int64_t num_rows) { + std::lock_guard lck(num_rows_mutex_); + if (num_rows > num_rows_) { + num_rows_ = num_rows; + field_data_.resize(num_rows_ * dim_); + } + } + + int64_t + get_tell() const { + std::shared_lock lck(tell_mutex_); + return tell_; } int64_t @@ -129,13 +174,12 @@ class FieldDataImpl : public FieldDataBase { return dim_; } - int64_t - get_element_size(ssize_t offset) const override { - return sizeof(Type) * dim_; - } - protected: Chunk field_data_; + int64_t num_rows_; + mutable std::shared_mutex num_rows_mutex_; + int64_t tell_; + mutable std::shared_mutex tell_mutex_; private: const ssize_t dim_; @@ -143,30 +187,54 @@ class FieldDataImpl : public FieldDataBase { class FieldDataStringImpl : public FieldDataImpl { public: - explicit FieldDataStringImpl(DataType data_type) - : FieldDataImpl(1, data_type) { - } - - const void* - RawValue(ssize_t offset) const { - return field_data_[offset].c_str(); + explicit FieldDataStringImpl(DataType data_type, int64_t total_num_rows = 0) + : FieldDataImpl(1, data_type, total_num_rows) { } int64_t Size() const { int64_t data_size = 0; - for (size_t offset = 0; offset < field_data_.size(); ++offset) { - data_size += get_element_size(offset); + for (size_t offset = 0; offset < get_tell(); ++offset) { + data_size += field_data_[offset].size(); } return data_size; } - public: int64_t - get_element_size(ssize_t offset) const { + Size(ssize_t offset) const { + AssertInfo(offset < get_num_rows(), + "field data subscript out of range"); + AssertInfo(offset < get_tell(), + "subscript position don't has valid value"); return field_data_[offset].size(); } }; +class FieldDataJsonImpl : public FieldDataImpl { + public: + explicit FieldDataJsonImpl(DataType data_type, int64_t total_num_rows = 0) + : FieldDataImpl(1, data_type, total_num_rows) { + } + + int64_t + Size() const { + int64_t data_size = 0; + for (size_t offset = 0; offset < get_tell(); ++offset) { + data_size += field_data_[offset].data().size(); + } + + return data_size; + } + + int64_t + Size(ssize_t offset) const { + AssertInfo(offset < get_num_rows(), + "field data subscript out of range"); + AssertInfo(offset < get_tell(), + "subscript position don't has valid value"); + return field_data_[offset].data().size(); + } +}; + } // namespace milvus::storage diff --git 
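With the FieldData rework, FillFieldData appends at tell_ into a buffer that is grown on demand (or pre-sized via buffered_num_rows), and IsFull() reports whether every buffered row has been written. A hedged usage sketch, using the CreateFieldData factory as it is used elsewhere in this patch:

auto field_data = milvus::storage::CreateFieldData(milvus::DataType::VECTOR_FLOAT, /*dim=*/128);
std::vector<float> chunk(500 * 128, 0.0f);     // 500 placeholder rows
field_data->FillFieldData(chunk.data(), 500);  // appends at tell_ == 0
field_data->FillFieldData(chunk.data(), 500);  // appends at tell_ == 500
// get_num_rows() is now 1000 and IsFull() is true, since every buffered row has been written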
a/internal/core/src/storage/FileManager.h b/internal/core/src/storage/FileManager.h index 520c772ae0..5daa6cd555 100644 --- a/internal/core/src/storage/FileManager.h +++ b/internal/core/src/storage/FileManager.h @@ -21,10 +21,45 @@ #include #include "knowhere/file_manager.h" +#include "common/Consts.h" +#include "storage/ChunkManager.h" +#include "storage/Types.h" +#include "log/Log.h" namespace milvus::storage { +#define FILEMANAGER_TRY try { +#define FILEMANAGER_CATCH \ + } \ + catch (LocalChunkManagerException & e) { \ + LOG_SEGCORE_ERROR_ << "LocalChunkManagerException:" << e.what(); \ + return false; \ + } \ + catch (MinioException & e) { \ + LOG_SEGCORE_ERROR_ << "milvus::storage::MinioException:" << e.what(); \ + return false; \ + } \ + catch (DiskANNFileManagerException & e) { \ + LOG_SEGCORE_ERROR_ << "milvus::storage::DiskANNFileManagerException:" \ + << e.what(); \ + return false; \ + } \ + catch (ArrowException & e) { \ + LOG_SEGCORE_ERROR_ << "milvus::storage::ArrowException:" << e.what(); \ + return false; \ + } \ + catch (std::exception & e) { \ + LOG_SEGCORE_ERROR_ << "Exception:" << e.what(); \ + return false; +#define FILEMANAGER_END } + class FileManagerImpl : public knowhere::FileManager { + public: + explicit FileManagerImpl(const FieldDataMeta& field_mata, + IndexMeta index_meta) + : field_meta_(field_mata), index_meta_(std::move(index_meta)) { + } + public: /** * @brief Load a file to the local disk, so we can use stl lib to operate it. @@ -61,6 +96,37 @@ class FileManagerImpl : public knowhere::FileManager { */ virtual bool RemoveFile(const std::string& filename) noexcept = 0; + + public: + virtual std::string + GetName() const = 0; + + virtual FieldDataMeta + GetFieldDataMeta() const { + return field_meta_; + } + + virtual IndexMeta + GetIndexMeta() const { + return index_meta_; + } + + virtual std::string + GetRemoteIndexObjectPrefix() const { + return rcm_->GetRootPath() + "/" + std::string(INDEX_ROOT_PATH) + "/" + + std::to_string(index_meta_.build_id) + "/" + + std::to_string(index_meta_.index_version) + "/" + + std::to_string(field_meta_.partition_id) + "/" + + std::to_string(field_meta_.segment_id); + } + + protected: + // collection meta + FieldDataMeta field_meta_; + + // index meta + IndexMeta index_meta_; + ChunkManagerPtr rcm_; }; using FileManagerImplPtr = std::shared_ptr; diff --git a/internal/core/src/storage/IndexData.cpp b/internal/core/src/storage/IndexData.cpp index b84243d863..fb448a362c 100644 --- a/internal/core/src/storage/IndexData.cpp +++ b/internal/core/src/storage/IndexData.cpp @@ -51,20 +51,6 @@ IndexData::serialize_to_remote_file() { AssertInfo(index_meta_.has_value(), "index meta not exist"); AssertInfo(field_data_ != nullptr, "empty field data"); - // create index event - IndexEvent index_event; - auto& index_event_data = index_event.event_data; - index_event_data.start_timestamp = time_range_.first; - index_event_data.end_timestamp = time_range_.second; - index_event_data.field_data = field_data_; - - auto& index_event_header = index_event.event_header; - index_event_header.event_type_ = EventType::IndexFileEvent; - // TODO :: set timestamps - index_event_header.timestamp_ = 0; - - // serialize insert event - auto index_event_bytes = index_event.Serialize(); DataType data_type = field_data_->get_data_type(); // create descriptor event @@ -96,6 +82,22 @@ IndexData::serialize_to_remote_file() { // serialize descriptor event data auto des_event_bytes = descriptor_event.Serialize(); + // create index event + IndexEvent index_event; + 
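Combining GetRemoteIndexObjectPrefix (FileManager.h) with GetRemoteIndexPath (DiskFileManagerImpl.cpp), each uploaded index slice ends up under a key of the form below; the root comes from the remote ChunkManager and the concrete values are placeholders.

// {rcm_->GetRootPath()}/{INDEX_ROOT_PATH}/{build_id}/{index_version}/{partition_id}/{segment_id}/{file_name}_{slice_num}
// e.g. files/index_files/4401/1/22/33/index_data_0
//      files/index_files/4401/1/22/33/index_data_1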
index_event.event_offset = des_event_bytes.size(); + auto& index_event_data = index_event.event_data; + index_event_data.start_timestamp = time_range_.first; + index_event_data.end_timestamp = time_range_.second; + index_event_data.field_data = field_data_; + + auto& index_event_header = index_event.event_header; + index_event_header.event_type_ = EventType::IndexFileEvent; + // TODO :: set timestamps + index_event_header.timestamp_ = 0; + + // serialize insert event + auto index_event_bytes = index_event.Serialize(); + des_event_bytes.insert(des_event_bytes.end(), index_event_bytes.begin(), index_event_bytes.end()); diff --git a/internal/core/src/storage/InsertData.cpp b/internal/core/src/storage/InsertData.cpp index 2199e6b2ac..91cbf093b1 100644 --- a/internal/core/src/storage/InsertData.cpp +++ b/internal/core/src/storage/InsertData.cpp @@ -47,20 +47,6 @@ InsertData::serialize_to_remote_file() { AssertInfo(field_data_meta_.has_value(), "field data not exist"); AssertInfo(field_data_ != nullptr, "empty field data"); - // create insert event - InsertEvent insert_event; - auto& insert_event_data = insert_event.event_data; - insert_event_data.start_timestamp = time_range_.first; - insert_event_data.end_timestamp = time_range_.second; - insert_event_data.field_data = field_data_; - - auto& insert_event_header = insert_event.event_header; - // TODO :: set timestamps - insert_event_header.timestamp_ = 0; - insert_event_header.event_type_ = EventType::InsertEvent; - - // serialize insert event - auto insert_event_bytes = insert_event.Serialize(); DataType data_type = field_data_->get_data_type(); // create descriptor event @@ -90,6 +76,22 @@ InsertData::serialize_to_remote_file() { // serialize descriptor event data auto des_event_bytes = descriptor_event.Serialize(); + // create insert event + InsertEvent insert_event; + insert_event.event_offset = des_event_bytes.size(); + auto& insert_event_data = insert_event.event_data; + insert_event_data.start_timestamp = time_range_.first; + insert_event_data.end_timestamp = time_range_.second; + insert_event_data.field_data = field_data_; + + auto& insert_event_header = insert_event.event_header; + // TODO :: set timestamps + insert_event_header.timestamp_ = 0; + insert_event_header.event_type_ = EventType::InsertEvent; + + // serialize insert event + auto insert_event_bytes = insert_event.Serialize(); + des_event_bytes.insert(des_event_bytes.end(), insert_event_bytes.begin(), insert_event_bytes.end()); diff --git a/internal/core/src/storage/LocalChunkManager.cpp b/internal/core/src/storage/LocalChunkManager.cpp index ebcf43ae82..03d5cfbbd8 100644 --- a/internal/core/src/storage/LocalChunkManager.cpp +++ b/internal/core/src/storage/LocalChunkManager.cpp @@ -103,6 +103,11 @@ void LocalChunkManager::Write(const std::string& absPathStr, void* buf, uint64_t size) { + boost::filesystem::path absPath(absPathStr); + // if filepath not exists, will create this file automatically + // ensure upper directory exist firstly + boost::filesystem::create_directories(absPath.parent_path()); + std::ofstream outfile; outfile.open(absPathStr.data(), std::ios_base::binary); if (outfile.fail()) { @@ -124,6 +129,11 @@ LocalChunkManager::Write(const std::string& absPathStr, uint64_t offset, void* buf, uint64_t size) { + boost::filesystem::path absPath(absPathStr); + // if filepath not exists, will create this file automatically + // ensure upper directory exist firstly + boost::filesystem::create_directories(absPath.parent_path()); + std::ofstream outfile; outfile.open( 
absPathStr.data(), diff --git a/internal/core/src/storage/LocalChunkManager.h b/internal/core/src/storage/LocalChunkManager.h index f714799a6f..de49906853 100644 --- a/internal/core/src/storage/LocalChunkManager.h +++ b/internal/core/src/storage/LocalChunkManager.h @@ -21,7 +21,6 @@ #include #include "storage/ChunkManager.h" -#include "config/ConfigChunkManager.h" namespace milvus::storage { @@ -30,7 +29,7 @@ namespace milvus::storage { * that inherited from ChunkManager */ class LocalChunkManager : public ChunkManager { - private: + public: explicit LocalChunkManager(const std::string& path) : path_prefix_(path) { } @@ -39,14 +38,6 @@ class LocalChunkManager : public ChunkManager { operator=(const LocalChunkManager&); public: - static LocalChunkManager& - GetInstance() { - // thread-safe enough after c++ 11 - static LocalChunkManager instance( - ChunkMangerConfig::GetLocalRootPath()); - return instance; - } - virtual ~LocalChunkManager() { } @@ -110,16 +101,11 @@ class LocalChunkManager : public ChunkManager { return "LocalChunkManager"; } - inline std::string - GetPathPrefix() { + virtual std::string + GetRootPath() const { return path_prefix_; } - inline void - SetPathPrefix(const std::string& path) { - path_prefix_ = path; - } - bool CreateFile(const std::string& filepath); diff --git a/internal/core/src/storage/LocalChunkManagerSingleton.h b/internal/core/src/storage/LocalChunkManagerSingleton.h new file mode 100644 index 0000000000..c9975ec2bb --- /dev/null +++ b/internal/core/src/storage/LocalChunkManagerSingleton.h @@ -0,0 +1,67 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
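// This new header replaces the static LocalChunkManager::GetInstance() removed
// above: the local root path is now injected once through Init() instead of
// being read from ChunkMangerConfig. A minimal usage sketch (illustrative only;
// the root path, buf and size are placeholders):
//
//   milvus::storage::LocalChunkManagerSingleton::GetInstance().Init("/tmp/milvus");
//   auto lcm = milvus::storage::LocalChunkManagerSingleton::GetInstance()
//                  .GetChunkManager();
//   if (lcm != nullptr) {
//       lcm->Write(lcm->GetRootPath() + "/example/blob", buf, size);
//   }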
+ +#pragma once + +#include +#include + +#include "storage/ChunkManager.h" +#include "storage/LocalChunkManager.h" + +namespace milvus::storage { + +class LocalChunkManagerSingleton { + private: + LocalChunkManagerSingleton() { + } + + public: + LocalChunkManagerSingleton(const LocalChunkManagerSingleton&) = delete; + LocalChunkManagerSingleton& + operator=(const LocalChunkManagerSingleton&) = delete; + + static LocalChunkManagerSingleton& + GetInstance() { + static LocalChunkManagerSingleton instance; + return instance; + } + + void + Init(std::string root_path) { + std::unique_lock lck(mutex_); + if (lcm_ == nullptr) { + lcm_ = std::make_shared(root_path); + } + } + + void + Release() { + std::unique_lock lck(mutex_); + lcm_ = nullptr; + } + + LocalChunkManagerSPtr + GetChunkManager() { + return lcm_; + } + + private: + mutable std::shared_mutex mutex_; + LocalChunkManagerSPtr lcm_ = nullptr; +}; + +} // namespace milvus::storage \ No newline at end of file diff --git a/internal/core/src/storage/MemFileManagerImpl.cpp b/internal/core/src/storage/MemFileManagerImpl.cpp new file mode 100644 index 0000000000..e8a463a196 --- /dev/null +++ b/internal/core/src/storage/MemFileManagerImpl.cpp @@ -0,0 +1,169 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
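// AddFile(const BinarySet&) below uploads index slices in batches whose total
// size is capped by DEFAULT_FIELD_MAX_MEMORY_LIMIT, while LoadIndexToMemory and
// CacheRawDataToMemory pull remote files back in batches of
// DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE files; the actual upload and
// download work is delegated to PutIndexData and GetObjectData in storage/Util.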
+ +#include "storage/MemFileManagerImpl.h" + +#include "storage/Util.h" +#include "common/Common.h" + +namespace milvus::storage { + +MemFileManagerImpl::MemFileManagerImpl(const FieldDataMeta& field_mata, + IndexMeta index_meta, + ChunkManagerPtr remote_chunk_manager) + : FileManagerImpl(field_mata, index_meta) { + rcm_ = remote_chunk_manager; +} + +bool +MemFileManagerImpl::AddFile(const std::string& filename /* unused */) noexcept { + return false; +} + +bool +MemFileManagerImpl::AddFile(const BinarySet& binary_set) noexcept { + std::vector data_slices; + std::vector slice_sizes; + std::vector slice_names; + + auto AddBatchIndexFiles = [&]() { + auto res = PutIndexData(rcm_.get(), + data_slices, + slice_sizes, + slice_names, + field_meta_, + index_meta_); + for (auto& [file, size] : res) { + remote_paths_to_size_[file] = size; + } + }; + + auto remotePrefix = GetRemoteIndexObjectPrefix(); + int64_t batch_size = 0; + for (auto iter = binary_set.binary_map_.begin(); + iter != binary_set.binary_map_.end(); + iter++) { + if (batch_size >= DEFAULT_FIELD_MAX_MEMORY_LIMIT) { + AddBatchIndexFiles(); + data_slices.clear(); + slice_sizes.clear(); + slice_names.clear(); + batch_size = 0; + } + + data_slices.emplace_back(iter->second->data.get()); + slice_sizes.emplace_back(iter->second->size); + slice_names.emplace_back(remotePrefix + "/" + iter->first); + batch_size += iter->second->size; + } + + if (data_slices.size() > 0) { + AddBatchIndexFiles(); + } + + return true; +} + +bool +MemFileManagerImpl::LoadFile(const std::string& filename) noexcept { + return true; +} + +std::map +MemFileManagerImpl::LoadIndexToMemory( + const std::vector& remote_files) { + std::map file_to_index_data; + auto parallel_degree = + uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE); + std::vector batch_files; + + auto LoadBatchIndexFiles = [&]() { + auto index_datas = GetObjectData(rcm_.get(), batch_files); + for (size_t idx = 0; idx < batch_files.size(); ++idx) { + auto file_name = + batch_files[idx].substr(batch_files[idx].find_last_of("/") + 1); + file_to_index_data[file_name] = index_datas[idx]; + } + }; + + for (auto& file : remote_files) { + if (batch_files.size() >= parallel_degree) { + LoadBatchIndexFiles(); + batch_files.clear(); + } + batch_files.emplace_back(file); + } + + if (batch_files.size() > 0) { + LoadBatchIndexFiles(); + } + + AssertInfo(file_to_index_data.size() == remote_files.size(), + "inconsistent file num and index data num!"); + return file_to_index_data; +} + +std::vector +MemFileManagerImpl::CacheRawDataToMemory( + std::vector remote_files) { + std::sort(remote_files.begin(), + remote_files.end(), + [](const std::string& a, const std::string& b) { + return std::stol(a.substr(a.find_last_of("/") + 1)) < + std::stol(b.substr(b.find_last_of("/") + 1)); + }); + + auto parallel_degree = + uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE); + std::vector batch_files; + std::vector field_datas; + + auto FetchRawData = [&]() { + auto raw_datas = GetObjectData(rcm_.get(), batch_files); + for (auto& data : raw_datas) { + field_datas.emplace_back(data); + } + }; + + for (auto& file : remote_files) { + if (batch_files.size() >= parallel_degree) { + FetchRawData(); + batch_files.clear(); + } + batch_files.emplace_back(file); + } + if (batch_files.size() > 0) { + FetchRawData(); + } + + AssertInfo(field_datas.size() == remote_files.size(), + "inconsistent file num and raw data num!"); + return field_datas; +} + +std::optional +MemFileManagerImpl::IsExisted(const std::string& filename) 
noexcept { + // TODO: implement this interface + return false; +} + +bool +MemFileManagerImpl::RemoveFile(const std::string& filename) noexcept { + // TODO: implement this interface + return false; +} + +} // namespace milvus::storage \ No newline at end of file diff --git a/internal/core/src/storage/MemFileManagerImpl.h b/internal/core/src/storage/MemFileManagerImpl.h new file mode 100644 index 0000000000..df8177f5ee --- /dev/null +++ b/internal/core/src/storage/MemFileManagerImpl.h @@ -0,0 +1,75 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "storage/IndexData.h" +#include "storage/FileManager.h" +#include "storage/ChunkManager.h" + +namespace milvus::storage { + +class MemFileManagerImpl : public FileManagerImpl { + public: + explicit MemFileManagerImpl(const FieldDataMeta& field_mata, + IndexMeta index_meta, + ChunkManagerPtr remote_chunk_manager); + + virtual bool + LoadFile(const std::string& filename) noexcept; + + virtual bool + AddFile(const std::string& filename /* unused */) noexcept; + + virtual std::optional + IsExisted(const std::string& filename) noexcept; + + virtual bool + RemoveFile(const std::string& filename) noexcept; + + public: + virtual std::string + GetName() const { + return "MemIndexFileManagerImpl"; + } + + std::map + LoadIndexToMemory(const std::vector& remote_files); + + std::vector + CacheRawDataToMemory(std::vector remote_files); + + bool + AddFile(const BinarySet& binary_set) noexcept; + + std::map + GetRemotePathsToFileSize() const { + return remote_paths_to_size_; + } + + private: + // remote file path + std::map remote_paths_to_size_; +}; + +using MemFileManagerImplPtr = std::shared_ptr; + +} // namespace milvus::storage \ No newline at end of file diff --git a/internal/core/src/storage/MinioChunkManager.cpp b/internal/core/src/storage/MinioChunkManager.cpp index 3967cd70a1..491332603f 100644 --- a/internal/core/src/storage/MinioChunkManager.cpp +++ b/internal/core/src/storage/MinioChunkManager.cpp @@ -206,6 +206,7 @@ MinioChunkManager::BuildGoogleCloudClient( MinioChunkManager::MinioChunkManager(const StorageConfig& storage_config) : default_bucket_name_(storage_config.bucket_name) { + remote_root_path_ = storage_config.root_path; RemoteStorageType storageType; if (storage_config.address.find("google") != std::string::npos) { storageType = RemoteStorageType::GOOGLE_CLOUD; diff --git a/internal/core/src/storage/MinioChunkManager.h b/internal/core/src/storage/MinioChunkManager.h index 60011fcf51..86dfb20180 100644 --- a/internal/core/src/storage/MinioChunkManager.h +++ b/internal/core/src/storage/MinioChunkManager.h @@ -30,12 +30,12 @@ #include #include #include + #include #include #include #include -#include "config/ConfigChunkManager.h" #include 
"storage/ChunkManager.h" #include "storage/Exception.h" #include "storage/Types.h" @@ -47,7 +47,7 @@ enum class RemoteStorageType { S3 = 0, GOOGLE_CLOUD = 1, ALIYUN_CLOUD = 2 }; /** * @brief This MinioChunkManager is responsible for read and write file in S3. */ -class MinioChunkManager : public RemoteChunkManager { +class MinioChunkManager : public ChunkManager { public: explicit MinioChunkManager(const StorageConfig& storage_config); @@ -99,6 +99,11 @@ class MinioChunkManager : public RemoteChunkManager { return "MinioChunkManager"; } + virtual std::string + GetRootPath() const { + return remote_root_path_; + } + inline std::string GetBucketName() { return default_bucket_name_; @@ -163,6 +168,7 @@ class MinioChunkManager : public RemoteChunkManager { static std::mutex client_mutex_; std::shared_ptr client_; std::string default_bucket_name_; + std::string remote_root_path_; }; using MinioChunkManagerPtr = std::unique_ptr; diff --git a/internal/core/src/storage/PayloadReader.cpp b/internal/core/src/storage/PayloadReader.cpp index 7ace4288e2..8f9b190319 100644 --- a/internal/core/src/storage/PayloadReader.cpp +++ b/internal/core/src/storage/PayloadReader.cpp @@ -16,46 +16,72 @@ #include "storage/PayloadReader.h" #include "exceptions/EasyAssert.h" -#include "storage/FieldDataFactory.h" #include "storage/Util.h" +#include "parquet/column_reader.h" +#include "arrow/io/api.h" +#include "arrow/status.h" +#include "parquet/arrow/reader.h" namespace milvus::storage { -PayloadReader::PayloadReader(std::shared_ptr input, - DataType data_type) - : column_type_(data_type) { - init(std::move(input)); -} PayloadReader::PayloadReader(const uint8_t* data, int length, DataType data_type) : column_type_(data_type) { - auto input = std::make_shared(data, length); + auto input = std::make_shared(data, length); init(input); } void -PayloadReader::init(std::shared_ptr input) { - auto mem_pool = arrow::default_memory_pool(); - // TODO :: Stream read file data, avoid copying - std::unique_ptr reader; - auto st = parquet::arrow::OpenFile(input, mem_pool, &reader); - AssertInfo(st.ok(), "failed to get arrow file reader"); - std::shared_ptr table; - st = reader->ReadTable(&table); - AssertInfo(st.ok(), "failed to get reader data to arrow table"); - auto column = table->column(0); - AssertInfo(column != nullptr, "returned arrow column is null"); - AssertInfo(column->chunks().size() == 1, - "arrow chunk size in arrow column should be 1"); - auto array = column->chunk(0); - AssertInfo(array != nullptr, "empty arrow array of PayloadReader"); +PayloadReader::init(std::shared_ptr input) { + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + // Configure general Parquet reader settings + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + + // Configure Arrow-specific Parquet reader settings + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(128 * 1024); // default 64 * 1024 + arrow_reader_props.set_pre_buffer(false); + + parquet::arrow::FileReaderBuilder reader_builder; + auto st = reader_builder.Open(input, reader_properties); + AssertInfo(st.ok(), "file to read file"); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + + std::unique_ptr arrow_reader; + st = reader_builder.Build(&arrow_reader); + AssertInfo(st.ok(), "build file reader"); + + int64_t column_index = 0; + auto file_meta = arrow_reader->parquet_reader()->metadata(); + 
// LOG_SEGCORE_INFO_ << "serialized parquet metadata, num row group " << + // std::to_string(file_meta->num_row_groups()) + // << ", num column " << std::to_string(file_meta->num_columns()) << ", num rows " + // << std::to_string(file_meta->num_rows()) << ", type width " + // << std::to_string(file_meta->schema()->Column(column_index)->type_length()); dim_ = datatype_is_vector(column_type_) - ? GetDimensionFromArrowArray(array, column_type_) + ? GetDimensionFromFileMetaData( + file_meta->schema()->Column(column_index), column_type_) : 1; - field_data_ = - FieldDataFactory::GetInstance().CreateFieldData(column_type_, dim_); - field_data_->FillFieldData(array); + auto total_num_rows = file_meta->num_rows(); + + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + st = arrow_reader->GetRecordBatchReader(&rb_reader); + AssertInfo(st.ok(), "get record batch reader"); + + field_data_ = CreateFieldData(column_type_, dim_, total_num_rows); + for (arrow::Result> maybe_batch : + *rb_reader) { + AssertInfo(maybe_batch.ok(), "get batch record success"); + auto array = maybe_batch.ValueOrDie()->column(column_index); + field_data_->FillFieldData(array); + } + AssertInfo(field_data_->IsFull(), "field data hasn't been filled done"); + // LOG_SEGCORE_INFO_ << "Peak arrow memory pool size " << pool->max_memory(); } } // namespace milvus::storage diff --git a/internal/core/src/storage/PayloadReader.h b/internal/core/src/storage/PayloadReader.h index da87cff68c..90e63a20ec 100644 --- a/internal/core/src/storage/PayloadReader.h +++ b/internal/core/src/storage/PayloadReader.h @@ -26,15 +26,12 @@ namespace milvus::storage { class PayloadReader { public: - explicit PayloadReader(std::shared_ptr input, - DataType data_type); - explicit PayloadReader(const uint8_t* data, int length, DataType data_type); ~PayloadReader() = default; void - init(std::shared_ptr input); + init(std::shared_ptr buffer); const FieldDataPtr get_field_data() const { diff --git a/internal/core/src/storage/PayloadStream.h b/internal/core/src/storage/PayloadStream.h index 27071111dd..c23c781636 100644 --- a/internal/core/src/storage/PayloadStream.h +++ b/internal/core/src/storage/PayloadStream.h @@ -32,7 +32,7 @@ class PayloadInputStream; struct Payload { DataType data_type; const uint8_t* raw_data; - int rows; + int64_t rows; std::optional dimension; }; diff --git a/internal/core/src/storage/RemoteChunkManagerSingleton.h b/internal/core/src/storage/RemoteChunkManagerSingleton.h new file mode 100644 index 0000000000..7c72cbf8c1 --- /dev/null +++ b/internal/core/src/storage/RemoteChunkManagerSingleton.h @@ -0,0 +1,66 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
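// Counterpart of LocalChunkManagerSingleton for remote storage: Init() builds
// the ChunkManager from a StorageConfig via CreateChunkManager(), which
// currently selects LocalChunkManager or MinioChunkManager based on
// storage_config.storage_type.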
+ +#pragma once + +#include +#include + +#include "storage/Util.h" + +namespace milvus::storage { + +class RemoteChunkManagerSingleton { + private: + RemoteChunkManagerSingleton() { + } + + public: + RemoteChunkManagerSingleton(const RemoteChunkManagerSingleton&) = delete; + RemoteChunkManagerSingleton& + operator=(const RemoteChunkManagerSingleton&) = delete; + + static RemoteChunkManagerSingleton& + GetInstance() { + static RemoteChunkManagerSingleton instance; + return instance; + } + + void + Init(const StorageConfig& storage_config) { + std::unique_lock lck(mutex_); + if (rcm_ == nullptr) { + rcm_ = CreateChunkManager(storage_config); + } + } + + void + Release() { + std::unique_lock lck(mutex_); + rcm_ = nullptr; + } + + ChunkManagerPtr + GetRemoteChunkManager() { + return rcm_; + } + + private: + mutable std::shared_mutex mutex_; + ChunkManagerPtr rcm_ = nullptr; +}; + +} // namespace milvus::storage diff --git a/internal/core/src/storage/SafeQueue.h b/internal/core/src/storage/SafeQueue.h index 1f536837ff..6d9dd6ef38 100644 --- a/internal/core/src/storage/SafeQueue.h +++ b/internal/core/src/storage/SafeQueue.h @@ -36,7 +36,7 @@ class SafeQueue { return queue_.empty(); } - void + size_t size() { std::shared_lock lock(mutex_); return queue_.size(); diff --git a/internal/core/src/storage/ThreadPool.h b/internal/core/src/storage/ThreadPool.h index 4f0c70d224..3fcadf0eb2 100644 --- a/internal/core/src/storage/ThreadPool.h +++ b/internal/core/src/storage/ThreadPool.h @@ -34,7 +34,7 @@ namespace milvus { class ThreadPool { public: explicit ThreadPool(const int thread_core_coefficient) : shutdown_(false) { - auto thread_num = cpu_num * thread_core_coefficient; + auto thread_num = CPU_NUM * thread_core_coefficient; LOG_SEGCORE_INFO_ << "Thread pool's worker num:" << thread_num; threads_ = std::vector(thread_num); Init(); @@ -46,7 +46,7 @@ class ThreadPool { static ThreadPool& GetInstance() { - static ThreadPool pool(thread_core_coefficient); + static ThreadPool pool(THREAD_CORE_COEFFICIENT); return pool; } diff --git a/internal/core/src/storage/Types.h b/internal/core/src/storage/Types.h index 2a8ef78021..57ef9164ad 100644 --- a/internal/core/src/storage/Types.h +++ b/internal/core/src/storage/Types.h @@ -86,7 +86,7 @@ struct StorageConfig { std::string bucket_name = "a-bucket"; std::string access_key_id = "minioadmin"; std::string access_key_value = "minioadmin"; - std::string remote_root_path = "files"; + std::string root_path = "files"; std::string storage_type = "minio"; std::string iam_endpoint = ""; bool useSSL = false; diff --git a/internal/core/src/storage/Util.cpp b/internal/core/src/storage/Util.cpp index b92cc0d93e..fcb1b32df3 100644 --- a/internal/core/src/storage/Util.cpp +++ b/internal/core/src/storage/Util.cpp @@ -19,15 +19,18 @@ #include "arrow/type_fwd.h" #include "exceptions/EasyAssert.h" #include "common/Consts.h" -#include "config/ConfigChunkManager.h" -#include "storage/parquet_c.h" - -#ifdef BUILD_DISK_ANN +#include "storage/FieldData.h" +#include "storage/ThreadPool.h" +#include "storage/LocalChunkManager.h" +#include "storage/MinioChunkManager.h" +#include "storage/MemFileManagerImpl.h" #include "storage/DiskFileManagerImpl.h" -#endif namespace milvus::storage { +std::map ChunkManagerType_Map = { + {"local", ChunkManagerType::Local}, {"minio", ChunkManagerType::Minio}}; + StorageType ReadMediumType(BinlogReaderPtr reader) { AssertInfo(reader->Tell() == 0, @@ -273,6 +276,21 @@ CreateArrowSchema(DataType data_type, int dim) { } } +int 
+GetDimensionFromFileMetaData(const parquet::ColumnDescriptor* schema, + DataType data_type) { + switch (data_type) { + case DataType::VECTOR_FLOAT: { + return schema->type_length() / sizeof(float); + } + case DataType::VECTOR_BINARY: { + return schema->type_length() * 8; + } + default: + PanicInfo("unsupported data type"); + } +} + int GetDimensionFromArrowArray(std::shared_ptr data, DataType data_type) { @@ -299,58 +317,242 @@ GetDimensionFromArrowArray(std::shared_ptr data, } std::string -GenLocalIndexPathPrefix(int64_t build_id, int64_t index_version) { - return milvus::ChunkMangerConfig::GetLocalRootPath() + "/" + - std::string(INDEX_ROOT_PATH) + "/" + std::to_string(build_id) + "/" + - std::to_string(index_version) + "/"; +GenIndexPathPrefix(ChunkManagerPtr cm, + int64_t build_id, + int64_t index_version) { + return cm->GetRootPath() + "/" + std::string(INDEX_ROOT_PATH) + "/" + + std::to_string(build_id) + "/" + std::to_string(index_version) + "/"; } std::string -GetLocalIndexPathPrefixWithBuildID(int64_t build_id) { - return milvus::ChunkMangerConfig::GetLocalRootPath() + "/" + - std::string(INDEX_ROOT_PATH) + "/" + std::to_string(build_id); +GetIndexPathPrefixWithBuildID(ChunkManagerPtr cm, int64_t build_id) { + return cm->GetRootPath() + "/" + std::string(INDEX_ROOT_PATH) + "/" + + std::to_string(build_id); } std::string -GenFieldRawDataPathPrefix(int64_t segment_id, int64_t field_id) { - return milvus::ChunkMangerConfig::GetLocalRootPath() + "/" + - std::string(RAWDATA_ROOT_PATH) + "/" + std::to_string(segment_id) + - "/" + std::to_string(field_id) + "/"; +GenFieldRawDataPathPrefix(ChunkManagerPtr cm, + int64_t segment_id, + int64_t field_id) { + return cm->GetRootPath() + "/" + std::string(RAWDATA_ROOT_PATH) + "/" + + std::to_string(segment_id) + "/" + std::to_string(field_id) + "/"; } std::string -GetSegmentRawDataPathPrefix(int64_t segment_id) { - return milvus::ChunkMangerConfig::GetLocalRootPath() + "/" + - std::string(RAWDATA_ROOT_PATH) + "/" + std::to_string(segment_id); +GetSegmentRawDataPathPrefix(ChunkManagerPtr cm, int64_t segment_id) { + return cm->GetRootPath() + "/" + std::string(RAWDATA_ROOT_PATH) + "/" + + std::to_string(segment_id); } -std::vector -DISK_LIST() { - static std::vector ret{ - knowhere::IndexEnum::INDEX_DISKANN, - }; - return ret; +std::unique_ptr +DownloadAndDecodeRemoteFile(ChunkManager* chunk_manager, + const std::string& file) { + auto fileSize = chunk_manager->Size(file); + auto buf = std::shared_ptr(new uint8_t[fileSize]); + chunk_manager->Read(file, buf.get(), fileSize); + + return DeserializeFileData(buf, fileSize); } -bool -is_in_disk_list(const IndexType& index_type) { - return is_in_list(index_type, DISK_LIST); +std::pair +EncodeAndUploadIndexSlice(ChunkManager* chunk_manager, + uint8_t* buf, + int64_t batch_size, + IndexMeta index_meta, + FieldDataMeta field_meta, + std::string object_key) { + auto field_data = CreateFieldData(DataType::INT8); + field_data->FillFieldData(buf, batch_size); + auto indexData = std::make_shared(field_data); + indexData->set_index_meta(index_meta); + indexData->SetFieldDataMeta(field_meta); + auto serialized_index_data = indexData->serialize_to_remote_file(); + auto serialized_index_size = serialized_index_data.size(); + chunk_manager->Write( + object_key, serialized_index_data.data(), serialized_index_size); + return std::make_pair(std::move(object_key), serialized_index_size); +} + +// /** +// * Returns the current resident set size (physical memory use) measured +// * in bytes, or zero if the value cannot 
be determined on this OS. +// */ +// size_t +// getCurrentRSS() { +// #if defined(_WIN32) +// /* Windows -------------------------------------------------- */ +// PROCESS_MEMORY_COUNTERS info; +// GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); +// return (size_t)info.WorkingSetSize; + +// #elif defined(__APPLE__) && defined(__MACH__) +// /* OSX ------------------------------------------------------ */ +// struct mach_task_basic_info info; +// mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT; +// if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) != KERN_SUCCESS) +// return (size_t)0L; /* Can't access? */ +// return (size_t)info.resident_size; + +// #elif defined(__linux__) || defined(__linux) || defined(linux) || defined(__gnu_linux__) +// /* Linux ---------------------------------------------------- */ +// long rss = 0L; +// FILE* fp = NULL; +// if ((fp = fopen("/proc/self/statm", "r")) == NULL) +// return (size_t)0L; /* Can't open? */ +// if (fscanf(fp, "%*s%ld", &rss) != 1) { +// fclose(fp); +// return (size_t)0L; /* Can't read? */ +// } +// fclose(fp); +// return (size_t)rss * (size_t)sysconf(_SC_PAGESIZE); + +// #else +// /* AIX, BSD, Solaris, and Unknown OS ------------------------ */ +// return (size_t)0L; /* Unsupported. */ +// #endif +// } + +std::vector +GetObjectData(ChunkManager* remote_chunk_manager, + const std::vector& remote_files) { + auto& pool = ThreadPool::GetInstance(); + std::vector>> futures; + for (auto& file : remote_files) { + futures.emplace_back(pool.Submit( + DownloadAndDecodeRemoteFile, remote_chunk_manager, file)); + } + + std::vector datas; + for (int i = 0; i < futures.size(); ++i) { + auto res = futures[i].get(); + datas.emplace_back(res->GetFieldData()); + } + + ReleaseArrowUnused(); + return datas; +} + +std::map +PutIndexData(ChunkManager* remote_chunk_manager, + const std::vector& data_slices, + const std::vector& slice_sizes, + const std::vector& slice_names, + FieldDataMeta& field_meta, + IndexMeta& index_meta) { + auto& pool = ThreadPool::GetInstance(); + std::vector>> futures; + AssertInfo(data_slices.size() == slice_sizes.size(), + "inconsistent size of data slices with slice sizes!"); + AssertInfo(data_slices.size() == slice_names.size(), + "inconsistent size of data slices with slice names!"); + + for (int64_t i = 0; i < data_slices.size(); ++i) { + futures.push_back(pool.Submit(EncodeAndUploadIndexSlice, + remote_chunk_manager, + const_cast(data_slices[i]), + slice_sizes[i], + index_meta, + field_meta, + slice_names[i])); + } + + std::map remote_paths_to_size; + for (auto& future : futures) { + auto res = future.get(); + remote_paths_to_size[res.first] = res.second; + } + + ReleaseArrowUnused(); + return remote_paths_to_size; +} + +int64_t +GetTotalNumRowsForFieldDatas(const std::vector& field_datas) { + int64_t count = 0; + for (auto& field_data : field_datas) { + count += field_data->get_num_rows(); + } + + return count; +} + +void +ReleaseArrowUnused() { + static std::mutex release_mutex; + + // While multiple threads are releasing memory, + // we don't need everyone do releasing, + // just let some of them do this also works well + if (release_mutex.try_lock()) { + arrow::default_memory_pool()->ReleaseUnused(); + release_mutex.unlock(); + } +} + +ChunkManagerPtr +CreateChunkManager(const StorageConfig& storage_config) { + auto storage_type = ChunkManagerType_Map[storage_config.storage_type]; + + switch (storage_type) { + case ChunkManagerType::Local: { + return 
std::make_shared( + storage_config.root_path); + } + case ChunkManagerType::Minio: { + return std::make_shared(storage_config); + } + default: { + PanicInfo("unsupported"); + } + } } FileManagerImplPtr CreateFileManager(IndexType index_type, const FieldDataMeta& field_meta, const IndexMeta& index_meta, - const StorageConfig& storage_config) { - // TODO :: switch case index type to create file manager -#ifdef BUILD_DISK_ANN + ChunkManagerPtr cm) { if (is_in_disk_list(index_type)) { return std::make_shared( - field_meta, index_meta, storage_config); + field_meta, index_meta, cm); } -#endif - return nullptr; + return std::make_shared(field_meta, index_meta, cm); +} + +FieldDataPtr +CreateFieldData(const DataType& type, int64_t dim, int64_t total_num_rows) { + switch (type) { + case DataType::BOOL: + return std::make_shared>(type, total_num_rows); + case DataType::INT8: + return std::make_shared>(type, total_num_rows); + case DataType::INT16: + return std::make_shared>(type, total_num_rows); + case DataType::INT32: + return std::make_shared>(type, total_num_rows); + case DataType::INT64: + return std::make_shared>(type, total_num_rows); + case DataType::FLOAT: + return std::make_shared>(type, total_num_rows); + case DataType::DOUBLE: + return std::make_shared>(type, total_num_rows); + case DataType::STRING: + case DataType::VARCHAR: + return std::make_shared>(type, + total_num_rows); + case DataType::JSON: + return std::make_shared>(type, total_num_rows); + case DataType::VECTOR_FLOAT: + return std::make_shared>( + dim, type, total_num_rows); + case DataType::VECTOR_BINARY: + return std::make_shared>( + dim, type, total_num_rows); + default: + throw NotSupportedDataTypeException( + "CreateFieldData not support data type " + datatype_name(type)); + } } } // namespace milvus::storage diff --git a/internal/core/src/storage/Util.h b/internal/core/src/storage/Util.h index 1c0cf16d47..4823023fe9 100644 --- a/internal/core/src/storage/Util.h +++ b/internal/core/src/storage/Util.h @@ -23,7 +23,10 @@ #include "storage/PayloadStream.h" #include "storage/FileManager.h" #include "storage/BinlogReader.h" +#include "storage/ChunkManager.h" +#include "storage/DataCodec.h" #include "knowhere/comp/index_param.h" +#include "parquet/schema.h" namespace milvus::storage { @@ -55,36 +58,73 @@ CreateArrowSchema(DataType data_type); std::shared_ptr CreateArrowSchema(DataType data_type, int dim); +int +GetDimensionFromFileMetaData(const parquet::ColumnDescriptor* schema, + DataType data_type); + int GetDimensionFromArrowArray(std::shared_ptr array, DataType data_type); std::string -GetLocalIndexPathPrefixWithBuildID(int64_t build_id); +GetIndexPathPrefixWithBuildID(ChunkManagerPtr cm, int64_t build_id); std::string -GenLocalIndexPathPrefix(int64_t build_id, int64_t index_version); +GenIndexPathPrefix(ChunkManagerPtr cm, int64_t build_id, int64_t index_version); std::string -GenFieldRawDataPathPrefix(int64_t segment_id, int64_t field_id); +GenFieldRawDataPathPrefix(ChunkManagerPtr cm, + int64_t segment_id, + int64_t field_id); std::string -GetSegmentRawDataPathPrefix(int64_t segment_id); +GetSegmentRawDataPathPrefix(ChunkManagerPtr cm, int64_t segment_id); -template -inline bool -is_in_list(const T& t, std::function()> list_func) { - auto l = list_func(); - return std::find(l.begin(), l.end(), t) != l.end(); -} +std::unique_ptr +DownloadAndDecodeRemoteFile(ChunkManager* chunk_manager, + const std::string& file); -bool -is_in_disk_list(const IndexType& index_type); +std::pair +EncodeAndUploadIndexSlice(ChunkManager* 
chunk_manager, + uint8_t* buf, + int64_t batch_size, + IndexMeta index_meta, + FieldDataMeta field_meta, + std::string object_key); + +std::vector +GetObjectData(ChunkManager* remote_chunk_manager, + const std::vector& remote_files); + +std::map +PutIndexData(ChunkManager* remote_chunk_manager, + const std::vector& data_slices, + const std::vector& slice_sizes, + const std::vector& slice_names, + FieldDataMeta& field_meta, + IndexMeta& index_meta); + +int64_t +GetTotalNumRowsForFieldDatas(const std::vector& field_datas); + +void +ReleaseArrowUnused(); + +// size_t +// getCurrentRSS(); + +ChunkManagerPtr +CreateChunkManager(const StorageConfig& storage_config); FileManagerImplPtr CreateFileManager(IndexType index_type, const FieldDataMeta& field_meta, const IndexMeta& index_meta, - const StorageConfig& storage_config); + ChunkManagerPtr cm); + +FieldDataPtr +CreateFieldData(const DataType& type, + int64_t dim = 1, + int64_t total_num_rows = 0); } // namespace milvus::storage diff --git a/internal/core/src/storage/parquet_c.cpp b/internal/core/src/storage/parquet_c.cpp index 176f7995f9..f4935f2ad9 100644 --- a/internal/core/src/storage/parquet_c.cpp +++ b/internal/core/src/storage/parquet_c.cpp @@ -21,24 +21,12 @@ #include "storage/PayloadWriter.h" #include "storage/FieldData.h" #include "common/CGoHelper.h" +#include "storage/Util.h" using Payload = milvus::storage::Payload; using PayloadWriter = milvus::storage::PayloadWriter; using PayloadReader = milvus::storage::PayloadReader; -void -ReleaseArrowUnused() { - static std::mutex release_mutex; - - // While multiple threads are releasing memory, - // we don't need everyone do releasing, - // just let some of them do this also works well - if (release_mutex.try_lock()) { - arrow::default_memory_pool()->ReleaseUnused(); - release_mutex.unlock(); - } -} - extern "C" CPayloadWriter NewPayloadWriter(int columnType) { auto data_type = static_cast(columnType); @@ -227,7 +215,7 @@ ReleasePayloadWriter(CPayloadWriter handler) { auto p = reinterpret_cast(handler); if (p != nullptr) { delete p; - ReleaseArrowUnused(); + milvus::storage::ReleaseArrowUnused(); } } @@ -378,8 +366,9 @@ GetOneStringFromPayload(CPayloadReader payloadReader, try { auto p = reinterpret_cast(payloadReader); auto field_data = p->get_field_data(); - *cstr = (char*)(const_cast(field_data->RawValue(idx))); - *str_size = field_data->get_element_size(idx); + auto str = const_cast(field_data->RawValue(idx)); + *cstr = (char*)(*static_cast(str)).c_str(); + *str_size = field_data->Size(idx); return milvus::SuccessCStatus(); } catch (std::exception& e) { return milvus::FailureCStatus(UnexpectedError, e.what()); @@ -434,7 +423,8 @@ ReleasePayloadReader(CPayloadReader payloadReader) { "released payloadReader should not be null pointer"); auto p = reinterpret_cast(payloadReader); delete (p); - ReleaseArrowUnused(); + + milvus::storage::ReleaseArrowUnused(); return milvus::SuccessCStatus(); } catch (std::exception& e) { return milvus::FailureCStatus(UnexpectedError, e.what()); diff --git a/internal/core/src/storage/storage_c.cpp b/internal/core/src/storage/storage_c.cpp index 307d69d965..9fb2079ca8 100644 --- a/internal/core/src/storage/storage_c.cpp +++ b/internal/core/src/storage/storage_c.cpp @@ -15,28 +15,67 @@ // limitations under the License. 
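// The C API below assumes the chunk-manager singletons are initialized by the
// caller (presumably the Go layer) before any storage-dependent call. A
// possible call sequence, with placeholder paths and a CStorageConfig filled
// from the caller's settings:
//
//   InitLocalChunkManagerSingleton("/var/lib/milvus/data");
//   InitRemoteChunkManagerSingleton(c_storage_config);
//   int64_t used = 0;
//   GetLocalUsedSize("/var/lib/milvus/data", &used);
//   CleanRemoteChunkManagerSingleton();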
#include "storage/storage_c.h" -#include "config/ConfigChunkManager.h" #include "common/CGoHelper.h" - -#ifdef BUILD_DISK_ANN -#include "storage/LocalChunkManager.h" -#endif +#include "storage/RemoteChunkManagerSingleton.h" +#include "storage/LocalChunkManagerSingleton.h" CStatus -GetLocalUsedSize(int64_t* size) { +GetLocalUsedSize(const char* c_dir, int64_t* size) { try { -#ifdef BUILD_DISK_ANN - auto& local_chunk_manager = - milvus::storage::LocalChunkManager::GetInstance(); - auto dir = milvus::ChunkMangerConfig::GetLocalRootPath(); - if (local_chunk_manager.DirExist(dir)) { - *size = local_chunk_manager.GetSizeOfDir(dir); + auto local_chunk_manager = + milvus::storage::LocalChunkManagerSingleton::GetInstance() + .GetChunkManager(); + std::string dir(c_dir); + if (local_chunk_manager->DirExist(dir)) { + *size = local_chunk_manager->GetSizeOfDir(dir); } else { *size = 0; } -#endif return milvus::SuccessCStatus(); } catch (std::exception& e) { return milvus::FailureCStatus(UnexpectedError, e.what()); } } + +CStatus +InitLocalChunkManagerSingleton(const char* c_path) { + try { + std::string path(c_path); + milvus::storage::LocalChunkManagerSingleton::GetInstance().Init(path); + + return milvus::SuccessCStatus(); + } catch (std::exception& e) { + return milvus::FailureCStatus(UnexpectedError, e.what()); + } +} + +CStatus +InitRemoteChunkManagerSingleton(CStorageConfig c_storage_config) { + try { + milvus::storage::StorageConfig storage_config; + storage_config.address = std::string(c_storage_config.address); + storage_config.bucket_name = std::string(c_storage_config.bucket_name); + storage_config.access_key_id = + std::string(c_storage_config.access_key_id); + storage_config.access_key_value = + std::string(c_storage_config.access_key_value); + storage_config.root_path = std::string(c_storage_config.root_path); + storage_config.storage_type = + std::string(c_storage_config.storage_type); + storage_config.iam_endpoint = + std::string(c_storage_config.iam_endpoint); + storage_config.useSSL = c_storage_config.useSSL; + storage_config.useIAM = c_storage_config.useIAM; + milvus::storage::RemoteChunkManagerSingleton::GetInstance().Init( + storage_config); + + return milvus::SuccessCStatus(); + } catch (std::exception& e) { + return milvus::FailureCStatus(UnexpectedError, e.what()); + } +} + +void +CleanRemoteChunkManagerSingleton() { + milvus::storage::RemoteChunkManagerSingleton::GetInstance().Release(); +} diff --git a/internal/core/src/storage/storage_c.h b/internal/core/src/storage/storage_c.h index 30d3dc7def..8418694ddf 100644 --- a/internal/core/src/storage/storage_c.h +++ b/internal/core/src/storage/storage_c.h @@ -22,7 +22,16 @@ extern "C" { #include "common/type_c.h" CStatus -GetLocalUsedSize(int64_t* size); +GetLocalUsedSize(const char* c_path, int64_t* size); + +CStatus +InitLocalChunkManagerSingleton(const char* path); + +CStatus +InitRemoteChunkManagerSingleton(CStorageConfig c_storage_config); + +void +CleanRemoteChunkManagerSingleton(); #ifdef __cplusplus }; diff --git a/internal/core/unittest/CMakeLists.txt b/internal/core/unittest/CMakeLists.txt index 13fd9f6a22..fdd635c2a9 100644 --- a/internal/core/unittest/CMakeLists.txt +++ b/internal/core/unittest/CMakeLists.txt @@ -49,14 +49,14 @@ set(MILVUS_TEST_FILES test_data_codec.cpp test_range_search_sort.cpp test_tracer.cpp + test_local_chunk_manager.cpp + test_disk_file_manager_test.cpp ) if ( BUILD_DISK_ANN STREQUAL "ON" ) set(MILVUS_TEST_FILES ${MILVUS_TEST_FILES} #test_minio_chunk_manager.cpp - #test_disk_file_manager_test.cpp 
- test_local_chunk_manager.cpp ) endif() diff --git a/internal/core/unittest/bench/bench_indexbuilder.cpp b/internal/core/unittest/bench/bench_indexbuilder.cpp index 86785d2b4b..e85c927cbf 100644 --- a/internal/core/unittest/bench/bench_indexbuilder.cpp +++ b/internal/core/unittest/bench/bench_indexbuilder.cpp @@ -50,14 +50,16 @@ IndexBuilder_build(benchmark::State& state) { std::tie(type_params, index_params) = generate_params(index_type, metric_type); - std::string type_params_str, index_params_str; - bool ok; - ok = google::protobuf::TextFormat::PrintToString(type_params, - &type_params_str); - assert(ok); - ok = google::protobuf::TextFormat::PrintToString(index_params, - &index_params_str); - assert(ok); + milvus::Config config; + for (auto i = 0; i < type_params.params_size(); ++i) { + const auto& param = type_params.params(i); + config[param.key()] = param.value(); + } + + for (auto i = 0; i < index_params.params_size(); ++i) { + const auto& param = index_params.params(i); + config[param.key()] = param.value(); + } auto is_binary = state.range(2); auto dataset = GenDataset(NB, metric_type, is_binary); @@ -66,10 +68,7 @@ IndexBuilder_build(benchmark::State& state) { for (auto _ : state) { auto index = std::make_unique( - milvus::DataType::VECTOR_FLOAT, - type_params_str.c_str(), - index_params_str.c_str(), - get_default_storage_config()); + milvus::DataType::VECTOR_FLOAT, config, nullptr); index->Build(xb_dataset); } } @@ -85,14 +84,16 @@ IndexBuilder_build_and_codec(benchmark::State& state) { std::tie(type_params, index_params) = generate_params(index_type, metric_type); - std::string type_params_str, index_params_str; - bool ok; - ok = google::protobuf::TextFormat::PrintToString(type_params, - &type_params_str); - assert(ok); - ok = google::protobuf::TextFormat::PrintToString(index_params, - &index_params_str); - assert(ok); + milvus::Config config; + for (auto i = 0; i < type_params.params_size(); ++i) { + const auto& param = type_params.params(i); + config[param.key()] = param.value(); + } + + for (auto i = 0; i < index_params.params_size(); ++i) { + const auto& param = index_params.params(i); + config[param.key()] = param.value(); + } auto is_binary = state.range(2); auto dataset = GenDataset(NB, metric_type, is_binary); @@ -101,10 +102,7 @@ IndexBuilder_build_and_codec(benchmark::State& state) { for (auto _ : state) { auto index = std::make_unique( - milvus::DataType::VECTOR_FLOAT, - type_params_str.c_str(), - index_params_str.c_str(), - get_default_storage_config()); + milvus::DataType::VECTOR_FLOAT, config, nullptr); index->Build(xb_dataset); index->Serialize(); diff --git a/internal/core/unittest/init_gtest.cpp b/internal/core/unittest/init_gtest.cpp index a97ae437d5..fb5483d0a7 100644 --- a/internal/core/unittest/init_gtest.cpp +++ b/internal/core/unittest/init_gtest.cpp @@ -11,10 +11,14 @@ #include #include "common/SystemProperty.h" +#include "test_utils/Constants.h" +#include "storage/LocalChunkManagerSingleton.h" int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + milvus::storage::LocalChunkManagerSingleton::GetInstance().Init( + TestLocalPath); return RUN_ALL_TESTS(); } diff --git a/internal/core/unittest/test_c_api.cpp b/internal/core/unittest/test_c_api.cpp old mode 100755 new mode 100644 index afa7f4a4e9..de4fb909b7 --- a/internal/core/unittest/test_c_api.cpp +++ b/internal/core/unittest/test_c_api.cpp @@ -46,7 +46,6 @@ namespace { // const int DIM = 16; const int64_t ROW_COUNT = 10 * 1000; const int64_t BIAS = 4200; -const CStorageConfig 
c_storage_config = get_default_cstorage_config(); const char* get_default_schema_config() { @@ -486,45 +485,9 @@ TEST(CApiTest, MultiDeleteSealedSegment) { int N = 10; auto dataset = DataGen(col->get_schema(), N); - // load field data - for (auto& [field_id, field_meta] : col->get_schema()->get_fields()) { - auto array = dataset.get_col(field_id); - auto data = serialize(array.get()); - - auto load_info = - CLoadFieldDataInfo{field_id.get(), data.data(), data.size(), N}; - - auto res = LoadFieldData(segment, load_info); - ASSERT_EQ(res.error_code, Success); - auto count = GetRowCount(segment); - ASSERT_EQ(count, N); - } - - // load timestamps - FieldMeta ts_field_meta( - FieldName("Timestamp"), TimestampFieldID, DataType::INT64); - auto ts_array = - CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta); - auto ts_data = serialize(ts_array.get()); - auto load_info = CLoadFieldDataInfo{ - TimestampFieldID.get(), ts_data.data(), ts_data.size(), N}; - auto res = LoadFieldData(segment, load_info); - ASSERT_EQ(res.error_code, Success); - auto count = GetRowCount(segment); - ASSERT_EQ(count, N); - - // load rowID - FieldMeta row_id_field_meta( - FieldName("RowID"), RowFieldID, DataType::INT64); - auto row_id_array = CreateScalarDataArrayFrom( - dataset.row_ids_.data(), N, row_id_field_meta); - auto row_id_data = serialize(row_id_array.get()); - load_info = CLoadFieldDataInfo{ - RowFieldID.get(), row_id_data.data(), row_id_data.size(), N}; - res = LoadFieldData(segment, load_info); - ASSERT_EQ(res.error_code, Success); - count = GetRowCount(segment); - ASSERT_EQ(count, N); + auto segment_interface = reinterpret_cast(segment); + auto sealed_segment = dynamic_cast(segment_interface); + SealedLoadFieldData(dataset, *sealed_segment); // delete data pks = {1} std::vector delete_pks = {1}; @@ -558,7 +521,7 @@ TEST(CApiTest, MultiDeleteSealedSegment) { auto max_ts = dataset.timestamps_[N - 1] + 10; CRetrieveResult retrieve_result; - res = Retrieve(segment, plan.get(), {}, max_ts, &retrieve_result); + auto res = Retrieve(segment, plan.get(), {}, max_ts, &retrieve_result); ASSERT_EQ(res.error_code, Success); auto query_result = std::make_unique(); auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, @@ -715,42 +678,9 @@ TEST(CApiTest, DeleteRepeatedPksFromSealedSegment) { int N = 20; auto dataset = DataGen(col->get_schema(), N, 42, 0, 2); - for (auto& [field_id, field_meta] : col->get_schema()->get_fields()) { - auto array = dataset.get_col(field_id); - auto data = serialize(array.get()); - - auto load_info = - CLoadFieldDataInfo{field_id.get(), data.data(), data.size(), N}; - - auto res = LoadFieldData(segment, load_info); - ASSERT_EQ(res.error_code, Success); - auto count = GetRowCount(segment); - ASSERT_EQ(count, N); - } - - FieldMeta ts_field_meta( - FieldName("Timestamp"), TimestampFieldID, DataType::INT64); - auto ts_array = - CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta); - auto ts_data = serialize(ts_array.get()); - auto load_info = CLoadFieldDataInfo{ - TimestampFieldID.get(), ts_data.data(), ts_data.size(), N}; - auto res = LoadFieldData(segment, load_info); - ASSERT_EQ(res.error_code, Success); - auto count = GetRowCount(segment); - ASSERT_EQ(count, N); - - FieldMeta row_id_field_meta( - FieldName("RowID"), RowFieldID, DataType::INT64); - auto row_id_array = CreateScalarDataArrayFrom( - dataset.row_ids_.data(), N, row_id_field_meta); - auto row_id_data = serialize(row_id_array.get()); - load_info = CLoadFieldDataInfo{ - RowFieldID.get(), 
row_id_data.data(), row_id_data.size(), N}; - res = LoadFieldData(segment, load_info); - ASSERT_EQ(res.error_code, Success); - count = GetRowCount(segment); - ASSERT_EQ(count, N); + auto segment_interface = reinterpret_cast(segment); + auto sealed_segment = dynamic_cast(segment_interface); + SealedLoadFieldData(dataset, *sealed_segment); // create retrieve plan pks in {1, 2, 3} std::vector retrive_row_ids = {1, 2, 3}; @@ -767,7 +697,7 @@ TEST(CApiTest, DeleteRepeatedPksFromSealedSegment) { plan->field_ids_ = target_field_ids; CRetrieveResult retrieve_result; - res = Retrieve( + auto res = Retrieve( segment, plan.get(), {}, dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); auto query_result = std::make_unique(); @@ -919,42 +849,9 @@ TEST(CApiTest, InsertSamePkAfterDeleteOnSealedSegment) { auto dataset = DataGen(col->get_schema(), N, 42, 0, 2); // insert data with pks = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4} , timestamps = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} - for (auto& [field_id, field_meta] : col->get_schema()->get_fields()) { - auto array = dataset.get_col(field_id); - auto data = serialize(array.get()); - - auto load_info = - CLoadFieldDataInfo{field_id.get(), data.data(), data.size(), N}; - - auto res = LoadFieldData(segment, load_info); - ASSERT_EQ(res.error_code, Success); - auto count = GetRowCount(segment); - ASSERT_EQ(count, N); - } - - FieldMeta ts_field_meta( - FieldName("Timestamp"), TimestampFieldID, DataType::INT64); - auto ts_array = - CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta); - auto ts_data = serialize(ts_array.get()); - auto load_info = CLoadFieldDataInfo{ - TimestampFieldID.get(), ts_data.data(), ts_data.size(), N}; - auto res = LoadFieldData(segment, load_info); - ASSERT_EQ(res.error_code, Success); - auto count = GetRowCount(segment); - ASSERT_EQ(count, N); - - FieldMeta row_id_field_meta( - FieldName("RowID"), RowFieldID, DataType::INT64); - auto row_id_array = CreateScalarDataArrayFrom( - dataset.row_ids_.data(), N, row_id_field_meta); - auto row_id_data = serialize(row_id_array.get()); - load_info = CLoadFieldDataInfo{ - RowFieldID.get(), row_id_data.data(), row_id_data.size(), N}; - res = LoadFieldData(segment, load_info); - ASSERT_EQ(res.error_code, Success); - count = GetRowCount(segment); - ASSERT_EQ(count, N); + auto segment_interface = reinterpret_cast(segment); + auto sealed_segment = dynamic_cast(segment_interface); + SealedLoadFieldData(dataset, *sealed_segment); // delete data pks = {1, 2, 3}, timestamps = {4, 4, 4} std::vector delete_row_ids = {1, 2, 3}; @@ -989,7 +886,7 @@ TEST(CApiTest, InsertSamePkAfterDeleteOnSealedSegment) { plan->field_ids_ = target_field_ids; CRetrieveResult retrieve_result; - res = Retrieve( + auto res = Retrieve( segment, plan.get(), {}, dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); auto query_result = std::make_unique(); @@ -1731,7 +1628,7 @@ TEST(CApiTest, LoadIndexInfo) { CBinarySet c_binary_set = (CBinarySet)&binary_set; void* c_load_index_info = nullptr; - auto status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + auto status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_param_key1 = "index_type"; std::string index_param_value1 = knowhere::IndexEnum::INDEX_FAISS_IVFSQ8; @@ -1903,7 +1800,7 @@ TEST(CApiTest, Indexing_Without_Predicate) { auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, 
c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_IVFSQ8; @@ -2050,7 +1947,7 @@ TEST(CApiTest, Indexing_Expr_Without_Predicate) { auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_IVFSQ8; @@ -2213,7 +2110,7 @@ TEST(CApiTest, Indexing_With_float_Predicate_Range) { auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_IVFSQ8; @@ -2389,7 +2286,7 @@ TEST(CApiTest, Indexing_Expr_With_float_Predicate_Range) { auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_IVFSQ8; @@ -2549,7 +2446,7 @@ TEST(CApiTest, Indexing_With_float_Predicate_Term) { auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_IVFSQ8; @@ -2718,7 +2615,7 @@ TEST(CApiTest, Indexing_Expr_With_float_Predicate_Term) { auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_IVFSQ8; @@ -2879,7 +2776,7 @@ TEST(CApiTest, Indexing_With_binary_Predicate_Range) { auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_BIN_IVFFLAT; @@ -3054,7 +2951,7 @@ TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Range) { auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_BIN_IVFFLAT; @@ -3215,7 +3112,7 @@ TEST(CApiTest, Indexing_With_binary_Predicate_Term) { auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); 
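// NewLoadIndexInfo no longer takes a CStorageConfig here: these tests rely on
// the process-wide chunk-manager singletons (LocalChunkManagerSingleton is set
// up in init_gtest.cpp), so no per-call storage config is needed.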
std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_BIN_IVFFLAT; @@ -3406,7 +3303,7 @@ TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Term) { auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_BIN_IVFFLAT; @@ -3477,21 +3374,13 @@ TEST(CApiTest, SealedSegmentTest) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Sealed, -1); - int N = 10000; + int N = 1000; std::default_random_engine e(67); auto ages = std::vector(N); for (auto& age : ages) { age = e() % 2000; } - auto blob = (void*)(&ages[0]); - FieldMeta field_meta(FieldName("age"), FieldId(101), DataType::INT64); - auto array = CreateScalarDataArrayFrom(ages.data(), N, field_meta); - auto age_data = serialize(array.get()); - - auto load_info = - CLoadFieldDataInfo{101, age_data.data(), age_data.size(), N}; - - auto res = LoadFieldData(segment, load_info); + auto res = LoadFieldRawData(segment, 101, ages.data(), N); ASSERT_EQ(res.error_code, Success); auto count = GetRowCount(segment); ASSERT_EQ(count, N); @@ -3515,23 +3404,6 @@ TEST(CApiTest, SealedSegment_search_float_Predicate_Range) { auto query_ptr = vec_col.data() + BIAS * DIM; auto counter_col = dataset.get_col(FieldId(101)); - FieldMeta counter_field_meta( - FieldName("counter"), FieldId(101), DataType::INT64); - auto count_array = - CreateScalarDataArrayFrom(counter_col.data(), N, counter_field_meta); - auto counter_data = serialize(count_array.get()); - - FieldMeta row_id_field_meta( - FieldName("RowID"), RowFieldID, DataType::INT64); - auto row_ids_array = CreateScalarDataArrayFrom( - dataset.row_ids_.data(), N, row_id_field_meta); - auto row_ids_data = serialize(row_ids_array.get()); - - FieldMeta timestamp_field_meta( - FieldName("Timestamp"), TimestampFieldID, DataType::INT64); - auto timestamps_array = CreateScalarDataArrayFrom( - dataset.timestamps_.data(), N, timestamp_field_meta); - auto timestamps_data = serialize(timestamps_array.get()); const char* dsl_string = R"({ "bool": { @@ -3590,7 +3462,7 @@ TEST(CApiTest, SealedSegment_search_float_Predicate_Range) { N); auto binary_set = indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_IVFSQ8; @@ -3616,31 +3488,13 @@ TEST(CApiTest, SealedSegment_search_float_Predicate_Range) { vec_index->Query(query_dataset, search_info, nullptr); EXPECT_EQ(result_on_index->distances_.size(), num_queries * TOPK); - auto c_counter_field_data = CLoadFieldDataInfo{ - 101, - counter_data.data(), - counter_data.size(), - N, - }; - status = LoadFieldData(segment, c_counter_field_data); + status = LoadFieldRawData(segment, 101, counter_col.data(), N); ASSERT_EQ(status.error_code, Success); - auto c_id_field_data = CLoadFieldDataInfo{ - 0, - row_ids_data.data(), - row_ids_data.size(), - N, - }; - status = LoadFieldData(segment, c_id_field_data); + status = LoadFieldRawData(segment, 0, dataset.row_ids_.data(), N); ASSERT_EQ(status.error_code, Success); - auto c_ts_field_data = 
CLoadFieldDataInfo{ - 1, - timestamps_data.data(), - timestamps_data.size(), - N, - }; - status = LoadFieldData(segment, c_ts_field_data); + status = LoadFieldRawData(segment, 1, dataset.timestamps_.data(), N); ASSERT_EQ(status.error_code, Success); // load index for vec field, load raw data for scalar field @@ -3689,23 +3543,6 @@ TEST(CApiTest, SealedSegment_search_without_predicates) { auto vec_data = serialize(vec_array.get()); auto counter_col = dataset.get_col(FieldId(101)); - FieldMeta counter_field_meta( - FieldName("counter"), FieldId(101), DataType::INT64); - auto count_array = - CreateScalarDataArrayFrom(counter_col.data(), N, counter_field_meta); - auto counter_data = serialize(count_array.get()); - - FieldMeta row_id_field_meta( - FieldName("RowID"), RowFieldID, DataType::INT64); - auto row_ids_array = CreateScalarDataArrayFrom( - dataset.row_ids_.data(), N, row_id_field_meta); - auto row_ids_data = serialize(row_ids_array.get()); - - FieldMeta timestamp_field_meta( - FieldName("Timestamp"), TimestampFieldID, DataType::INT64); - auto timestamps_array = CreateScalarDataArrayFrom( - dataset.timestamps_.data(), N, timestamp_field_meta); - auto timestamps_data = serialize(timestamps_array.get()); const char* dsl_string = R"( { @@ -3724,40 +3561,16 @@ TEST(CApiTest, SealedSegment_search_without_predicates) { } })"; - auto c_vec_field_data = CLoadFieldDataInfo{ - 100, - vec_data.data(), - vec_data.size(), - N, - }; - auto status = LoadFieldData(segment, c_vec_field_data); + auto status = LoadFieldRawData(segment, 100, vec_data.data(), N); ASSERT_EQ(status.error_code, Success); - auto c_counter_field_data = CLoadFieldDataInfo{ - 101, - counter_data.data(), - counter_data.size(), - N, - }; - status = LoadFieldData(segment, c_counter_field_data); + status = LoadFieldRawData(segment, 101, counter_col.data(), N); ASSERT_EQ(status.error_code, Success); - auto c_id_field_data = CLoadFieldDataInfo{ - 0, - row_ids_data.data(), - row_ids_data.size(), - N, - }; - status = LoadFieldData(segment, c_id_field_data); + status = LoadFieldRawData(segment, 0, dataset.row_ids_.data(), N); ASSERT_EQ(status.error_code, Success); - auto c_ts_field_data = CLoadFieldDataInfo{ - 1, - timestamps_data.data(), - timestamps_data.size(), - N, - }; - status = LoadFieldData(segment, c_ts_field_data); + status = LoadFieldRawData(segment, 1, dataset.timestamps_.data(), N); ASSERT_EQ(status.error_code, Success); int num_queries = 10; @@ -3808,23 +3621,6 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) { auto query_ptr = vec_col.data() + BIAS * DIM; auto counter_col = dataset.get_col(FieldId(101)); - FieldMeta counter_field_meta( - FieldName("counter"), FieldId(101), DataType::INT64); - auto count_array = - CreateScalarDataArrayFrom(counter_col.data(), N, counter_field_meta); - auto counter_data = serialize(count_array.get()); - - FieldMeta row_id_field_meta( - FieldName("RowID"), RowFieldID, DataType::INT64); - auto row_ids_array = CreateScalarDataArrayFrom( - dataset.row_ids_.data(), N, row_id_field_meta); - auto row_ids_data = serialize(row_ids_array.get()); - - FieldMeta timestamp_field_meta( - FieldName("Timestamp"), TimestampFieldID, DataType::INT64); - auto timestamps_array = CreateScalarDataArrayFrom( - dataset.timestamps_.data(), N, timestamp_field_meta); - auto timestamps_data = serialize(timestamps_array.get()); const char* serialized_expr_plan = R"(vector_anns: < field_id: 100 @@ -3898,7 +3694,7 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) { auto binary_set = 
indexing->Serialize(milvus::Config{}); void* c_load_index_info = nullptr; - status = NewLoadIndexInfo(&c_load_index_info, c_storage_config); + status = NewLoadIndexInfo(&c_load_index_info); ASSERT_EQ(status.error_code, Success); std::string index_type_key = "index_type"; std::string index_type_value = IndexEnum::INDEX_FAISS_IVFSQ8; @@ -3917,31 +3713,13 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) { ASSERT_EQ(status.error_code, Success); // load raw data - auto c_counter_field_data = CLoadFieldDataInfo{ - 101, - counter_data.data(), - counter_data.size(), - N, - }; - status = LoadFieldData(segment, c_counter_field_data); + status = LoadFieldRawData(segment, 101, counter_col.data(), N); ASSERT_EQ(status.error_code, Success); - auto c_id_field_data = CLoadFieldDataInfo{ - 0, - row_ids_data.data(), - row_ids_data.size(), - N, - }; - status = LoadFieldData(segment, c_id_field_data); + status = LoadFieldRawData(segment, 0, dataset.row_ids_.data(), N); ASSERT_EQ(status.error_code, Success); - auto c_ts_field_data = CLoadFieldDataInfo{ - 1, - timestamps_data.data(), - timestamps_data.size(), - N, - }; - status = LoadFieldData(segment, c_ts_field_data); + status = LoadFieldRawData(segment, 1, dataset.timestamps_.data(), N); ASSERT_EQ(status.error_code, Success); // gen query dataset @@ -3999,27 +3777,15 @@ TEST(CApiTest, RetriveScalarFieldFromSealedSegmentWithIndex) { LoadIndexInfo load_index_info; // load timestamp field - FieldMeta ts_field_meta( - FieldName("Timestamp"), TimestampFieldID, DataType::INT64); - auto ts_array = CreateScalarDataArrayFrom( - raw_data.timestamps_.data(), N, ts_field_meta); - auto ts_data = serialize(ts_array.get()); - auto load_info = CLoadFieldDataInfo{ - TimestampFieldID.get(), ts_data.data(), ts_data.size(), N}; - auto res = LoadFieldData(segment, load_info); + auto res = LoadFieldRawData( + segment, TimestampFieldID.get(), raw_data.timestamps_.data(), N); ASSERT_EQ(res.error_code, Success); auto count = GetRowCount(segment); ASSERT_EQ(count, N); // load rowid field - FieldMeta row_id_field_meta( - FieldName("RowID"), RowFieldID, DataType::INT64); - auto row_id_array = CreateScalarDataArrayFrom( - raw_data.row_ids_.data(), N, row_id_field_meta); - auto row_id_data = serialize(row_id_array.get()); - load_info = CLoadFieldDataInfo{ - RowFieldID.get(), row_id_data.data(), row_id_data.size(), N}; - res = LoadFieldData(segment, load_info); + res = LoadFieldRawData( + segment, RowFieldID.get(), raw_data.row_ids_.data(), N); ASSERT_EQ(res.error_code, Success); count = GetRowCount(segment); ASSERT_EQ(count, N); diff --git a/internal/core/unittest/test_data_codec.cpp b/internal/core/unittest/test_data_codec.cpp index 187bdad1b4..f8939e72db 100644 --- a/internal/core/unittest/test_data_codec.cpp +++ b/internal/core/unittest/test_data_codec.cpp @@ -19,7 +19,7 @@ #include "storage/DataCodec.h" #include "storage/InsertData.h" #include "storage/IndexData.h" -#include "storage/FieldDataFactory.h" +#include "storage/Util.h" #include "common/Consts.h" #include "utils/Json.h" @@ -27,9 +27,7 @@ using namespace milvus; TEST(storage, InsertDataBool) { FixedVector data = {true, false, true, false, true}; - auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::BOOL); + auto field_data = milvus::storage::CreateFieldData(storage::DataType::BOOL); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -55,9 +53,7 @@ TEST(storage, InsertDataBool) { TEST(storage, 
InsertDataInt8) { FixedVector data = {1, 2, 3, 4, 5}; - auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::INT8); + auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT8); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -84,8 +80,7 @@ TEST(storage, InsertDataInt8) { TEST(storage, InsertDataInt16) { FixedVector data = {1, 2, 3, 4, 5}; auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::INT16); + milvus::storage::CreateFieldData(storage::DataType::INT16); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -112,8 +107,7 @@ TEST(storage, InsertDataInt16) { TEST(storage, InsertDataInt32) { FixedVector data = {true, false, true, false, true}; auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::INT32); + milvus::storage::CreateFieldData(storage::DataType::INT32); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -140,8 +134,7 @@ TEST(storage, InsertDataInt32) { TEST(storage, InsertDataInt64) { FixedVector data = {1, 2, 3, 4, 5}; auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::INT64); + milvus::storage::CreateFieldData(storage::DataType::INT64); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -169,8 +162,7 @@ TEST(storage, InsertDataString) { FixedVector data = { "test1", "test2", "test3", "test4", "test5"}; auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::VARCHAR); + milvus::storage::CreateFieldData(storage::DataType::VARCHAR); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -191,8 +183,9 @@ TEST(storage, InsertDataString) { ASSERT_EQ(new_payload->get_num_rows(), data.size()); FixedVector new_data(data.size()); for (int i = 0; i < data.size(); ++i) { - new_data[i] = reinterpret_cast(new_payload->RawValue(i)); - ASSERT_EQ(new_payload->get_element_size(i), data[i].size()); + new_data[i] = + *static_cast(new_payload->RawValue(i)); + ASSERT_EQ(new_payload->Size(i), data[i].size()); } ASSERT_EQ(data, new_data); } @@ -200,8 +193,7 @@ TEST(storage, InsertDataString) { TEST(storage, InsertDataFloat) { FixedVector data = {1, 2, 3, 4, 5}; auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::FLOAT); + milvus::storage::CreateFieldData(storage::DataType::FLOAT); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -228,8 +220,7 @@ TEST(storage, InsertDataFloat) { TEST(storage, InsertDataDouble) { FixedVector data = {1.0, 2.0, 3.0, 4.2, 5.3}; auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::DOUBLE); + milvus::storage::CreateFieldData(storage::DataType::DOUBLE); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -257,9 +248,8 @@ TEST(storage, InsertDataFloatVector) { std::vector data = {1, 2, 3, 4, 5, 6, 7, 8}; int DIM = 2; auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::VECTOR_FLOAT, DIM); - field_data->FillFieldData(data.data(), data.size()); + 
milvus::storage::CreateFieldData(storage::DataType::VECTOR_FLOAT, DIM); + field_data->FillFieldData(data.data(), data.size() / DIM); storage::InsertData insert_data(field_data); storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; @@ -288,9 +278,8 @@ TEST(storage, InsertDataBinaryVector) { std::vector data = {1, 2, 3, 4, 5, 6, 7, 8}; int DIM = 16; auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::VECTOR_BINARY, DIM); - field_data->FillFieldData(data.data(), data.size()); + milvus::storage::CreateFieldData(storage::DataType::VECTOR_BINARY, DIM); + field_data->FillFieldData(data.data(), data.size() * 8 / DIM); storage::InsertData insert_data(field_data); storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; @@ -315,9 +304,7 @@ TEST(storage, InsertDataBinaryVector) { TEST(storage, IndexData) { std::vector data = {1, 2, 3, 4, 5, 6, 7, 8}; - auto field_data = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::INT8); + auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT8); field_data->FillFieldData(data.data(), data.size()); storage::IndexData index_data(field_data); diff --git a/internal/core/unittest/test_disk_file_manager_test.cpp b/internal/core/unittest/test_disk_file_manager_test.cpp index 216a615a89..a41e98ac73 100644 --- a/internal/core/unittest/test_disk_file_manager_test.cpp +++ b/internal/core/unittest/test_disk_file_manager_test.cpp @@ -16,13 +16,13 @@ #include #include "common/Slice.h" +#include "common/Common.h" #include "storage/Event.h" -#include "storage/LocalChunkManager.h" -#include "storage/MinioChunkManager.h" -#include "storage/DiskFileManagerImpl.h" #include "storage/ThreadPool.h" -#include "storage/FieldDataFactory.h" -#include "config/ConfigChunkManager.h" +#include "storage/Util.h" +#include "storage/DiskFileManagerImpl.h" +#include "storage/LocalChunkManagerSingleton.h" + #include "test_utils/indexbuilder_test_utils.h" using namespace std; @@ -40,104 +40,31 @@ class DiskAnnFileManagerTest : public testing::Test { virtual void SetUp() { - ChunkMangerConfig::SetLocalRootPath("/tmp/diskann"); - storage_config_ = get_default_storage_config(); + cm_ = storage::CreateChunkManager(get_default_storage_config()); } protected: - StorageConfig storage_config_; + ChunkManagerPtr cm_; }; -TEST_F(DiskAnnFileManagerTest, AddFilePositive) { - auto& lcm = LocalChunkManager::GetInstance(); - string testBucketName = "test-diskann"; - storage_config_.bucket_name = testBucketName; - auto rcm = std::make_unique(storage_config_); - if (!rcm->BucketExists(testBucketName)) { - rcm->CreateBucket(testBucketName); - } - - std::string indexFilePath = "/tmp/diskann/index_files/1000/index"; - auto exist = lcm.Exist(indexFilePath); - EXPECT_EQ(exist, false); - uint64_t index_size = 1024; - lcm.CreateFile(indexFilePath); - std::vector data(index_size); - lcm.Write(indexFilePath, data.data(), index_size); - - // collection_id: 1, partition_id: 2, segment_id: 3 - // field_id: 100, index_build_id: 1000, index_version: 1 - FieldDataMeta filed_data_meta = {1, 2, 3, 100}; - IndexMeta index_meta = {3, 100, 1000, 1, "index"}; - - int64_t slice_size = milvus::index_file_slice_size << 20; - auto diskAnnFileManager = std::make_shared( - filed_data_meta, index_meta, storage_config_); - auto ok = diskAnnFileManager->AddFile(indexFilePath); - EXPECT_EQ(ok, true); - - auto remote_files_to_size = diskAnnFileManager->GetRemotePathsToFileSize(); - auto num_slice = index_size / slice_size; - 
EXPECT_EQ(remote_files_to_size.size(), - index_size % slice_size == 0 ? num_slice : num_slice + 1); - - std::vector remote_files; - for (auto& file2size : remote_files_to_size) { - remote_files.emplace_back(file2size.first); - } - diskAnnFileManager->CacheIndexToDisk(remote_files); - auto local_files = diskAnnFileManager->GetLocalFilePaths(); - for (auto& file : local_files) { - auto file_size = lcm.Size(file); - auto buf = std::unique_ptr(new uint8_t[file_size]); - lcm.Read(file, buf.get(), file_size); - - auto index = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::INT8); - index->FillFieldData(buf.get(), file_size); - auto rows = index->get_num_rows(); - auto rawData = (uint8_t*)(index->Data()); - - EXPECT_EQ(rows, index_size); - EXPECT_EQ(rawData[0], data[0]); - EXPECT_EQ(rawData[4], data[4]); - } - - auto objects = - rcm->ListWithPrefix(diskAnnFileManager->GetRemoteIndexObjectPrefix()); - for (auto obj : objects) { - rcm->Remove(obj); - } - ok = rcm->DeleteBucket(testBucketName); - EXPECT_EQ(ok, true); -} - TEST_F(DiskAnnFileManagerTest, AddFilePositiveParallel) { - auto& lcm = LocalChunkManager::GetInstance(); - string testBucketName = "test-diskann"; - storage_config_.bucket_name = testBucketName; - auto rcm = std::make_unique(storage_config_); - if (!rcm->BucketExists(testBucketName)) { - rcm->CreateBucket(testBucketName); - } - + auto lcm = LocalChunkManagerSingleton::GetInstance().GetChunkManager(); std::string indexFilePath = "/tmp/diskann/index_files/1000/index"; - auto exist = lcm.Exist(indexFilePath); + auto exist = lcm->Exist(indexFilePath); EXPECT_EQ(exist, false); uint64_t index_size = 50 << 20; - lcm.CreateFile(indexFilePath); + lcm->CreateFile(indexFilePath); std::vector data(index_size); - lcm.Write(indexFilePath, data.data(), index_size); + lcm->Write(indexFilePath, data.data(), index_size); // collection_id: 1, partition_id: 2, segment_id: 3 // field_id: 100, index_build_id: 1000, index_version: 1 FieldDataMeta filed_data_meta = {1, 2, 3, 100}; IndexMeta index_meta = {3, 100, 1000, 1, "index"}; - int64_t slice_size = milvus::index_file_slice_size << 20; - auto diskAnnFileManager = std::make_shared( - filed_data_meta, index_meta, storage_config_); + int64_t slice_size = milvus::FILE_SLICE_SIZE; + auto diskAnnFileManager = + std::make_shared(filed_data_meta, index_meta, cm_); auto ok = diskAnnFileManager->AddFile(indexFilePath); EXPECT_EQ(ok, true); @@ -154,13 +81,11 @@ TEST_F(DiskAnnFileManagerTest, AddFilePositiveParallel) { diskAnnFileManager->CacheIndexToDisk(remote_files); auto local_files = diskAnnFileManager->GetLocalFilePaths(); for (auto& file : local_files) { - auto file_size = lcm.Size(file); + auto file_size = lcm->Size(file); auto buf = std::unique_ptr(new uint8_t[file_size]); - lcm.Read(file, buf.get(), file_size); + lcm->Read(file, buf.get(), file_size); - auto index = - milvus::storage::FieldDataFactory::GetInstance().CreateFieldData( - storage::DataType::INT8); + auto index = milvus::storage::CreateFieldData(storage::DataType::INT8); index->FillFieldData(buf.get(), file_size); auto rows = index->get_num_rows(); auto rawData = (uint8_t*)(index->Data()); @@ -170,13 +95,9 @@ TEST_F(DiskAnnFileManagerTest, AddFilePositiveParallel) { EXPECT_EQ(rawData[4], data[4]); } - auto objects = - rcm->ListWithPrefix(diskAnnFileManager->GetRemoteIndexObjectPrefix()); - for (auto obj : objects) { - rcm->Remove(obj); + for (auto file : local_files) { + cm_->Remove(file); } - ok = rcm->DeleteBucket(testBucketName); - EXPECT_EQ(ok, 
true); } int diff --git a/internal/core/unittest/test_expr.cpp b/internal/core/unittest/test_expr.cpp index 4734417461..1b74714df6 100644 --- a/internal/core/unittest/test_expr.cpp +++ b/internal/core/unittest/test_expr.cpp @@ -1110,11 +1110,16 @@ TEST(Expr, TestCompareExpr) { auto seg = CreateSealedSegment(schema); int N = 1000; auto raw_data = DataGen(schema, N); - for (auto& [field_id, field_meta] : schema->get_fields()) { - auto array = raw_data.get_col(field_id); - auto data_info = - LoadFieldDataInfo{field_id.get(), array.get(), N, "/tmp/a"}; - seg->LoadFieldData(data_info); + auto fields = schema->get_fields(); + for (auto field_data : raw_data.raw_->fields_data()) { + int64_t field_id = field_data.field_id(); + + auto info = FieldDataInfo{field_data.field_id(), N, {}, "/tmp/a"}; + auto field_meta = fields.at(FieldId(field_id)); + info.datas.emplace_back( + CreateFieldDataFromDataArray(N, &field_data, field_meta)); + + seg->LoadFieldData(FieldId(field_id), info); } ExecExprVisitor visitor(*seg, seg->get_row_count(), MAX_TIMESTAMP); @@ -1253,13 +1258,16 @@ TEST(Expr, TestExprs) { auto raw_data = DataGen(schema, N); // load field data - for (auto& [field_id, field_meta] : schema->get_fields()) { - std::cout << field_id.get() << field_meta.get_name().get() << std::endl; - auto array = raw_data.get_col(field_id); + auto fields = schema->get_fields(); + for (auto field_data : raw_data.raw_->fields_data()) { + int64_t field_id = field_data.field_id(); - auto data_info = - LoadFieldDataInfo{field_id.get(), array.get(), N, "/tmp/a"}; - seg->LoadFieldData(data_info); + auto info = FieldDataInfo{field_data.field_id(), N, {}, "/tmp/a"}; + auto field_meta = fields.at(FieldId(field_id)); + info.datas.emplace_back( + CreateFieldDataFromDataArray(N, &field_data, field_meta)); + + seg->LoadFieldData(FieldId(field_id), info); } ExecExprVisitor visitor(*seg, seg->get_row_count(), MAX_TIMESTAMP); diff --git a/internal/core/unittest/test_index_c_api.cpp b/internal/core/unittest/test_index_c_api.cpp index 6befc5c5f9..579b54422a 100644 --- a/internal/core/unittest/test_index_c_api.cpp +++ b/internal/core/unittest/test_index_c_api.cpp @@ -47,11 +47,8 @@ TEST(FloatVecIndex, All) { CIndex copy_index; { - status = CreateIndex(dtype, - type_params_str.c_str(), - index_params_str.c_str(), - &index, - c_storage_config); + status = CreateIndex( + dtype, type_params_str.c_str(), index_params_str.c_str(), &index); ASSERT_EQ(Success, status.error_code); } { @@ -66,8 +63,7 @@ TEST(FloatVecIndex, All) { status = CreateIndex(dtype, type_params_str.c_str(), index_params_str.c_str(), - ©_index, - c_storage_config); + ©_index); ASSERT_EQ(Success, status.error_code); } { @@ -110,11 +106,8 @@ TEST(BinaryVecIndex, All) { CIndex copy_index; { - status = CreateIndex(dtype, - type_params_str.c_str(), - index_params_str.c_str(), - &index, - c_storage_config); + status = CreateIndex( + dtype, type_params_str.c_str(), index_params_str.c_str(), &index); ASSERT_EQ(Success, status.error_code); } { @@ -129,8 +122,7 @@ TEST(BinaryVecIndex, All) { status = CreateIndex(dtype, type_params_str.c_str(), index_params_str.c_str(), - ©_index, - c_storage_config); + ©_index); ASSERT_EQ(Success, status.error_code); } { @@ -174,8 +166,7 @@ TEST(CBoolIndexTest, All) { status = CreateIndex(dtype, type_params_str.c_str(), index_params_str.c_str(), - &index, - c_storage_config); + &index); ASSERT_EQ(Success, status.error_code); } { @@ -191,8 +182,7 @@ TEST(CBoolIndexTest, All) { status = CreateIndex(dtype, type_params_str.c_str(), 
index_params_str.c_str(), - ©_index, - c_storage_config); + ©_index); ASSERT_EQ(Success, status.error_code); } { @@ -234,8 +224,7 @@ TEST(CInt64IndexTest, All) { status = CreateIndex(dtype, type_params_str.c_str(), index_params_str.c_str(), - &index, - c_storage_config); + &index); ASSERT_EQ(Success, status.error_code); } { @@ -250,8 +239,7 @@ TEST(CInt64IndexTest, All) { status = CreateIndex(dtype, type_params_str.c_str(), index_params_str.c_str(), - ©_index, - c_storage_config); + ©_index); ASSERT_EQ(Success, status.error_code); } { @@ -295,8 +283,7 @@ TEST(CStringIndexTest, All) { status = CreateIndex(dtype, type_params_str.c_str(), index_params_str.c_str(), - &index, - c_storage_config); + &index); ASSERT_EQ(Success, status.error_code); } { @@ -312,8 +299,7 @@ TEST(CStringIndexTest, All) { status = CreateIndex(dtype, type_params_str.c_str(), index_params_str.c_str(), - ©_index, - c_storage_config); + ©_index); ASSERT_EQ(Success, status.error_code); } { diff --git a/internal/core/unittest/test_index_wrapper.cpp b/internal/core/unittest/test_index_wrapper.cpp index 86ed06447e..8e33b0b1da 100644 --- a/internal/core/unittest/test_index_wrapper.cpp +++ b/internal/core/unittest/test_index_wrapper.cpp @@ -36,6 +36,17 @@ class IndexWrapperTest : public ::testing::TestWithParam { metric_type = param.second; std::tie(type_params, index_params) = generate_params(index_type, metric_type); + + for (auto i = 0; i < type_params.params_size(); ++i) { + const auto& p = type_params.params(i); + config[p.key()] = p.value(); + } + + for (auto i = 0; i < index_params.params_size(); ++i) { + const auto& p = index_params.params(i); + config[p.key()] = p.value(); + } + bool ok; ok = google::protobuf::TextFormat::PrintToString(type_params, &type_params_str); @@ -58,9 +69,10 @@ class IndexWrapperTest : public ::testing::TestWithParam { is_binary = is_binary_map[index_type]; if (is_binary) { - vec_field_data_type = CDataType::FloatVector; + vec_field_data_type = DataType::VECTOR_FLOAT; + ; } else { - vec_field_data_type = CDataType::BinaryVector; + vec_field_data_type = DataType::VECTOR_BINARY; } auto dataset = GenDataset(NB, metric_type, is_binary); @@ -86,9 +98,10 @@ class IndexWrapperTest : public ::testing::TestWithParam { indexcgo::TypeParams type_params; indexcgo::IndexParams index_params; std::string type_params_str, index_params_str; + Config config; milvus::Config search_conf; bool is_binary; - CDataType vec_field_data_type; + DataType vec_field_data_type; knowhere::DataSetPtr xb_dataset; std::vector xb_data; std::vector xb_bin_data; @@ -118,10 +131,7 @@ INSTANTIATE_TEST_CASE_P( TEST_P(IndexWrapperTest, BuildAndQuery) { auto index = milvus::indexbuilder::IndexFactory::GetInstance().CreateIndex( - vec_field_data_type, - type_params_str.c_str(), - index_params_str.c_str(), - storage_config_); + vec_field_data_type, config, nullptr); auto dataset = GenDataset(NB, metric_type, is_binary); knowhere::DataSetPtr xb_dataset; @@ -139,10 +149,7 @@ TEST_P(IndexWrapperTest, BuildAndQuery) { auto binary_set = index->Serialize(); auto copy_index = milvus::indexbuilder::IndexFactory::GetInstance().CreateIndex( - vec_field_data_type, - type_params_str.c_str(), - index_params_str.c_str(), - storage_config_); + vec_field_data_type, config, nullptr); auto vec_index = static_cast(copy_index.get()); ASSERT_EQ(vec_index->dim(), DIM); diff --git a/internal/core/unittest/test_indexing.cpp b/internal/core/unittest/test_indexing.cpp index 5b36cdc936..269b49a6a9 100644 --- a/internal/core/unittest/test_indexing.cpp +++ 
b/internal/core/unittest/test_indexing.cpp @@ -23,14 +23,9 @@ #include "test_utils/indexbuilder_test_utils.h" #include "test_utils/DataGen.h" #include "test_utils/Timer.h" - -#ifdef BUILD_DISK_ANN -#include "storage/MinioChunkManager.h" -#include "storage/DiskFileManagerImpl.h" +#include "storage/Util.h" using namespace boost::filesystem; -#endif - using namespace milvus; using namespace milvus::segcore; @@ -291,10 +286,6 @@ class IndexTest : public ::testing::TestWithParam { void SetUp() override { storage_config_ = get_default_storage_config(); - // auto rcm = std::make_shared(storage_config_); - // if (!rcm->BucketExists(storage_config_.bucket_name)) { - // rcm->CreateBucket(storage_config_.bucket_name); - // } auto param = GetParam(); index_type = param.first; @@ -373,12 +364,10 @@ INSTANTIATE_TEST_CASE_P( knowhere::metric::TANIMOTO), std::pair(knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP, knowhere::metric::JACCARD), - std::pair(knowhere::IndexEnum::INDEX_HNSW, knowhere::metric::L2) - // ci ut not start minio, so not run ut about diskann index for now - // #ifdef BUILD_DISK_ANN - // std::pair(knowhere::IndexEnum::INDEX_DISKANN, knowhere::metric::L2), - // #endif - )); +#ifdef BUILD_DISK_ANN + std::pair(knowhere::IndexEnum::INDEX_DISKANN, knowhere::metric::L2), +#endif + std::pair(knowhere::IndexEnum::INDEX_HNSW, knowhere::metric::L2))); TEST_P(IndexTest, BuildAndQuery) { milvus::index::CreateIndexInfo create_index_info; @@ -386,37 +375,26 @@ TEST_P(IndexTest, BuildAndQuery) { create_index_info.metric_type = metric_type; create_index_info.field_type = vec_field_data_type; index::IndexBasePtr index; - if (index_type == knowhere::IndexEnum::INDEX_DISKANN) { -#ifdef BUILD_DISK_ANN - milvus::storage::FieldDataMeta field_data_meta{1, 2, 3, 100}; - milvus::storage::IndexMeta index_meta{3, 100, 1000, 1}; - auto file_manager = - std::make_shared( - field_data_meta, index_meta, storage_config_); - index = milvus::index::IndexFactory::GetInstance().CreateIndex( - create_index_info, file_manager); -#endif - } else { - index = milvus::index::IndexFactory::GetInstance().CreateIndex( - create_index_info, nullptr); - } + + milvus::storage::FieldDataMeta field_data_meta{1, 2, 3, 100}; + milvus::storage::IndexMeta index_meta{3, 100, 1000, 1}; + auto chunk_manager = milvus::storage::CreateChunkManager(storage_config_); + auto file_manager = milvus::storage::CreateFileManager( + index_type, field_data_meta, index_meta, chunk_manager); + index = milvus::index::IndexFactory::GetInstance().CreateIndex( + create_index_info, file_manager); + ASSERT_NO_THROW(index->BuildWithDataset(xb_dataset, build_conf)); milvus::index::IndexBasePtr new_index; milvus::index::VectorIndex* vec_index = nullptr; if (index_type == knowhere::IndexEnum::INDEX_DISKANN) { -#ifdef BUILD_DISK_ANN // TODO ::diskann.query need load first, ugly auto binary_set = index->Serialize(milvus::Config{}); index.reset(); - milvus::storage::FieldDataMeta field_data_meta{1, 2, 3, 100}; - milvus::storage::IndexMeta index_meta{3, 100, 1000, 1}; - auto file_manager = - std::make_shared( - field_data_meta, index_meta, storage_config_); + new_index = milvus::index::IndexFactory::GetInstance().CreateIndex( create_index_info, file_manager); - vec_index = dynamic_cast(new_index.get()); std::vector index_files; @@ -426,7 +404,6 @@ TEST_P(IndexTest, BuildAndQuery) { load_conf["index_files"] = index_files; ASSERT_NO_THROW(vec_index->Load(binary_set, load_conf)); EXPECT_EQ(vec_index->Count(), NB); -#endif } else { vec_index = dynamic_cast(index.get()); } @@ 
-456,34 +433,23 @@ TEST_P(IndexTest, GetVector) { create_index_info.field_type = vec_field_data_type; index::IndexBasePtr index; - if (index_type == knowhere::IndexEnum::INDEX_DISKANN) { -#ifdef BUILD_DISK_ANN - milvus::storage::FieldDataMeta field_data_meta{1, 2, 3, 100}; - milvus::storage::IndexMeta index_meta{3, 100, 1000, 1}; - auto file_manager = - std::make_shared( - field_data_meta, index_meta, storage_config_); - index = milvus::index::IndexFactory::GetInstance().CreateIndex( - create_index_info, file_manager); -#endif - } else { - index = milvus::index::IndexFactory::GetInstance().CreateIndex( - create_index_info, nullptr); - } + milvus::storage::FieldDataMeta field_data_meta{1, 2, 3, 100}; + milvus::storage::IndexMeta index_meta{3, 100, 1000, 1}; + auto chunk_manager = milvus::storage::CreateChunkManager(storage_config_); + auto file_manager = milvus::storage::CreateFileManager( + index_type, field_data_meta, index_meta, chunk_manager); + index = milvus::index::IndexFactory::GetInstance().CreateIndex( + create_index_info, file_manager); + ASSERT_NO_THROW(index->BuildWithDataset(xb_dataset, build_conf)); milvus::index::IndexBasePtr new_index; milvus::index::VectorIndex* vec_index = nullptr; if (index_type == knowhere::IndexEnum::INDEX_DISKANN) { -#ifdef BUILD_DISK_ANN // TODO ::diskann.query need load first, ugly auto binary_set = index->Serialize(milvus::Config{}); index.reset(); - milvus::storage::FieldDataMeta field_data_meta{1, 2, 3, 100}; - milvus::storage::IndexMeta index_meta{3, 100, 1000, 1}; - auto file_manager = - std::make_shared( - field_data_meta, index_meta, storage_config_); + new_index = milvus::index::IndexFactory::GetInstance().CreateIndex( create_index_info, file_manager); @@ -496,7 +462,6 @@ TEST_P(IndexTest, GetVector) { load_conf["index_files"] = index_files; vec_index->Load(binary_set, load_conf); EXPECT_EQ(vec_index->Count(), NB); -#endif } else { vec_index = dynamic_cast(index.get()); } @@ -534,73 +499,81 @@ TEST_P(IndexTest, GetVector) { } } -// #ifdef BUILD_DISK_ANN -// TEST(Indexing, SearchDiskAnnWithInvalidParam) { -// int64_t NB = 10000; -// IndexType index_type = knowhere::IndexEnum::INDEX_DISKANN; -// MetricType metric_type = knowhere::metric::L2; -// milvus::index::CreateIndexInfo create_index_info; -// create_index_info.index_type = index_type; -// create_index_info.metric_type = metric_type; -// create_index_info.field_type = milvus::DataType::VECTOR_FLOAT; -// -// StorageConfig storage_config = get_default_storage_config(); -// auto rcm = std::make_shared(storage_config); -// if (!rcm->BucketExists(storage_config.bucket_name)) { -// rcm->CreateBucket(storage_config.bucket_name); -// } -// milvus::storage::FieldDataMeta field_data_meta{1, 2, 3, 100}; -// milvus::storage::IndexMeta index_meta{3, 100, 1000, 1}; -// auto file_manager = -// std::make_shared(field_data_meta, index_meta, storage_config); -// auto index = milvus::index::IndexFactory::GetInstance().CreateIndex(create_index_info, file_manager); -// -// auto build_conf = knowhere::Config{ -// {knowhere::meta::METRIC_TYPE, metric_type}, -// {knowhere::meta::DIM, std::to_string(DIM)}, -// {milvus::index::DISK_ANN_MAX_DEGREE, std::to_string(48)}, -// {milvus::index::DISK_ANN_SEARCH_LIST_SIZE, std::to_string(128)}, -// {milvus::index::DISK_ANN_PQ_CODE_BUDGET, std::to_string(0.001)}, -// {milvus::index::DISK_ANN_BUILD_DRAM_BUDGET, std::to_string(2)}, -// }; -// -// // build disk ann index -// auto dataset = GenDataset(NB, metric_type, false); -// std::vector xb_data = 
dataset.get_col(milvus::FieldId(100)); -// knowhere::DatasetPtr xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data()); -// ASSERT_NO_THROW(index->BuildWithDataset(xb_dataset, build_conf)); -// -// // serialize and load disk index, disk index can only be search after loading for now -// auto binary_set = index->Serialize(milvus::Config{}); -// index.reset(); -// // clean local file dir -// file_manager.reset(); -// -// auto new_file_manager = -// std::make_shared(field_data_meta, index_meta, storage_config); -// auto new_index = milvus::index::IndexFactory::GetInstance().CreateIndex(create_index_info, new_file_manager); -// auto vec_index = dynamic_cast(new_index.get()); -// std::vector index_files; -// for (auto& binary : binary_set.binary_map_) { -// index_files.emplace_back(binary.first); -// } -// auto load_conf = generate_load_conf(index_type, metric_type, NB); -// load_conf["index_files"] = index_files; -// vec_index->Load(binary_set, load_conf); -// EXPECT_EQ(vec_index->Count(), NB); -// -// // search disk index with search_list == limit -// int query_offset = 100; -// knowhere::DatasetPtr xq_dataset = knowhere::GenDataset(NQ, DIM, xb_data.data() + DIM * query_offset); -// -// milvus::SearchInfo search_info; -// search_info.topk_ = K; -// search_info.metric_type_ = metric_type; -// search_info.search_params_ = milvus::Config{ -// {knowhere::meta::METRIC_TYPE, metric_type}, -// {milvus::index::DISK_ANN_QUERY_LIST, K - 1}, -// }; -// EXPECT_THROW(vec_index->Query(xq_dataset, search_info, nullptr), std::runtime_error); -// // vec_index->Query(xq_dataset, search_info, nullptr); -// } -// #endif +#ifdef BUILD_DISK_ANN +TEST(Indexing, SearchDiskAnnWithInvalidParam) { + int64_t NB = 10000; + IndexType index_type = knowhere::IndexEnum::INDEX_DISKANN; + MetricType metric_type = knowhere::metric::L2; + milvus::index::CreateIndexInfo create_index_info; + create_index_info.index_type = index_type; + create_index_info.metric_type = metric_type; + create_index_info.field_type = milvus::DataType::VECTOR_FLOAT; + + int64_t collection_id = 1; + int64_t partition_id = 2; + int64_t segment_id = 3; + int64_t field_id = 100; + int64_t build_id = 1000; + int64_t index_version = 1; + + StorageConfig storage_config = get_default_storage_config(); + milvus::storage::FieldDataMeta field_data_meta{ + collection_id, partition_id, segment_id, field_id}; + milvus::storage::IndexMeta index_meta{ + segment_id, field_id, build_id, index_version}; + auto chunk_manager = storage::CreateChunkManager(storage_config); + auto file_manager = milvus::storage::CreateFileManager( + index_type, field_data_meta, index_meta, chunk_manager); + auto index = milvus::index::IndexFactory::GetInstance().CreateIndex( + create_index_info, file_manager); + + auto build_conf = Config{ + {knowhere::meta::METRIC_TYPE, metric_type}, + {knowhere::meta::DIM, std::to_string(DIM)}, + {milvus::index::DISK_ANN_MAX_DEGREE, std::to_string(48)}, + {milvus::index::DISK_ANN_SEARCH_LIST_SIZE, std::to_string(128)}, + {milvus::index::DISK_ANN_PQ_CODE_BUDGET, std::to_string(0.001)}, + {milvus::index::DISK_ANN_BUILD_DRAM_BUDGET, std::to_string(2)}, + {milvus::index::DISK_ANN_BUILD_THREAD_NUM, std::to_string(2)}, + }; + + // build disk ann index + auto dataset = GenDataset(NB, metric_type, false); + std::vector xb_data = + dataset.get_col(milvus::FieldId(field_id)); + knowhere::DataSetPtr xb_dataset = + knowhere::GenDataSet(NB, DIM, xb_data.data()); + ASSERT_NO_THROW(index->BuildWithDataset(xb_dataset, build_conf)); + + // serialize and load disk index, 
disk index can only be search after loading for now + auto binary_set = index->Upload(); + index.reset(); + + auto new_index = milvus::index::IndexFactory::GetInstance().CreateIndex( + create_index_info, file_manager); + auto vec_index = dynamic_cast(new_index.get()); + std::vector index_files; + for (auto& binary : binary_set.binary_map_) { + index_files.emplace_back(binary.first); + } + auto load_conf = generate_load_conf(index_type, metric_type, NB); + load_conf["index_files"] = index_files; + vec_index->Load(load_conf); + EXPECT_EQ(vec_index->Count(), NB); + + // search disk index with search_list == limit + int query_offset = 100; + knowhere::DataSetPtr xq_dataset = + knowhere::GenDataSet(NQ, DIM, xb_data.data() + DIM * query_offset); + + milvus::SearchInfo search_info; + search_info.topk_ = K; + search_info.metric_type_ = metric_type; + search_info.search_params_ = milvus::Config{ + {knowhere::meta::METRIC_TYPE, metric_type}, + {milvus::index::DISK_ANN_QUERY_LIST, K - 1}, + }; + EXPECT_THROW(vec_index->Query(xq_dataset, search_info, nullptr), + std::runtime_error); +} +#endif diff --git a/internal/core/unittest/test_local_chunk_manager.cpp b/internal/core/unittest/test_local_chunk_manager.cpp index b5219f09ef..e2df1a9663 100644 --- a/internal/core/unittest/test_local_chunk_manager.cpp +++ b/internal/core/unittest/test_local_chunk_manager.cpp @@ -16,7 +16,7 @@ #include #include -#include "storage/LocalChunkManager.h" +#include "storage/LocalChunkManagerSingleton.h" using namespace std; using namespace milvus; @@ -25,54 +25,54 @@ using namespace milvus::storage; class LocalChunkManagerTest : public testing::Test {}; TEST_F(LocalChunkManagerTest, DirPositive) { - auto& lcm = LocalChunkManager::GetInstance(); - string test_dir = lcm.GetPathPrefix() + "/local-test-dir/"; - lcm.RemoveDir(test_dir); - lcm.CreateDir(test_dir); + auto lcm = LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + string test_dir = lcm->GetRootPath() + "/local-test-dir/"; + lcm->RemoveDir(test_dir); + lcm->CreateDir(test_dir); - bool exist = lcm.DirExist(test_dir); + bool exist = lcm->DirExist(test_dir); EXPECT_EQ(exist, true); - lcm.RemoveDir(test_dir); - exist = lcm.DirExist(test_dir); + lcm->RemoveDir(test_dir); + exist = lcm->DirExist(test_dir); EXPECT_EQ(exist, false); } TEST_F(LocalChunkManagerTest, FilePositive) { - auto& lcm = LocalChunkManager::GetInstance(); - string test_dir = lcm.GetPathPrefix() + "/local-test-dir"; + auto lcm = LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + string test_dir = lcm->GetRootPath() + "/local-test-dir"; string file = test_dir + "/test-file"; - auto exist = lcm.Exist(file); + auto exist = lcm->Exist(file); EXPECT_EQ(exist, false); - lcm.CreateFile(file); - exist = lcm.Exist(file); + lcm->CreateFile(file); + exist = lcm->Exist(file); EXPECT_EQ(exist, true); - lcm.Remove(file); - exist = lcm.Exist(file); + lcm->Remove(file); + exist = lcm->Exist(file); EXPECT_EQ(exist, false); - lcm.RemoveDir(test_dir); - exist = lcm.DirExist(test_dir); + lcm->RemoveDir(test_dir); + exist = lcm->DirExist(test_dir); EXPECT_EQ(exist, false); } TEST_F(LocalChunkManagerTest, WritePositive) { - auto& lcm = LocalChunkManager::GetInstance(); - string test_dir = lcm.GetPathPrefix() + "/local-test-dir"; + auto lcm = LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + string test_dir = lcm->GetRootPath() + "/local-test-dir"; string file = test_dir + "/test-write-positive"; - auto exist = lcm.Exist(file); + auto exist = lcm->Exist(file); EXPECT_EQ(exist, false); - 
lcm.CreateFile(file); + lcm->CreateFile(file); uint8_t data[5] = {0x17, 0x32, 0x45, 0x34, 0x23}; - lcm.Write(file, data, sizeof(data)); + lcm->Write(file, data, sizeof(data)); - exist = lcm.Exist(file); + exist = lcm->Exist(file); EXPECT_EQ(exist, true); - auto size = lcm.Size(file); + auto size = lcm->Size(file); EXPECT_EQ(size, 5); int datasize = 10000; @@ -81,31 +81,31 @@ TEST_F(LocalChunkManagerTest, WritePositive) { for (int i = 0; i < datasize; ++i) { bigdata[i] = rand() % 256; } - lcm.Write(file, bigdata, datasize); - size = lcm.Size(file); + lcm->Write(file, bigdata, datasize); + size = lcm->Size(file); EXPECT_EQ(size, datasize); delete[] bigdata; - lcm.RemoveDir(test_dir); - exist = lcm.DirExist(test_dir); + lcm->RemoveDir(test_dir); + exist = lcm->DirExist(test_dir); EXPECT_EQ(exist, false); } TEST_F(LocalChunkManagerTest, ReadPositive) { - auto& lcm = LocalChunkManager::GetInstance(); - string test_dir = lcm.GetPathPrefix() + "/local-test-dir"; + auto lcm = LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + string test_dir = lcm->GetRootPath() + "/local-test-dir"; uint8_t data[5] = {0x17, 0x32, 0x45, 0x34, 0x23}; string path = test_dir + "/test-read-positive"; - lcm.CreateFile(path); - lcm.Write(path, data, sizeof(data)); - bool exist = lcm.Exist(path); + lcm->CreateFile(path); + lcm->Write(path, data, sizeof(data)); + bool exist = lcm->Exist(path); EXPECT_EQ(exist, true); - auto size = lcm.Size(path); + auto size = lcm->Size(path); EXPECT_EQ(size, 5); uint8_t readdata[20] = {0}; - size = lcm.Read(path, readdata, 20); + size = lcm->Read(path, readdata, 20); EXPECT_EQ(size, 5); EXPECT_EQ(readdata[0], 0x17); EXPECT_EQ(readdata[1], 0x32); @@ -113,19 +113,19 @@ TEST_F(LocalChunkManagerTest, ReadPositive) { EXPECT_EQ(readdata[3], 0x34); EXPECT_EQ(readdata[4], 0x23); - size = lcm.Read(path, readdata, 3); + size = lcm->Read(path, readdata, 3); EXPECT_EQ(size, 3); EXPECT_EQ(readdata[0], 0x17); EXPECT_EQ(readdata[1], 0x32); EXPECT_EQ(readdata[2], 0x45); uint8_t dataWithNULL[] = {0x17, 0x32, 0x00, 0x34, 0x23}; - lcm.Write(path, dataWithNULL, sizeof(dataWithNULL)); - exist = lcm.Exist(path); + lcm->Write(path, dataWithNULL, sizeof(dataWithNULL)); + exist = lcm->Exist(path); EXPECT_EQ(exist, true); - size = lcm.Size(path); + size = lcm->Size(path); EXPECT_EQ(size, 5); - size = lcm.Read(path, readdata, 20); + size = lcm->Read(path, readdata, 20); EXPECT_EQ(size, 5); EXPECT_EQ(readdata[0], 0x17); EXPECT_EQ(readdata[1], 0x32); @@ -133,38 +133,38 @@ TEST_F(LocalChunkManagerTest, ReadPositive) { EXPECT_EQ(readdata[3], 0x34); EXPECT_EQ(readdata[4], 0x23); - lcm.RemoveDir(test_dir); - exist = lcm.DirExist(test_dir); + lcm->RemoveDir(test_dir); + exist = lcm->DirExist(test_dir); EXPECT_EQ(exist, false); } TEST_F(LocalChunkManagerTest, WriteOffset) { - auto& lcm = LocalChunkManager::GetInstance(); - string test_dir = lcm.GetPathPrefix() + "/local-test-dir"; + auto lcm = LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + string test_dir = lcm->GetRootPath() + "/local-test-dir"; string file = test_dir + "/test-write-offset"; - auto exist = lcm.Exist(file); + auto exist = lcm->Exist(file); EXPECT_EQ(exist, false); - lcm.CreateFile(file); - exist = lcm.Exist(file); + lcm->CreateFile(file); + exist = lcm->Exist(file); EXPECT_EQ(exist, true); int offset = 0; uint8_t data[5] = {0x17, 0x32, 0x00, 0x34, 0x23}; - lcm.Write(file, offset, data, sizeof(data)); + lcm->Write(file, offset, data, sizeof(data)); - exist = lcm.Exist(file); + exist = lcm->Exist(file); EXPECT_EQ(exist, true); - 
auto size = lcm.Size(file); + auto size = lcm->Size(file); EXPECT_EQ(size, 5); offset = 5; - lcm.Write(file, offset, data, sizeof(data)); - size = lcm.Size(file); + lcm->Write(file, offset, data, sizeof(data)); + size = lcm->Size(file); EXPECT_EQ(size, 10); uint8_t read_data[20] = {0}; - size = lcm.Read(file, read_data, 20); + size = lcm->Read(file, read_data, 20); EXPECT_EQ(size, 10); EXPECT_EQ(read_data[0], 0x17); EXPECT_EQ(read_data[1], 0x32); @@ -177,76 +177,76 @@ TEST_F(LocalChunkManagerTest, WriteOffset) { EXPECT_EQ(read_data[8], 0x34); EXPECT_EQ(read_data[9], 0x23); - lcm.RemoveDir(test_dir); - exist = lcm.DirExist(test_dir); + lcm->RemoveDir(test_dir); + exist = lcm->DirExist(test_dir); EXPECT_EQ(exist, false); } TEST_F(LocalChunkManagerTest, ReadOffset) { - auto& lcm = LocalChunkManager::GetInstance(); - string test_dir = lcm.GetPathPrefix() + "/local-test-dir"; + auto lcm = LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + string test_dir = lcm->GetRootPath() + "/local-test-dir"; string file = test_dir + "/test-read-offset"; - lcm.CreateFile(file); - auto exist = lcm.Exist(file); + lcm->CreateFile(file); + auto exist = lcm->Exist(file); EXPECT_EQ(exist, true); uint8_t data[] = {0x17, 0x32, 0x00, 0x34, 0x23, 0x23, 0x87, 0x98}; - lcm.Write(file, data, sizeof(data)); + lcm->Write(file, data, sizeof(data)); - exist = lcm.Exist(file); + exist = lcm->Exist(file); EXPECT_EQ(exist, true); uint8_t read_data[20]; - auto size = lcm.Read(file, 0, read_data, 3); + auto size = lcm->Read(file, 0, read_data, 3); EXPECT_EQ(size, 3); EXPECT_EQ(read_data[0], 0x17); EXPECT_EQ(read_data[1], 0x32); EXPECT_EQ(read_data[2], 0x00); - size = lcm.Read(file, 3, read_data, 4); + size = lcm->Read(file, 3, read_data, 4); EXPECT_EQ(size, 4); EXPECT_EQ(read_data[0], 0x34); EXPECT_EQ(read_data[1], 0x23); EXPECT_EQ(read_data[2], 0x23); EXPECT_EQ(read_data[3], 0x87); - size = lcm.Read(file, 7, read_data, 4); + size = lcm->Read(file, 7, read_data, 4); EXPECT_EQ(size, 1); EXPECT_EQ(read_data[0], 0x98); - lcm.RemoveDir(test_dir); - exist = lcm.DirExist(test_dir); + lcm->RemoveDir(test_dir); + exist = lcm->DirExist(test_dir); EXPECT_EQ(exist, false); } TEST_F(LocalChunkManagerTest, GetSizeOfDir) { - auto& lcm = LocalChunkManager::GetInstance(); - auto test_dir = lcm.GetPathPrefix() + "/local-test-dir"; - EXPECT_EQ(lcm.DirExist(test_dir), false); - lcm.CreateDir(test_dir); - EXPECT_EQ(lcm.DirExist(test_dir), true); - EXPECT_EQ(lcm.GetSizeOfDir(test_dir), 0); + auto lcm = LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + auto test_dir = lcm->GetRootPath() + "/local-test-dir"; + EXPECT_EQ(lcm->DirExist(test_dir), false); + lcm->CreateDir(test_dir); + EXPECT_EQ(lcm->DirExist(test_dir), true); + EXPECT_EQ(lcm->GetSizeOfDir(test_dir), 0); uint8_t data[] = {0x17, 0x32, 0x00, 0x34, 0x23, 0x23, 0x87, 0x98}; // test get size of file in test_dir auto file1 = test_dir + "/file"; - auto res = lcm.CreateFile(file1); + auto res = lcm->CreateFile(file1); EXPECT_EQ(res, true); - lcm.Write(file1, data, sizeof(data)); - EXPECT_EQ(lcm.GetSizeOfDir(test_dir), sizeof(data)); - lcm.Remove(file1); - auto exist = lcm.Exist(file1); + lcm->Write(file1, data, sizeof(data)); + EXPECT_EQ(lcm->GetSizeOfDir(test_dir), sizeof(data)); + lcm->Remove(file1); + auto exist = lcm->Exist(file1); EXPECT_EQ(exist, false); // test get dir size with nested dirs auto nest_dir = test_dir + "/nest_dir"; auto file2 = nest_dir + "/file"; - res = lcm.CreateFile(file2); + res = lcm->CreateFile(file2); EXPECT_EQ(res, true); - lcm.Write(file2, 
data, sizeof(data)); - EXPECT_EQ(lcm.GetSizeOfDir(test_dir), sizeof(data)); - lcm.RemoveDir(test_dir); + lcm->Write(file2, data, sizeof(data)); + EXPECT_EQ(lcm->GetSizeOfDir(test_dir), sizeof(data)); + lcm->RemoveDir(test_dir); - lcm.RemoveDir(test_dir); - exist = lcm.DirExist(test_dir); + lcm->RemoveDir(test_dir); + exist = lcm->DirExist(test_dir); EXPECT_EQ(exist, false); } diff --git a/internal/core/unittest/test_scalar_index_creator.cpp b/internal/core/unittest/test_scalar_index_creator.cpp index 4c47f892a0..987c6ce87d 100644 --- a/internal/core/unittest/test_scalar_index_creator.cpp +++ b/internal/core/unittest/test_scalar_index_creator.cpp @@ -100,12 +100,19 @@ TYPED_TEST_P(TypedScalarIndexCreatorTest, Constructor) { for (const auto& tp : GenParams()) { auto type_params = tp.first; auto index_params = tp.second; - auto serialized_type_params = generate_type_params(type_params); - auto serialized_index_params = generate_index_params(index_params); + + milvus::Config config; + for (auto iter = index_params.begin(); iter != index_params.end(); + ++iter) { + config[iter->first] = iter->second; + } + for (auto iter = type_params.begin(); iter != type_params.end(); + ++iter) { + config[iter->first] = iter->second; + } + auto creator = milvus::indexbuilder::CreateScalarIndex( - milvus::DataType(dtype), - serialized_type_params.c_str(), - serialized_index_params.c_str()); + milvus::DataType(dtype), config, nullptr); } } @@ -115,19 +122,23 @@ TYPED_TEST_P(TypedScalarIndexCreatorTest, Codec) { for (const auto& tp : GenParams()) { auto type_params = tp.first; auto index_params = tp.second; - auto serialized_type_params = generate_type_params(type_params); - auto serialized_index_params = generate_index_params(index_params); + + milvus::Config config; + for (auto iter = index_params.begin(); iter != index_params.end(); + ++iter) { + config[iter->first] = iter->second; + } + for (auto iter = type_params.begin(); iter != type_params.end(); + ++iter) { + config[iter->first] = iter->second; + } auto creator = milvus::indexbuilder::CreateScalarIndex( - milvus::DataType(dtype), - serialized_type_params.c_str(), - serialized_index_params.c_str()); + milvus::DataType(dtype), config, nullptr); auto arr = GenArr(nb); build_index(creator, arr); auto binary_set = creator->Serialize(); auto copy_creator = milvus::indexbuilder::CreateScalarIndex( - milvus::DataType(dtype), - serialized_type_params.c_str(), - serialized_index_params.c_str()); + milvus::DataType(dtype), config, nullptr); copy_creator->Load(binary_set); } } diff --git a/internal/core/unittest/test_sealed.cpp b/internal/core/unittest/test_sealed.cpp index a719834bb8..6d2ab575bb 100644 --- a/internal/core/unittest/test_sealed.cpp +++ b/internal/core/unittest/test_sealed.cpp @@ -621,22 +621,24 @@ TEST(Sealed, LoadScalarIndex) { LoadFieldDataInfo row_id_info; FieldMeta row_id_field_meta( FieldName("RowID"), RowFieldID, DataType::INT64); - auto array = CreateScalarDataArrayFrom( - dataset.row_ids_.data(), N, row_id_field_meta); - row_id_info.field_data = array.get(); - row_id_info.row_count = dataset.row_ids_.size(); - row_id_info.field_id = RowFieldID.get(); // field id for RowId - segment->LoadFieldData(row_id_info); + auto field_data = + std::make_shared>(DataType::INT64); + field_data->FillFieldData(dataset.row_ids_.data(), N); + auto field_data_info = FieldDataInfo{ + RowFieldID.get(), N, std::vector{field_data}}; + segment->LoadFieldData(RowFieldID, field_data_info); LoadFieldDataInfo ts_info; FieldMeta ts_field_meta( FieldName("Timestamp"), 
TimestampFieldID, DataType::INT64); - array = - CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta); - ts_info.field_data = array.get(); - ts_info.row_count = dataset.timestamps_.size(); - ts_info.field_id = TimestampFieldID.get(); - segment->LoadFieldData(ts_info); + field_data = + std::make_shared>(DataType::INT64); + field_data->FillFieldData(dataset.timestamps_.data(), N); + field_data_info = + FieldDataInfo{TimestampFieldID.get(), + N, + std::vector{field_data}}; + segment->LoadFieldData(TimestampFieldID, field_data_info); LoadIndexInfo vec_info; vec_info.field_id = fakevec_id.get(); @@ -886,14 +888,6 @@ GenQueryVecs(int N, int dim) { return vecs; } -auto -transfer_to_fields_data(const std::vector& vecs) { - auto arr = std::make_unique(); - *(arr->mutable_vectors()->mutable_float_vector()->mutable_data()) = { - vecs.begin(), vecs.end()}; - return arr; -} - TEST(Sealed, BF) { auto schema = std::make_shared(); auto dim = 128; @@ -904,18 +898,18 @@ TEST(Sealed, BF) { schema->set_primary_field_id(i64_fid); int64_t N = 100000; - auto base = GenRandomFloatVecs(N, dim); - auto base_arr = transfer_to_fields_data(base); - base_arr->set_type(proto::schema::DataType::FloatVector); - - LoadFieldDataInfo load_info{100, base_arr.get(), N}; auto dataset = DataGen(schema, N); auto segment = CreateSealedSegment(schema); std::cout << fake_id.get() << std::endl; SealedLoadFieldData(dataset, *segment, {fake_id.get()}); - segment->LoadFieldData(load_info); + auto vec_data = GenRandomFloatVecs(N, dim); + auto field_data = storage::CreateFieldData(DataType::VECTOR_FLOAT, dim); + field_data->FillFieldData(vec_data.data(), N); + auto field_data_info = FieldDataInfo{ + fake_id.get(), N, std::vector{field_data}}; + segment->LoadFieldData(fake_id, field_data_info); auto topK = 1; auto fmt = boost::format(R"(vector_anns: < @@ -961,16 +955,18 @@ TEST(Sealed, BF_Overflow) { schema->set_primary_field_id(i64_fid); int64_t N = 10; - auto base = GenMaxFloatVecs(N, dim); - auto base_arr = transfer_to_fields_data(base); - base_arr->set_type(proto::schema::DataType::FloatVector); - LoadFieldDataInfo load_info{100, base_arr.get(), N}; + auto dataset = DataGen(schema, N); auto segment = CreateSealedSegment(schema); std::cout << fake_id.get() << std::endl; SealedLoadFieldData(dataset, *segment, {fake_id.get()}); - segment->LoadFieldData(load_info); + auto vec_data = GenMaxFloatVecs(N, dim); + auto field_data = storage::CreateFieldData(DataType::VECTOR_FLOAT, dim); + field_data->FillFieldData(vec_data.data(), N); + auto field_data_info = FieldDataInfo{ + fake_id.get(), N, std::vector{field_data}}; + segment->LoadFieldData(fake_id, field_data_info); auto topK = 1; auto fmt = boost::format(R"(vector_anns: < diff --git a/internal/core/unittest/test_utils.cpp b/internal/core/unittest/test_utils.cpp index 6eea04eca1..c51c0ebf38 100644 --- a/internal/core/unittest/test_utils.cpp +++ b/internal/core/unittest/test_utils.cpp @@ -63,9 +63,9 @@ TEST(Util, GetDeleteBitmap) { insert_record.insert_pk(1, i); } auto insert_offset = insert_record.reserved.fetch_add(N); - insert_record.timestamps_.fill_chunk_data(tss.data(), N); + insert_record.timestamps_.set_data_raw(insert_offset, tss.data(), N); auto field_data = insert_record.get_field_data_base(i64_fid); - field_data->fill_chunk_data(age_data.data(), N); + field_data->set_data_raw(insert_offset, age_data.data(), N); insert_record.ack_responder_.AddSegment(insert_offset, insert_offset + N); // test case delete pk1(ts = 0) -> insert repeated pk1 (ts = {1 ... 
N}) -> query (ts = N) diff --git a/internal/core/unittest/test_utils/Constants.h b/internal/core/unittest/test_utils/Constants.h index 77650df1bd..190853a968 100644 --- a/internal/core/unittest/test_utils/Constants.h +++ b/internal/core/unittest/test_utils/Constants.h @@ -10,4 +10,6 @@ // or implied. See the License for the specific language governing permissions and limitations under the License #pragma once -constexpr int64_t TestChunkSize = 32 * 1024; \ No newline at end of file +constexpr int64_t TestChunkSize = 32 * 1024; +constexpr char TestLocalPath[] = "/tmp/milvus/local_data/"; +constexpr char TestRemotePath[] = "/tmp/milvus/remote_data"; diff --git a/internal/core/unittest/test_utils/DataGen.h b/internal/core/unittest/test_utils/DataGen.h index c80c712924..7958399d61 100644 --- a/internal/core/unittest/test_utils/DataGen.h +++ b/internal/core/unittest/test_utils/DataGen.h @@ -577,6 +577,106 @@ SearchResultToJson(const SearchResult& sr) { return json{results}; }; +inline storage::FieldDataPtr +CreateFieldDataFromDataArray(ssize_t raw_count, + const DataArray* data, + const FieldMeta& field_meta) { + int64_t dim = 1; + storage::FieldDataPtr field_data = nullptr; + + auto createFieldData = [&field_data, &raw_count](const void* raw_data, + DataType data_type, + int64_t dim) { + field_data = storage::CreateFieldData(data_type, dim); + field_data->FillFieldData(raw_data, raw_count); + }; + + if (field_meta.is_vector()) { + switch (field_meta.get_data_type()) { + case DataType::VECTOR_FLOAT: { + auto raw_data = data->vectors().float_vector().data().data(); + dim = field_meta.get_dim(); + createFieldData(raw_data, DataType::VECTOR_FLOAT, dim); + break; + } + case DataType::VECTOR_BINARY: { + auto raw_data = data->vectors().binary_vector().data(); + dim = field_meta.get_dim(); + AssertInfo(dim % 8 == 0, "wrong dim value for binary vector"); + createFieldData(raw_data, DataType::VECTOR_BINARY, dim); + break; + } + default: { + PanicInfo("unsupported"); + } + } + } else { + switch (field_meta.get_data_type()) { + case DataType::BOOL: { + auto raw_data = data->scalars().bool_data().data().data(); + createFieldData(raw_data, DataType::BOOL, dim); + break; + } + case DataType::INT8: { + auto src_data = data->scalars().int_data().data(); + std::vector data_raw(src_data.size()); + std::copy_n(src_data.data(), src_data.size(), data_raw.data()); + createFieldData(data_raw.data(), DataType::INT8, dim); + break; + } + case DataType::INT16: { + auto src_data = data->scalars().int_data().data(); + std::vector data_raw(src_data.size()); + std::copy_n(src_data.data(), src_data.size(), data_raw.data()); + createFieldData(data_raw.data(), DataType::INT16, dim); + break; + } + case DataType::INT32: { + auto raw_data = data->scalars().int_data().data().data(); + createFieldData(raw_data, DataType::INT32, dim); + break; + } + case DataType::INT64: { + auto raw_data = data->scalars().long_data().data().data(); + createFieldData(raw_data, DataType::INT64, dim); + break; + } + case DataType::FLOAT: { + auto raw_data = data->scalars().float_data().data().data(); + createFieldData(raw_data, DataType::FLOAT, dim); + break; + } + case DataType::DOUBLE: { + auto raw_data = data->scalars().double_data().data().data(); + createFieldData(raw_data, DataType::DOUBLE, dim); + break; + } + case DataType::VARCHAR: { + auto begin = data->scalars().string_data().data().begin(); + auto end = data->scalars().string_data().data().end(); + std::vector data_raw(begin, end); + createFieldData(data_raw.data(), DataType::VARCHAR, 
dim); + break; + } + case DataType::JSON: { + auto src_data = data->scalars().json_data().data(); + std::vector data_raw(src_data.size()); + for (int i = 0; i < src_data.size(); i++) { + auto str = src_data.Get(i); + data_raw[i] = Json(simdjson::padded_string(str)); + } + createFieldData(data_raw.data(), DataType::JSON, dim); + break; + } + default: { + PanicInfo("unsupported"); + } + } + } + + return field_data; +} + inline void SealedLoadFieldData(const GeneratedData& dataset, SegmentSealed& seg, @@ -584,39 +684,48 @@ SealedLoadFieldData(const GeneratedData& dataset, bool with_mmap = false) { auto row_count = dataset.row_ids_.size(); { - LoadFieldDataInfo info; - FieldMeta field_meta(FieldName("RowID"), RowFieldID, DataType::INT64); - auto array = CreateScalarDataArrayFrom( - dataset.row_ids_.data(), row_count, field_meta); - info.field_data = array.get(); - info.row_count = dataset.row_ids_.size(); - info.field_id = RowFieldID.get(); // field id for RowId - seg.LoadFieldData(info); + auto field_data = std::make_shared>( + DataType::INT64); + field_data->FillFieldData(dataset.row_ids_.data(), row_count); + auto field_data_info = FieldDataInfo{ + RowFieldID.get(), + int64_t(row_count), + std::vector{field_data}}; + seg.LoadFieldData(RowFieldID, field_data_info); } { - LoadFieldDataInfo info; - FieldMeta field_meta( - FieldName("Timestamp"), TimestampFieldID, DataType::INT64); - auto array = CreateScalarDataArrayFrom( - dataset.timestamps_.data(), row_count, field_meta); - info.field_data = array.get(); - info.row_count = dataset.timestamps_.size(); - info.field_id = TimestampFieldID.get(); - seg.LoadFieldData(info); + auto field_data = std::make_shared>( + DataType::INT64); + field_data->FillFieldData(dataset.timestamps_.data(), row_count); + auto field_data_info = FieldDataInfo{ + TimestampFieldID.get(), + int64_t(row_count), + std::vector{field_data}}; + seg.LoadFieldData(TimestampFieldID, field_data_info); } + for (auto& iter : dataset.schema_->get_fields()) { + int64_t field_id = iter.first.get(); + if (exclude_fields.find(field_id) != exclude_fields.end()) { + continue; + } + } + auto fields = dataset.schema_->get_fields(); for (auto field_data : dataset.raw_->fields_data()) { int64_t field_id = field_data.field_id(); if (exclude_fields.find(field_id) != exclude_fields.end()) { continue; } - LoadFieldDataInfo info; + FieldDataInfo info; if (with_mmap) { info.mmap_dir_path = "./data/mmap-test"; } info.field_id = field_data.field_id(); info.row_count = row_count; - info.field_data = &field_data; - seg.LoadFieldData(info); + auto field_meta = fields.at(FieldId(field_id)); + info.datas.emplace_back( + CreateFieldDataFromDataArray(row_count, &field_data, field_meta)); + + seg.LoadFieldData(FieldId(field_id), info); } } diff --git a/internal/core/unittest/test_utils/indexbuilder_test_utils.h b/internal/core/unittest/test_utils/indexbuilder_test_utils.h index 6989c3710a..07dd43b921 100644 --- a/internal/core/unittest/test_utils/indexbuilder_test_utils.h +++ b/internal/core/unittest/test_utils/indexbuilder_test_utils.h @@ -67,37 +67,10 @@ find_file(const path& dir, const std::string& file_name, path& path_found) { StorageConfig get_default_storage_config() { - char testPath[100]; - auto pwd = std::string(getcwd(testPath, sizeof(testPath))); - path filepath; - auto currentPath = path(pwd); - while (!find_file(currentPath, "milvus.yaml", filepath)) { - currentPath = currentPath.append("../"); - } - auto configPath = filepath.string(); - YAML::Node config; - config = YAML::LoadFile(configPath); 
- auto minioConfig = config["minio"]; - auto address = minioConfig["address"].as(); - auto port = minioConfig["port"].as(); - auto endpoint = address + ":" + port; - auto accessKey = minioConfig["accessKeyID"].as(); - auto accessValue = minioConfig["secretAccessKey"].as(); - auto rootPath = minioConfig["rootPath"].as(); - auto useSSL = minioConfig["useSSL"].as(); - auto useIam = minioConfig["useIAM"].as(); - auto iamEndPoint = minioConfig["iamEndpoint"].as(); - auto bucketName = minioConfig["bucketName"].as(); - - return StorageConfig{endpoint, - bucketName, - accessKey, - accessValue, - rootPath, - "minio", - iamEndPoint, - useSSL, - useIam}; + StorageConfig storage_config; + storage_config.storage_type = "local"; + storage_config.root_path = TestRemotePath; + return storage_config; } void @@ -106,7 +79,7 @@ delete_cstorage_config(CStorageConfig config) { delete[] config.bucket_name; delete[] config.access_key_id; delete[] config.access_key_value; - delete[] config.remote_root_path; + delete[] config.root_path; delete[] config.storage_type; delete[] config.iam_endpoint; } @@ -165,22 +138,22 @@ class TestConfigWrapper { auto bucketName = minioConfig["bucketName"].as(); std::string storage_type = "minio"; - config_.address = new char[address.length() + 1]; + config_.address = new char[endpoint.length() + 1]; config_.bucket_name = new char[bucketName.length() + 1]; config_.access_key_id = new char[accessKey.length() + 1]; config_.access_key_value = new char[accessValue.length() + 1]; - config_.remote_root_path = new char[rootPath.length() + 1]; + config_.root_path = new char[rootPath.length() + 1]; config_.storage_type = new char[storage_type.length() + 1]; config_.iam_endpoint = new char[iamEndPoint.length() + 1]; config_.useSSL = useSSL; config_.useIAM = useIam; - strcpy(const_cast(config_.address), address.c_str()); + strcpy(const_cast(config_.address), endpoint.c_str()); strcpy(const_cast(config_.bucket_name), bucketName.c_str()); strcpy(const_cast(config_.access_key_id), accessKey.c_str()); strcpy(const_cast(config_.access_key_value), accessValue.c_str()); - strcpy(const_cast(config_.remote_root_path), rootPath.c_str()); + strcpy(const_cast(config_.root_path), rootPath.c_str()); strcpy(const_cast(config_.storage_type), storage_type.c_str()); strcpy(const_cast(config_.iam_endpoint), iamEndPoint.c_str()); } @@ -249,6 +222,7 @@ generate_build_conf(const milvus::IndexType& index_type, {milvus::index::DISK_ANN_SEARCH_LIST_SIZE, std::to_string(128)}, {milvus::index::DISK_ANN_PQ_CODE_BUDGET, std::to_string(0.001)}, {milvus::index::DISK_ANN_BUILD_DRAM_BUDGET, std::to_string(32)}, + {milvus::index::DISK_ANN_BUILD_THREAD_NUM, std::to_string(2)}, }; } return knowhere::Json(); @@ -262,6 +236,7 @@ generate_load_conf(const milvus::IndexType& index_type, return knowhere::Json{ {knowhere::meta::METRIC_TYPE, metric_type}, {knowhere::meta::DIM, std::to_string(DIM)}, + {milvus::index::DISK_ANN_LOAD_THREAD_NUM, std::to_string(2)}, {milvus::index::DISK_ANN_SEARCH_CACHE_BUDGET, std::to_string(0.0002)}, }; @@ -287,8 +262,8 @@ generate_search_conf(const milvus::IndexType& index_type, {knowhere::meta::METRIC_TYPE, metric_type}, }; - if (milvus::index::is_in_list(index_type, - search_with_nprobe_list)) { + if (milvus::is_in_list(index_type, + search_with_nprobe_list)) { conf[knowhere::indexparam::NPROBE] = 4; } else if (index_type == knowhere::IndexEnum::INDEX_HNSW) { conf[knowhere::indexparam::EF] = 200; @@ -313,8 +288,8 @@ generate_range_search_conf(const milvus::IndexType& index_type, 
conf[knowhere::meta::RANGE_FILTER] = 0.1; } - if (milvus::index::is_in_list(index_type, - search_with_nprobe_list)) { + if (milvus::is_in_list(index_type, + search_with_nprobe_list)) { conf[knowhere::indexparam::NPROBE] = 4; } else if (index_type == knowhere::IndexEnum::INDEX_HNSW) { conf[knowhere::indexparam::EF] = 200; diff --git a/internal/datacoord/index_service.go b/internal/datacoord/index_service.go index c1d2ea9e68..015b059e26 100644 --- a/internal/datacoord/index_service.go +++ b/internal/datacoord/index_service.go @@ -736,6 +736,8 @@ func (s *Server) GetIndexInfos(ctx context.Context, req *indexpb.GetIndexInfoReq if segIdx.IndexState == commonpb.IndexState_Finished { indexFilePaths := metautil.BuildSegmentIndexFilePaths(s.meta.chunkManager.RootPath(), segIdx.BuildID, segIdx.IndexVersion, segIdx.PartitionID, segIdx.SegmentID, segIdx.IndexFileKeys) + indexParams := s.meta.GetIndexParams(segIdx.CollectionID, segIdx.IndexID) + indexParams = append(indexParams, s.meta.GetTypeParams(segIdx.CollectionID, segIdx.IndexID)...) ret.SegmentInfo[segID].IndexInfos = append(ret.SegmentInfo[segID].IndexInfos, &indexpb.IndexFilePathInfo{ SegmentID: segID, @@ -743,7 +745,7 @@ func (s *Server) GetIndexInfos(ctx context.Context, req *indexpb.GetIndexInfoReq IndexID: segIdx.IndexID, BuildID: segIdx.BuildID, IndexName: s.meta.GetIndexNameByID(segIdx.CollectionID, segIdx.IndexID), - IndexParams: s.meta.GetIndexParams(segIdx.CollectionID, segIdx.IndexID), + IndexParams: indexParams, IndexFilePaths: indexFilePaths, SerializedSize: segIdx.IndexSize, IndexVersion: segIdx.IndexVersion, diff --git a/internal/indexnode/errors.go b/internal/indexnode/errors.go index 22b46a6282..4705b2cc89 100644 --- a/internal/indexnode/errors.go +++ b/internal/indexnode/errors.go @@ -23,7 +23,8 @@ import ( ) var ( - ErrNoSuchKey = errors.New("NoSuchKey") + ErrNoSuchKey = errors.New("NoSuchKey") + ErrEmptyInsertPaths = errors.New("empty insert paths") ) // msgIndexNodeIsUnhealthy return a message tha IndexNode is not healthy. 
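For context on the errors.go hunk above: the new ErrEmptyInsertPaths sentinel is returned by the parseFieldMetaFromBinlog helper added to internal/indexnode/task.go further down in this patch, when a CreateJob request carries no insert binlog paths. A minimal sketch of the intended error handling, assuming the standard library errors package; readFirstBinlog is an illustrative name, not a function in the patch:

package indexnode

import (
	"context"
	"errors"

	"github.com/milvus-io/milvus/internal/storage"
)

// readFirstBinlog is a hypothetical stand-in for the start of parseFieldMetaFromBinlog:
// it rejects an empty path list up front and lets callers branch on the sentinels via errors.Is.
func readFirstBinlog(ctx context.Context, cm storage.ChunkManager, paths []string) ([]byte, error) {
	if len(paths) == 0 {
		return nil, ErrEmptyInsertPaths
	}
	data, err := cm.Read(ctx, paths[0])
	if err != nil {
		if errors.Is(err, ErrNoSuchKey) {
			// the object is missing from storage; surface the dedicated sentinel
			return nil, ErrNoSuchKey
		}
		return nil, err
	}
	return data, nil
}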
diff --git a/internal/indexnode/indexnode.go b/internal/indexnode/indexnode.go index 3677158c63..fe61c4376b 100644 --- a/internal/indexnode/indexnode.go +++ b/internal/indexnode/indexnode.go @@ -138,7 +138,7 @@ func (i *IndexNode) Register() error { return nil } -func (i *IndexNode) initKnowhere() { +func (i *IndexNode) initSegcore() { cEasyloggingYaml := C.CString(path.Join(Params.BaseTable.GetConfigDir(), paramtable.DefaultEasyloggingYaml)) C.IndexBuilderInit(cEasyloggingYaml) C.free(unsafe.Pointer(cEasyloggingYaml)) @@ -159,7 +159,7 @@ func (i *IndexNode) initKnowhere() { C.InitCpuNum(cCPUNum) localDataRootPath := filepath.Join(Params.LocalStorageCfg.Path.GetValue(), typeutil.IndexNodeRole) - initcore.InitLocalStorageConfig(localDataRootPath) + initcore.InitLocalChunkManager(localDataRootPath) } func (i *IndexNode) initSession() error { @@ -193,7 +193,7 @@ func (i *IndexNode) Init() error { log.Info("IndexNode NewMinIOKV succeeded") - i.initKnowhere() + i.initSegcore() }) log.Info("Init IndexNode finished", zap.Error(initErr)) diff --git a/internal/indexnode/indexnode_service_test.go b/internal/indexnode/indexnode_service_test.go index d05ffb8f57..1b3bab174f 100644 --- a/internal/indexnode/indexnode_service_test.go +++ b/internal/indexnode/indexnode_service_test.go @@ -18,343 +18,17 @@ package indexnode import ( "context" - "fmt" - "math/rand" - "sync" "testing" - "time" + + "github.com/stretchr/testify/assert" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb" "github.com/milvus-io/milvus/internal/proto/indexpb" "github.com/milvus-io/milvus/internal/proto/internalpb" - "github.com/milvus-io/milvus/pkg/common" - "github.com/milvus-io/milvus/pkg/util/metautil" "github.com/milvus-io/milvus/pkg/util/metricsinfo" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" ) -func genStorageConfig() *indexpb.StorageConfig { - return &indexpb.StorageConfig{ - Address: Params.MinioCfg.Address.GetValue(), - AccessKeyID: Params.MinioCfg.AccessKeyID.GetValue(), - SecretAccessKey: Params.MinioCfg.SecretAccessKey.GetValue(), - BucketName: Params.MinioCfg.BucketName.GetValue(), - RootPath: Params.MinioCfg.RootPath.GetValue(), - IAMEndpoint: Params.MinioCfg.IAMEndpoint.GetValue(), - UseSSL: Params.MinioCfg.UseSSL.GetAsBool(), - UseIAM: Params.MinioCfg.UseIAM.GetAsBool(), - } -} - -func TestIndexNodeSimple(t *testing.T) { - in, err := NewMockIndexNodeComponent(context.TODO()) - require.Nil(t, err) - defer in.Stop() - ctx := context.TODO() - state, err := in.GetComponentStates(ctx) - assert.NoError(t, err) - assert.Equal(t, state.Status.ErrorCode, commonpb.ErrorCode_Success) - assert.Equal(t, state.State.StateCode, commonpb.StateCode_Healthy) - - assert.Nil(t, err, err) - var ( - clusterID = "test-milvus" - idxFilePrefix = "mock_idx" - buildID int64 = 1 - collID int64 = 101 - partID int64 = 201 - segID int64 = 301 - idxID int64 = 401 - idxName = "mock_idx" - vecDim int64 = 8 - typeParams = []*commonpb.KeyValuePair{ - { - Key: common.DimKey, - Value: fmt.Sprintf("%d", vecDim), - }, - } - indexParams = []*commonpb.KeyValuePair{ - { - Key: common.MetricTypeKey, - Value: "L2", - }, - { - Key: common.IndexTypeKey, - Value: "IVF_FLAT", - }, - { - Key: "nlist", - Value: "128", - }, - } - mockChunkMgr = mockChunkMgrIns - ) - - mockChunkMgr.mockFieldData(1000, dim, collID, partID, segID) - t.Run("create job", func(t *testing.T) { - createReq := &indexpb.CreateJobRequest{ - ClusterID: clusterID, - IndexFilePrefix: idxFilePrefix, - 
BuildID: buildID, - DataPaths: []string{dataPath(collID, partID, segID)}, - IndexVersion: 0, - IndexID: idxID, - IndexName: idxName, - IndexParams: indexParams, - TypeParams: typeParams, - StorageConfig: genStorageConfig(), - } - status, err := in.CreateJob(ctx, createReq) - assert.NoError(t, err) - assert.Equal(t, status.ErrorCode, commonpb.ErrorCode_Success) - }) - - t.Run(("query job"), func(t *testing.T) { - queryJob := &indexpb.QueryJobsRequest{ - ClusterID: clusterID, - BuildIDs: []int64{buildID}, - } - var idxInfo *indexpb.IndexTaskInfo - timeoutCtx, cancel := context.WithTimeout(context.Background(), time.Second*10) - defer cancel() - Loop: - for { - select { - case <-timeoutCtx.Done(): - t.Fatal("timeout for querying jobs") - default: - time.Sleep(1 * time.Millisecond) - resp, err := in.QueryJobs(ctx, queryJob) - assert.NoError(t, err) - assert.Equal(t, resp.Status.ErrorCode, commonpb.ErrorCode_Success) - assert.Equal(t, resp.ClusterID, clusterID) - - for _, indexInfo := range resp.IndexInfos { - if indexInfo.BuildID == buildID { - if indexInfo.State == commonpb.IndexState_Finished { - idxInfo = indexInfo - break Loop - } - } - } - - } - } - - assert.NotNil(t, idxInfo) - for _, idxFileID := range idxInfo.IndexFileKeys { - idxFile := metautil.BuildSegmentIndexFilePath(mockChunkMgr.RootPath(), buildID, 0, - partID, segID, idxFileID) - _, ok := mockChunkMgr.indexedData.Load(idxFile) - assert.True(t, ok) - t.Logf("indexed file: %s", idxFile) - } - - jobNumRet, err := in.GetJobStats(ctx, &indexpb.GetJobStatsRequest{}) - assert.NoError(t, err) - assert.Equal(t, jobNumRet.Status.GetErrorCode(), commonpb.ErrorCode_Success) - assert.Equal(t, jobNumRet.TotalJobNum, int64(0)) - assert.Equal(t, jobNumRet.InProgressJobNum, int64(0)) - assert.Equal(t, jobNumRet.EnqueueJobNum, int64(0)) - assert.Equal(t, jobNumRet.TaskSlots, int64(1)) - assert.Equal(t, len(jobNumRet.JobInfos), 1) - jobInfo := jobNumRet.JobInfos[0] - - assert.True(t, jobInfo.Dim == 8) - assert.True(t, jobInfo.NumRows == 1000) - assert.True(t, jobInfo.PodID == 1) - assert.ElementsMatch(t, jobInfo.IndexParams, indexParams) - }) - - t.Run("drop not exists jobs", func(t *testing.T) { - status, err := in.DropJobs(ctx, &indexpb.DropJobsRequest{ - ClusterID: clusterID, - BuildIDs: []int64{100001}, - }) - assert.NoError(t, err) - assert.Equal(t, status.ErrorCode, commonpb.ErrorCode_Success) - }) -} - -type testTask struct { - buildID int64 - collID int64 - partID int64 - segID int64 - idxID int64 - dim int - rownum int - typeParams []*commonpb.KeyValuePair - idxParams []*commonpb.KeyValuePair -} - -func TestIndexNodeComplex(t *testing.T) { - var ( - clusterID string - buildID0 int64 - collID0 int64 = 10000 - partID0 int64 = 20000 - segID0 int64 = 30000 - idxID0 int64 = 40000 - typesParamsLists = [][]*commonpb.KeyValuePair{ - {{ - Key: common.DimKey, - Value: fmt.Sprintf("%d", 8), - }}, - {{ - Key: common.DimKey, - Value: fmt.Sprintf("%d", 16), - }}, - {{ - Key: common.DimKey, - Value: fmt.Sprintf("%d", 32), - }}, - } - rowNums = []int{100, 1000, 10000} - dims = []int{8, 16, 32} - indexParams = []*commonpb.KeyValuePair{ - { - Key: "nlist", - Value: "128", - }, - { - Key: common.MetricTypeKey, - Value: "L2", - }, - { - Key: common.IndexTypeKey, - Value: "IVF_FLAT", - }, - } - ) - in, err := NewMockIndexNodeComponent(context.TODO()) - require.Nil(t, err) - defer in.Stop() - ctx := context.TODO() - state, err := in.GetComponentStates(ctx) - assert.NoError(t, err) - assert.Equal(t, state.Status.ErrorCode, commonpb.ErrorCode_Success) - 
assert.Equal(t, state.State.StateCode, commonpb.StateCode_Healthy) - - mockChunkMgr := mockChunkMgrIns - - tasks := make([]*testTask, 0) - var i int64 - t.Logf("preparing mock data...") - wg := sync.WaitGroup{} - for i = 0; i < 10; i++ { - task := &testTask{ - buildID: i + buildID0, - collID: i + collID0, - partID: i + partID0, - segID: i + segID0, - idxID: i + idxID0, - typeParams: typesParamsLists[i%3], - dim: dims[i%3], - rownum: rowNums[i%3], - idxParams: indexParams, - } - wg.Add(1) - go func() { - defer wg.Done() - if rand.Float32() < 0.5 { - mockChunkMgr.mockFieldData(task.rownum, task.dim, task.collID, task.partID, task.segID) - } - }() - tasks = append(tasks, task) - } - wg.Wait() - - t.Logf("start concurent testing") - testwg := sync.WaitGroup{} - for i := 0; i < len(tasks); i++ { - req := &indexpb.CreateJobRequest{ - ClusterID: clusterID, - IndexFilePrefix: "mock_idx", - BuildID: tasks[i].buildID, - DataPaths: []string{dataPath(tasks[i].collID, tasks[i].partID, tasks[i].segID)}, - IndexVersion: 0, - IndexID: tasks[i].idxID, - IndexName: fmt.Sprintf("idx%d", tasks[i].idxID), - IndexParams: tasks[i].idxParams, - TypeParams: tasks[i].typeParams, - StorageConfig: genStorageConfig(), - } - testwg.Add(1) - go func() { - defer testwg.Done() - status, err := in.CreateJob(ctx, req) - assert.NoError(t, err) - assert.Equal(t, status.ErrorCode, commonpb.ErrorCode_Success) - }() - - testwg.Add(1) - go func(idx int) { - defer testwg.Done() - if rand.Float32() < 0.5 { - status, err := in.DropJobs(ctx, &indexpb.DropJobsRequest{ - ClusterID: clusterID, - BuildIDs: []int64{tasks[idx].buildID}, - }) - assert.NoError(t, err) - assert.Equal(t, status.ErrorCode, commonpb.ErrorCode_Success) - } - }(i) - } - testwg.Wait() - timeoutCtx, cancel := context.WithTimeout(ctx, time.Second*30) - defer cancel() -Loop: - for { - select { - case <-timeoutCtx.Done(): - t.Fatal("timeout testing") - default: - jobNumRet, err := in.GetJobStats(ctx, &indexpb.GetJobStatsRequest{}) - assert.NoError(t, err) - assert.Equal(t, jobNumRet.Status.ErrorCode, commonpb.ErrorCode_Success) - if jobNumRet.TotalJobNum == 0 { - break Loop - } - time.Sleep(time.Second) - } - } - buildIDs := make([]int64, 0, len(tasks)) - for _, task := range tasks { - buildIDs = append(buildIDs, task.buildID) - } - jobresp, err := in.QueryJobs(ctx, &indexpb.QueryJobsRequest{ - ClusterID: clusterID, - BuildIDs: buildIDs, - }) - assert.NoError(t, err) - assert.Equal(t, jobresp.Status.ErrorCode, jobresp.Status.ErrorCode) - - for _, job := range jobresp.IndexInfos { - task := tasks[job.BuildID-buildID0] - if job.State == commonpb.IndexState_Finished { - for _, idxFileID := range job.IndexFileKeys { - idxFile := metautil.BuildSegmentIndexFilePath(mockChunkMgr.RootPath(), task.buildID, - 0, task.partID, task.segID, idxFileID) - _, ok := mockChunkMgr.indexedData.Load(idxFile) - assert.True(t, ok) - } - t.Logf("buildID: %d, indexFiles: %v", job.BuildID, job.IndexFileKeys) - } else { - _, ok := mockChunkMgr.indexedData.Load(dataPath(task.collID, task.partID, task.segID)) - assert.False(t, ok) - } - } - - // stop indexnode - assert.Nil(t, in.Stop()) - node := in.(*mockIndexNodeComponent).IndexNode - assert.Equal(t, 0, len(node.tasks)) - assert.Equal(t, commonpb.StateCode_Abnormal, node.lifetime.GetState()) -} - func TestAbnormalIndexNode(t *testing.T) { in, err := NewMockIndexNodeComponent(context.TODO()) assert.NoError(t, err) diff --git a/internal/indexnode/task.go b/internal/indexnode/task.go index fadb2ca59a..2c23529c96 100644 --- 
a/internal/indexnode/task.go +++ b/internal/indexnode/task.go @@ -40,9 +40,7 @@ import ( "github.com/milvus-io/milvus/pkg/util/funcutil" "github.com/milvus-io/milvus/pkg/util/indexparamcheck" "github.com/milvus-io/milvus/pkg/util/indexparams" - "github.com/milvus-io/milvus/pkg/util/metautil" "github.com/milvus-io/milvus/pkg/util/paramtable" - "github.com/milvus-io/milvus/pkg/util/retry" "github.com/milvus-io/milvus/pkg/util/timerecord" ) @@ -94,6 +92,7 @@ type indexBuildTask struct { partitionID UniqueID segmentID UniqueID fieldID UniqueID + fieldType schemapb.DataType fieldData storage.FieldData indexBlobs []*storage.Blob newTypeParams map[string]string @@ -241,214 +240,144 @@ func (it *indexBuildTask) LoadData(ctx context.Context) error { } func (it *indexBuildTask) BuildIndex(ctx context.Context) error { - // support build diskann index - indexType := it.newIndexParams[common.IndexTypeKey] - if indexType == indexparamcheck.IndexDISKANN { - return it.BuildDiskAnnIndex(ctx) + err := it.parseFieldMetaFromBinlog(ctx) + if err != nil { + log.Ctx(ctx).Warn("parse field meta from binlog failed", zap.Error(err)) + return err } - dataset := indexcgowrapper.GenDataset(it.fieldData) - dType := dataset.DType - var err error - if dType != schemapb.DataType_None { - it.index, err = indexcgowrapper.NewCgoIndex(dType, it.newTypeParams, it.newIndexParams, it.req.GetStorageConfig()) - if err == nil { - err = it.index.Build(dataset) + indexType := it.newIndexParams[common.IndexTypeKey] + if indexType == indexparamcheck.IndexDISKANN { + // check index node support disk index + if !Params.IndexNodeCfg.EnableDisk.GetAsBool() { + log.Ctx(ctx).Warn("IndexNode don't support build disk index", + zap.String("index type", it.newIndexParams[common.IndexTypeKey]), + zap.Bool("enable disk", Params.IndexNodeCfg.EnableDisk.GetAsBool())) + return errors.New("index node don't support build disk index") } + // check load size and size of field data + localUsedSize, err := indexcgowrapper.GetLocalUsedSize(paramtable.Get().LocalStorageCfg.Path.GetValue()) if err != nil { - log.Ctx(ctx).Error("failed to build index", zap.Error(err)) + log.Ctx(ctx).Warn("IndexNode get local used size failed") return err } + fieldDataSize, err := estimateFieldDataSize(it.statistic.Dim, it.req.GetNumRows(), it.fieldType) + if err != nil { + log.Ctx(ctx).Warn("IndexNode get local used size failed") + return err + } + usedLocalSizeWhenBuild := int64(float64(fieldDataSize)*diskUsageRatio) + localUsedSize + maxUsedLocalSize := int64(Params.IndexNodeCfg.DiskCapacityLimit.GetAsFloat() * Params.IndexNodeCfg.MaxDiskUsagePercentage.GetAsFloat()) + + if usedLocalSizeWhenBuild > maxUsedLocalSize { + log.Ctx(ctx).Warn("IndexNode don't has enough disk size to build disk ann index", + zap.Int64("usedLocalSizeWhenBuild", usedLocalSizeWhenBuild), + zap.Int64("maxUsedLocalSize", maxUsedLocalSize)) + return errors.New("index node don't has enough disk size to build disk ann index") + } + + err = indexparams.SetDiskIndexBuildParams(it.newIndexParams, int64(fieldDataSize)) + if err != nil { + log.Ctx(ctx).Warn("failed to fill disk index params", zap.Error(err)) + return err + } + } + + var buildIndexInfo *indexcgowrapper.BuildIndexInfo + buildIndexInfo, err = indexcgowrapper.NewBuildIndexInfo(it.req.GetStorageConfig()) + defer indexcgowrapper.DeleteBuildIndexInfo(buildIndexInfo) + if err != nil { + log.Ctx(ctx).Warn("create build index info failed", zap.Error(err)) + return err + } + err = buildIndexInfo.AppendFieldMetaInfo(it.collectionID, it.partitionID, 
it.segmentID, it.fieldID, it.fieldType) + if err != nil { + log.Ctx(ctx).Warn("append field meta failed", zap.Error(err)) + return err + } + + err = buildIndexInfo.AppendIndexMetaInfo(it.req.IndexID, it.req.BuildID, it.req.IndexVersion) + if err != nil { + log.Ctx(ctx).Warn("append index meta failed", zap.Error(err)) + return err + } + + err = buildIndexInfo.AppendBuildIndexParam(it.newIndexParams) + if err != nil { + log.Ctx(ctx).Warn("append index params failed", zap.Error(err)) + return err + } + + jsonIndexParams, err := json.Marshal(it.newIndexParams) + if err != nil { + log.Ctx(ctx).Error("failed to json marshal index params", zap.Error(err)) + return err + } + + log.Ctx(ctx).Info("index params are ready", + zap.Int64("buildID", it.BuildID), + zap.String("index params", string(jsonIndexParams))) + + err = buildIndexInfo.AppendBuildTypeParam(it.newTypeParams) + if err != nil { + log.Ctx(ctx).Warn("append type params failed", zap.Error(err)) + return err + } + + for _, path := range it.req.GetDataPaths() { + err = buildIndexInfo.AppendInsertFile(path) + if err != nil { + log.Ctx(ctx).Warn("append insert binlog path failed", zap.Error(err)) + return err + } + } + + it.index, err = indexcgowrapper.CreateIndex(ctx, buildIndexInfo) + if err != nil { + if it.index != nil && it.index.CleanLocalData() != nil { + log.Ctx(ctx).Error("failed to clean cached data on disk after build index failed", + zap.Int64("buildID", it.BuildID), + zap.Int64("index version", it.req.GetIndexVersion())) + } + log.Ctx(ctx).Error("failed to build index", zap.Error(err)) + return err } buildIndexLatency := it.tr.RecordSpan() metrics.IndexNodeKnowhereBuildIndexLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(float64(buildIndexLatency.Milliseconds())) - indexBlobs, err := it.index.Serialize() + log.Ctx(ctx).Info("Successfully build index", zap.Int64("buildID", it.BuildID), zap.Int64("Collection", it.collectionID), zap.Int64("SegmentID", it.segmentID)) + return nil +} + +func (it *indexBuildTask) SaveIndexFiles(ctx context.Context) error { + indexFilePath2Size, err := it.index.UpLoad() if err != nil { - log.Ctx(ctx).Error("IndexNode index Serialize failed", zap.Error(err)) + log.Ctx(ctx).Error("failed to upload index", zap.Error(err)) return err } - - log.Ctx(ctx).Info("index serialize done", zap.Int64("buildID", it.BuildID), - zap.Duration("duration", it.tr.RecordSpan())) - - // use serialized size before encoding - it.serializedSize = 0 - for _, blob := range indexBlobs { - it.serializedSize += uint64(len(blob.Value)) - } + encodeIndexFileDur := it.tr.Record("index serialize and upload done") + metrics.IndexNodeEncodeIndexFileLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(float64(encodeIndexFileDur.Milliseconds())) // early release index for gc, and we can ensure that Delete is idempotent. 
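// (it.index here is the cgo-backed index handle returned by indexcgowrapper.CreateIndex;
// UpLoad above has already written the index files to storage, so deleting the handle at
// this point only releases native memory, while the file keys and sizes gathered below
// are what gets reported back for the build task.)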
if err := it.index.Delete(); err != nil { log.Ctx(ctx).Error("IndexNode indexBuildTask Execute CIndexDelete failed", zap.Error(err)) } - var serializedIndexBlobs []*storage.Blob - codec := storage.NewIndexFileBinlogCodec() - serializedIndexBlobs, err = codec.Serialize( - it.req.BuildID, - it.req.IndexVersion, - it.collectionID, - it.partitionID, - it.segmentID, - it.fieldID, - it.newIndexParams, - it.req.IndexName, - it.req.IndexID, - indexBlobs, - ) - if err != nil { - log.Warn("failed to serialize index", zap.Error(err)) - return err - } - encodeIndexFileDur := it.tr.RecordSpan() - metrics.IndexNodeEncodeIndexFileLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(float64(encodeIndexFileDur.Milliseconds())) - it.indexBlobs = serializedIndexBlobs - log.Ctx(ctx).Info("Successfully build index", zap.Int64("buildID", it.BuildID), - zap.Int64("Collection", it.collectionID), zap.Int64("SegmentID", it.segmentID)) - return nil -} - -func (it *indexBuildTask) BuildDiskAnnIndex(ctx context.Context) error { - // check index node support disk index - if !Params.IndexNodeCfg.EnableDisk.GetAsBool() { - log.Ctx(ctx).Error("IndexNode don't support build disk index", - zap.String("index type", it.newIndexParams[common.IndexTypeKey]), - zap.Bool("enable disk", Params.IndexNodeCfg.EnableDisk.GetAsBool())) - return errors.New("index node don't support build disk index") - } - - // check load size and size of field data - localUsedSize, err := indexcgowrapper.GetLocalUsedSize() - if err != nil { - log.Ctx(ctx).Error("IndexNode get local used size failed") - return errors.New("index node get local used size failed") - } - - usedLocalSizeWhenBuild := int64(float64(it.fieldData.GetMemorySize())*diskUsageRatio) + localUsedSize - maxUsedLocalSize := int64(Params.IndexNodeCfg.DiskCapacityLimit.GetAsFloat() * Params.IndexNodeCfg.MaxDiskUsagePercentage.GetAsFloat()) - - if usedLocalSizeWhenBuild > maxUsedLocalSize { - log.Ctx(ctx).Error("IndexNode don't has enough disk size to build disk ann index", - zap.Int64("usedLocalSizeWhenBuild", usedLocalSizeWhenBuild), - zap.Int64("maxUsedLocalSize", maxUsedLocalSize)) - return errors.New("index node don't has enough disk size to build disk ann index") - } - - dataset := indexcgowrapper.GenDataset(it.fieldData) - dType := dataset.DType - if dType != schemapb.DataType_None { - // TODO:: too ugly - it.newIndexParams["collection_id"] = strconv.FormatInt(it.collectionID, 10) - it.newIndexParams["partition_id"] = strconv.FormatInt(it.partitionID, 10) - it.newIndexParams["segment_id"] = strconv.FormatInt(it.segmentID, 10) - it.newIndexParams["field_id"] = strconv.FormatInt(it.fieldID, 10) - it.newIndexParams["index_build_id"] = strconv.FormatInt(it.req.GetBuildID(), 10) - it.newIndexParams["index_id"] = strconv.FormatInt(it.req.IndexID, 10) - it.newIndexParams["index_version"] = strconv.FormatInt(it.req.GetIndexVersion(), 10) - - err = indexparams.SetDiskIndexBuildParams(it.newIndexParams, it.statistic.NumRows) - if err != nil { - log.Ctx(ctx).Error("failed to fill disk index params", zap.Error(err)) - return err - } - jsonIndexParams, err := json.Marshal(it.newIndexParams) - if err != nil { - log.Ctx(ctx).Error("failed to json marshal index params", zap.Error(err)) - return err - } - log.Ctx(ctx).Info("disk index params are ready", - zap.Int64("buildID", it.BuildID), - zap.String("index params", string(jsonIndexParams))) - - it.index, err = indexcgowrapper.NewCgoIndex(dType, it.newTypeParams, it.newIndexParams, it.req.GetStorageConfig()) - if err != 
nil { - log.Ctx(ctx).Error("failed to create index", zap.Error(err)) - } else { - err = it.index.Build(dataset) - } - - if err != nil { - if it.index != nil && it.index.CleanLocalData() != nil { - log.Ctx(ctx).Error("failed to clean cached data on disk after build index failed", - zap.Int64("buildID", it.BuildID), - zap.Int64("index version", it.req.GetIndexVersion())) - } - log.Ctx(ctx).Error("failed to build index", zap.Error(err)) - return err - } - } - - buildIndexLatency := it.tr.RecordSpan() - metrics.IndexNodeKnowhereBuildIndexLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(float64(buildIndexLatency.Milliseconds())) - - fileInfos, err := it.index.GetIndexFileInfo() - if err != nil { - log.Ctx(ctx).Error("IndexNode index Serialize failed", zap.Error(err)) - return err - } - - log.Ctx(ctx).Info("index serialize done", zap.Int64("buildID", it.BuildID), - zap.Duration("duration", it.tr.RecordSpan())) - // use serialized size before encoding it.serializedSize = 0 - for _, info := range fileInfos { - it.serializedSize += uint64(info.FileSize) - it.indexBlobs = append(it.indexBlobs, &storage.Blob{ - Key: info.FileName, - Size: info.FileSize, - }) + saveFileKeys := make([]string, 0) + for filePath, fileSize := range indexFilePath2Size { + it.serializedSize += uint64(fileSize) + parts := strings.Split(filePath, "/") + fileKey := parts[len(parts)-1] + saveFileKeys = append(saveFileKeys, fileKey) } - // early release index for gc, and we can ensure that Delete is idempotent. - if err := it.index.Delete(); err != nil { - log.Ctx(it.ctx).Error("IndexNode indexBuildTask Execute CIndexDelete failed", zap.Error(err)) - } - - encodeIndexFileDur := it.tr.RecordSpan() - metrics.IndexNodeEncodeIndexFileLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(float64(encodeIndexFileDur.Milliseconds())) - return nil -} - -func (it *indexBuildTask) SaveIndexFiles(ctx context.Context) error { - // support build diskann index - indexType := it.newIndexParams[common.IndexTypeKey] - if indexType == indexparamcheck.IndexDISKANN { - return it.SaveDiskAnnIndexFiles(ctx) - } - - blobCnt := len(it.indexBlobs) - savePaths := make([]string, blobCnt) - saveFileKeys := make([]string, blobCnt) - - saveIndexFile := func(idx int) error { - blob := it.indexBlobs[idx] - savePath := metautil.BuildSegmentIndexFilePath(it.cm.RootPath(), it.req.BuildID, - it.req.IndexVersion, it.partitionID, it.segmentID, blob.Key) - saveFn := func() error { - return it.cm.Write(ctx, savePath, blob.Value) - } - if err := retry.Do(ctx, saveFn, retry.Attempts(5)); err != nil { - log.Ctx(ctx).Warn("index node save index file failed", zap.Error(err), zap.String("savePath", savePath)) - return err - } - savePaths[idx] = savePath - saveFileKeys[idx] = blob.Key - return nil - } - - // If an error occurs, return the error that the task state will be set to retry. 
- if err := funcutil.ProcessFuncParallel(blobCnt, runtime.NumCPU(), saveIndexFile, "saveIndexFile"); err != nil { - log.Ctx(ctx).Error("saveIndexFile fail") - return err - } - it.savePaths = savePaths it.statistic.EndTime = time.Now().UnixMicro() it.node.storeIndexFilesAndStatistic(it.ClusterID, it.BuildID, saveFileKeys, it.serializedSize, &it.statistic) - log.Ctx(ctx).Info("save index files done", zap.Strings("IndexFiles", savePaths)) + log.Ctx(ctx).Debug("save index files done", zap.Strings("IndexFiles", saveFileKeys)) saveIndexFileDur := it.tr.RecordSpan() metrics.IndexNodeSaveIndexFileLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(float64(saveIndexFileDur.Milliseconds())) it.tr.Elapse("index building all done") @@ -457,63 +386,37 @@ func (it *indexBuildTask) SaveIndexFiles(ctx context.Context) error { return nil } -func (it *indexBuildTask) SaveDiskAnnIndexFiles(ctx context.Context) error { - savePaths := make([]string, len(it.indexBlobs)) - saveFileKeys := make([]string, len(it.indexBlobs)) - - for i, blob := range it.indexBlobs { - savePath := blob.Key - savePaths[i] = savePath - - // TODO: unify blob key to file key instead of full path - parts := strings.Split(blob.Key, "/") - if len(parts) == 0 { - return fmt.Errorf("invaild blob key: %s", blob.Key) +func (it *indexBuildTask) parseFieldMetaFromBinlog(ctx context.Context) error { + toLoadDataPaths := it.req.GetDataPaths() + if len(toLoadDataPaths) == 0 { + return ErrEmptyInsertPaths + } + data, err := it.cm.Read(ctx, toLoadDataPaths[0]) + if err != nil { + if errors.Is(err, ErrNoSuchKey) { + return ErrNoSuchKey } - fileKey := parts[len(parts)-1] - saveFileKeys[i] = fileKey + return err } - // add indexparams file - codec := storage.NewIndexFileBinlogCodec() - indexParamBlob, err := codec.SerializeIndexParams( - it.req.GetBuildID(), - it.req.GetIndexVersion(), - it.collectionID, - it.partitionID, - it.segmentID, - it.fieldID, - it.newIndexParams, - it.req.IndexName, - it.req.IndexID, - ) + var insertCodec storage.InsertCodec + collectionID, partitionID, segmentID, insertData, err := insertCodec.DeserializeAll([]*Blob{{Key: toLoadDataPaths[0], Value: data}}) if err != nil { return err } - - indexParamPath := metautil.BuildSegmentIndexFilePath(it.cm.RootPath(), it.req.BuildID, it.req.IndexVersion, - it.partitionID, it.segmentID, indexParamBlob.Key) - - saveFn := func() error { - return it.cm.Write(ctx, indexParamPath, indexParamBlob.Value) - } - if err := retry.Do(ctx, saveFn, retry.Attempts(5)); err != nil { - log.Ctx(ctx).Warn("index node save index param file failed", zap.Error(err), zap.String("savePath", indexParamPath)) - return err + if len(insertData.Data) != 1 { + return errors.New("we expect only one field in deserialized insert data") } - saveFileKeys = append(saveFileKeys, indexParamBlob.Key) - savePaths = append(savePaths, indexParamPath) - it.savePaths = savePaths + it.collectionID = collectionID + it.partitionID = partitionID + it.segmentID = segmentID + for fID, value := range insertData.Data { + it.fieldType = indexcgowrapper.GenDataset(value).DType + it.fieldID = fID + break + } - it.statistic.EndTime = time.Now().UnixMicro() - it.node.storeIndexFilesAndStatistic(it.ClusterID, it.BuildID, saveFileKeys, it.serializedSize, &it.statistic) - log.Ctx(ctx).Info("save index files done", zap.Strings("IndexFiles", savePaths)) - saveIndexFileDur := it.tr.RecordSpan() - metrics.IndexNodeSaveIndexFileLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 
10)).Observe(float64(saveIndexFileDur.Milliseconds())) - it.tr.Elapse("index building all done") - log.Ctx(ctx).Info("IndexNode CreateIndex successfully ", zap.Int64("collection", it.collectionID), - zap.Int64("partition", it.partitionID), zap.Int64("segment", it.segmentID)) return nil } diff --git a/internal/indexnode/task_scheduler.go b/internal/indexnode/task_scheduler.go index 103f7ed61a..eb5588cb3b 100644 --- a/internal/indexnode/task_scheduler.go +++ b/internal/indexnode/task_scheduler.go @@ -214,7 +214,7 @@ func (sched *TaskScheduler) processTask(t task, q TaskQueue) { sched.IndexBuildQueue.AddActiveTask(t) defer sched.IndexBuildQueue.PopActiveTask(t.Name()) log.Ctx(t.Ctx()).Debug("process task", zap.String("task", t.Name())) - pipelines := []func(context.Context) error{t.Prepare, t.LoadData, t.BuildIndex, t.SaveIndexFiles} + pipelines := []func(context.Context) error{t.Prepare, t.BuildIndex, t.SaveIndexFiles} for _, fn := range pipelines { if err := wrap(fn); err != nil { if err == errCancel { diff --git a/internal/indexnode/task_scheduler_test.go b/internal/indexnode/task_scheduler_test.go index b0c2bb8ddc..c7bd82e2da 100644 --- a/internal/indexnode/task_scheduler_test.go +++ b/internal/indexnode/task_scheduler_test.go @@ -164,11 +164,9 @@ func TestIndexTaskScheduler(t *testing.T) { tasks = append(tasks, newTask(fakeTaskEnqueued, nil, commonpb.IndexState_Failed), - newTask(fakeTaskLoadedData, nil, commonpb.IndexState_Failed), newTask(fakeTaskPrepared, nil, commonpb.IndexState_Failed), newTask(fakeTaskBuiltIndex, nil, commonpb.IndexState_Failed), newTask(fakeTaskSavedIndexes, nil, commonpb.IndexState_Finished), - newTask(fakeTaskSavedIndexes, map[fakeTaskState]error{fakeTaskLoadedData: ErrNoSuchKey}, commonpb.IndexState_Failed), newTask(fakeTaskSavedIndexes, map[fakeTaskState]error{fakeTaskSavedIndexes: fmt.Errorf("auth failed")}, commonpb.IndexState_Retry)) for _, task := range tasks { @@ -178,12 +176,11 @@ func TestIndexTaskScheduler(t *testing.T) { scheduler.Close() scheduler.wg.Wait() - for _, task := range tasks[:len(tasks)-2] { + for _, task := range tasks[:len(tasks)-1] { assert.Equal(t, task.GetState(), task.(*fakeTask).expectedState) assert.Equal(t, task.Ctx().(*stagectx).curstate, task.Ctx().(*stagectx).state2cancel) } - assert.Equal(t, tasks[len(tasks)-2].GetState(), tasks[len(tasks)-2].(*fakeTask).expectedState) - assert.Equal(t, tasks[len(tasks)-2].Ctx().(*stagectx).curstate, fakeTaskState(fakeTaskLoadedData)) + assert.Equal(t, tasks[len(tasks)-1].GetState(), tasks[len(tasks)-1].(*fakeTask).expectedState) assert.Equal(t, tasks[len(tasks)-1].Ctx().(*stagectx).curstate, fakeTaskState(fakeTaskSavedIndexes)) diff --git a/internal/core/src/config/ConfigChunkManager.cpp b/internal/indexnode/util.go similarity index 62% rename from internal/core/src/config/ConfigChunkManager.cpp rename to internal/indexnode/util.go index ed5c2d5f8e..0ddab2d80b 100644 --- a/internal/core/src/config/ConfigChunkManager.cpp +++ b/internal/indexnode/util.go @@ -14,20 +14,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
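The estimateFieldDataSize helper added below, together with the disk checks added to BuildIndex above, amounts to a simple admission rule for DiskANN builds: the projected local usage is the estimated raw field size times diskUsageRatio plus whatever already sits under the local storage path, and the build is rejected once that exceeds DiskCapacityLimit multiplied by MaxDiskUsagePercentage. For example, a 128-dim float vector field with 1,000,000 rows is estimated at 128 * 4 bytes * 1,000,000 = 512,000,000 bytes. A standalone sketch of the predicate follows; canBuildDiskANN and the hard-coded ratio are illustrative only, and in the patch the budget values come from Params.IndexNodeCfg:

// canBuildDiskANN mirrors the shape of the admission check in indexBuildTask.BuildIndex.
func canBuildDiskANN(fieldDataSize uint64, localUsedSize int64, diskCapacity, maxUsagePct float64) bool {
	const diskUsageRatio = 4.0 // assumed value for illustration only
	projected := int64(float64(fieldDataSize)*diskUsageRatio) + localUsedSize
	budget := int64(diskCapacity * maxUsagePct)
	return projected <= budget
}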
-#include "config/ConfigChunkManager.h" +package indexnode -namespace milvus::ChunkMangerConfig { +import ( + "unsafe" -std::string LOCAL_ROOT_PATH = "/tmp/milvus"; // NOLINT + "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" +) -void -SetLocalRootPath(const std::string_view path_prefix) { - LOCAL_ROOT_PATH = path_prefix; +func estimateFieldDataSize(dim int64, numRows int64, dataType schemapb.DataType) (uint64, error) { + if dataType == schemapb.DataType_FloatVector { + var value float32 + /* #nosec G103 */ + return uint64(dim) * uint64(numRows) * uint64(unsafe.Sizeof(value)), nil + } + if dataType == schemapb.DataType_BinaryVector { + return uint64(dim) / 8 * uint64(numRows), nil + } + + return 0, nil } - -std::string -GetLocalRootPath() { - return LOCAL_ROOT_PATH; -} - -} // namespace milvus::ChunkMangerConfig diff --git a/internal/proxy/proxy_test.go b/internal/proxy/proxy_test.go index f2791a0de9..e27c80b744 100644 --- a/internal/proxy/proxy_test.go +++ b/internal/proxy/proxy_test.go @@ -139,7 +139,7 @@ func runQueryNode(ctx context.Context, localMsg bool, alias string) *grpcqueryno wg.Add(1) go func() { - factory := dependency.NewDefaultFactory(localMsg) + factory := dependency.MockDefaultFactory(localMsg, Params) var err error qn, err = grpcquerynode.NewServer(ctx, factory) if err != nil { @@ -183,7 +183,7 @@ func runDataNode(ctx context.Context, localMsg bool, alias string) *grpcdatanode wg.Add(1) go func() { - factory := dependency.NewDefaultFactory(localMsg) + factory := dependency.MockDefaultFactory(localMsg, Params) var err error dn, err = grpcdatanode.NewServer(ctx, factory) if err != nil { @@ -207,7 +207,7 @@ func runIndexNode(ctx context.Context, localMsg bool, alias string) *grpcindexno wg.Add(1) go func() { - factory := dependency.NewDefaultFactory(localMsg) + factory := dependency.MockDefaultFactory(localMsg, Params) var err error in, err = grpcindexnode.NewServer(ctx, factory) if err != nil { @@ -347,7 +347,7 @@ func TestProxy(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) ctx = GetContext(ctx, "root:123456") localMsg := true - factory := dependency.NewDefaultFactory(localMsg) + factory := dependency.MockDefaultFactory(localMsg, Params) alias := "TestProxy" log.Info("Initialize parameter table of Proxy") diff --git a/internal/querynodev2/local_worker_test.go b/internal/querynodev2/local_worker_test.go index df9b824012..67370705af 100644 --- a/internal/querynodev2/local_worker_test.go +++ b/internal/querynodev2/local_worker_test.go @@ -73,7 +73,7 @@ func (suite *LocalWorkerTestSuite) BeforeTest(suiteName, testName string) { suite.ctx, suite.cancel = context.WithCancel(context.Background()) // init node - factory := dependency.NewDefaultFactory(true) + factory := dependency.MockDefaultFactory(true, paramtable.Get()) suite.node = NewQueryNode(suite.ctx, factory) // init etcd suite.etcdClient, err = etcd.GetEtcdClient( diff --git a/internal/querynodev2/segments/cgo_util.go b/internal/querynodev2/segments/cgo_util.go index 44a760736e..09e082822d 100644 --- a/internal/querynodev2/segments/cgo_util.go +++ b/internal/querynodev2/segments/cgo_util.go @@ -82,11 +82,13 @@ func GetCProtoBlob(cProto *C.CProto) []byte { return blob } -func GetLocalUsedSize() (int64, error) { +func GetLocalUsedSize(path string) (int64, error) { var availableSize int64 cSize := C.int64_t(availableSize) + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) - status := C.GetLocalUsedSize(&cSize) + status := C.GetLocalUsedSize(cPath, &cSize) err := 
HandleCStatus(&status, "get local used size failed") if err != nil { return 0, err diff --git a/internal/querynodev2/segments/load_field_data_info.go b/internal/querynodev2/segments/load_field_data_info.go new file mode 100644 index 0000000000..e7a3760698 --- /dev/null +++ b/internal/querynodev2/segments/load_field_data_info.go @@ -0,0 +1,66 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package segments + +/* +#cgo pkg-config: milvus_segcore +#include "segcore/load_field_data_c.h" +*/ +import "C" +import "unsafe" + +type LoadFieldDataInfo struct { + cLoadFieldDataInfo C.CLoadFieldDataInfo +} + +func newLoadFieldDataInfo() (*LoadFieldDataInfo, error) { + var cLoadFieldDataInfo C.CLoadFieldDataInfo + + status := C.NewLoadFieldDataInfo(&cLoadFieldDataInfo) + if err := HandleCStatus(&status, "newLoadFieldDataInfo failed"); err != nil { + return nil, err + } + return &LoadFieldDataInfo{cLoadFieldDataInfo: cLoadFieldDataInfo}, nil +} + +func deleteFieldDataInfo(info *LoadFieldDataInfo) { + C.DeleteLoadFieldDataInfo(info.cLoadFieldDataInfo) +} + +func (ld *LoadFieldDataInfo) appendLoadFieldInfo(fieldID int64, rowCount int64) error { + cFieldID := C.int64_t(fieldID) + cRowCount := C.int64_t(rowCount) + + status := C.AppendLoadFieldInfo(ld.cLoadFieldDataInfo, cFieldID, cRowCount) + return HandleCStatus(&status, "appendLoadFieldInfo failed") +} + +func (ld *LoadFieldDataInfo) appendLoadFieldDataPath(fieldID int64, file string) error { + cFieldID := C.int64_t(fieldID) + cFile := C.CString(file) + defer C.free(unsafe.Pointer(cFile)) + + status := C.AppendLoadFieldDataPath(ld.cLoadFieldDataInfo, cFieldID, cFile) + return HandleCStatus(&status, "appendLoadFieldDataPath failed") +} + +func (ld *LoadFieldDataInfo) appendMMapDirPath(dir string) { + cDir := C.CString(dir) + defer C.free(unsafe.Pointer(cDir)) + + C.AppendMMapDirPath(ld.cLoadFieldDataInfo, cDir) +} diff --git a/internal/querynodev2/segments/load_index_info.go b/internal/querynodev2/segments/load_index_info.go index b3d7c4d862..d463d88107 100644 --- a/internal/querynodev2/segments/load_index_info.go +++ b/internal/querynodev2/segments/load_index_info.go @@ -25,17 +25,13 @@ package segments import "C" import ( - "encoding/json" - "fmt" "path/filepath" "unsafe" - "go.uber.org/zap" - "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus/internal/proto/querypb" - "github.com/milvus-io/milvus/pkg/log" "github.com/milvus-io/milvus/pkg/util/funcutil" + "github.com/milvus-io/milvus/pkg/util/indexparamcheck" "github.com/milvus-io/milvus/pkg/util/indexparams" "github.com/milvus-io/milvus/pkg/util/paramtable" ) @@ -49,35 +45,7 @@ type LoadIndexInfo struct { func newLoadIndexInfo() (*LoadIndexInfo, error) { var cLoadIndexInfo C.CLoadIndexInfo - // TODO::xige-16 support embedded 
milvus - storageType := "minio" - cAddress := C.CString(paramtable.Get().MinioCfg.Address.GetValue()) - cBucketName := C.CString(paramtable.Get().MinioCfg.BucketName.GetValue()) - cAccessKey := C.CString(paramtable.Get().MinioCfg.AccessKeyID.GetValue()) - cAccessValue := C.CString(paramtable.Get().MinioCfg.SecretAccessKey.GetValue()) - cRootPath := C.CString(paramtable.Get().MinioCfg.RootPath.GetValue()) - cStorageType := C.CString(storageType) - cIamEndPoint := C.CString(paramtable.Get().MinioCfg.IAMEndpoint.GetValue()) - defer C.free(unsafe.Pointer(cAddress)) - defer C.free(unsafe.Pointer(cBucketName)) - defer C.free(unsafe.Pointer(cAccessKey)) - defer C.free(unsafe.Pointer(cAccessValue)) - defer C.free(unsafe.Pointer(cRootPath)) - defer C.free(unsafe.Pointer(cStorageType)) - defer C.free(unsafe.Pointer(cIamEndPoint)) - storageConfig := C.CStorageConfig{ - address: cAddress, - bucket_name: cBucketName, - access_key_id: cAccessKey, - access_key_value: cAccessValue, - remote_root_path: cRootPath, - storage_type: cStorageType, - iam_endpoint: cIamEndPoint, - useSSL: C.bool(paramtable.Get().MinioCfg.UseSSL.GetAsBool()), - useIAM: C.bool(paramtable.Get().MinioCfg.UseIAM.GetAsBool()), - } - - status := C.NewLoadIndexInfo(&cLoadIndexInfo, storageConfig) + status := C.NewLoadIndexInfo(&cLoadIndexInfo) if err := HandleCStatus(&status, "NewLoadIndexInfo failed"); err != nil { return nil, err } @@ -105,14 +73,12 @@ func (li *LoadIndexInfo) appendLoadIndexInfo(bytesIndex [][]byte, indexInfo *que // some build params also exist in indexParams, which are useless during loading process indexParams := funcutil.KeyValuePair2Map(indexInfo.IndexParams) - indexparams.SetDiskIndexLoadParams(paramtable.Get(), indexParams, indexInfo.GetNumRows()) - - jsonIndexParams, err := json.Marshal(indexParams) - if err != nil { - err = fmt.Errorf("failed to json marshal index params %w", err) - return err + if indexParams["index_type"] == indexparamcheck.IndexDISKANN { + err = indexparams.SetDiskIndexLoadParams(paramtable.Get(), indexParams, indexInfo.GetNumRows()) + if err != nil { + return err + } } - log.Info("start append index params", zap.String("index params", string(jsonIndexParams))) for key, value := range indexParams { err = li.appendIndexParam(key, value) @@ -168,7 +134,7 @@ func (li *LoadIndexInfo) appendFieldInfo(collectionID int64, partitionID int64, return HandleCStatus(&status, "AppendFieldInfo failed") } -// appendIndexData appends binarySet index to cLoadIndexInfo +// appendIndexData appends index path to cLoadIndexInfo and create index func (li *LoadIndexInfo) appendIndexData(bytesIndex [][]byte, indexKeys []string) error { for _, indexPath := range indexKeys { err := li.appendIndexFile(indexPath) @@ -177,26 +143,30 @@ func (li *LoadIndexInfo) appendIndexData(bytesIndex [][]byte, indexKeys []string } } - var cBinarySet C.CBinarySet - status := C.NewBinarySet(&cBinarySet) - defer C.DeleteBinarySet(cBinarySet) - - if err := HandleCStatus(&status, "NewBinarySet failed"); err != nil { - return err - } - - for i, byteIndex := range bytesIndex { - indexPtr := unsafe.Pointer(&byteIndex[0]) - indexLen := C.int64_t(len(byteIndex)) - binarySetKey := filepath.Base(indexKeys[i]) - indexKey := C.CString(binarySetKey) - status = C.AppendIndexBinary(cBinarySet, indexPtr, indexLen, indexKey) - C.free(unsafe.Pointer(indexKey)) - if err := HandleCStatus(&status, "LoadIndexInfo AppendIndexBinary failed"); err != nil { + if bytesIndex != nil { + var cBinarySet C.CBinarySet + status := C.NewBinarySet(&cBinarySet) + defer 
C.DeleteBinarySet(cBinarySet) + if err := HandleCStatus(&status, "NewBinarySet failed"); err != nil { return err } + + for i, byteIndex := range bytesIndex { + indexPtr := unsafe.Pointer(&byteIndex[0]) + indexLen := C.int64_t(len(byteIndex)) + binarySetKey := filepath.Base(indexKeys[i]) + indexKey := C.CString(binarySetKey) + status = C.AppendIndexBinary(cBinarySet, indexPtr, indexLen, indexKey) + C.free(unsafe.Pointer(indexKey)) + if err := HandleCStatus(&status, "LoadIndexInfo AppendIndexBinary failed"); err != nil { + return err + } + } + + status = C.AppendIndex(li.cLoadIndexInfo, cBinarySet) + return HandleCStatus(&status, "AppendIndex failed") } - status = C.AppendIndex(li.cLoadIndexInfo, cBinarySet) + status := C.AppendIndexV2(li.cLoadIndexInfo) return HandleCStatus(&status, "AppendIndex failed") } diff --git a/internal/querynodev2/segments/mock_data.go b/internal/querynodev2/segments/mock_data.go index a0d75e0992..0e9d601a7b 100644 --- a/internal/querynodev2/segments/mock_data.go +++ b/internal/querynodev2/segments/mock_data.go @@ -629,7 +629,8 @@ func SaveBinLog(ctx context.Context, } k := JoinIDPath(collectionID, partitionID, segmentID, fieldID) - key := path.Join(defaultLocalStorage, "insert-log", k) + //key := path.Join(defaultLocalStorage, "insert-log", k) + key := path.Join(paramtable.Get().MinioCfg.RootPath.GetValue(), "insert-log", k) kvs[key] = blob.Value fieldBinlog = append(fieldBinlog, &datapb.FieldBinlog{ FieldID: fieldID, @@ -651,7 +652,8 @@ func SaveBinLog(ctx context.Context, } k := JoinIDPath(collectionID, partitionID, segmentID, fieldID) - key := path.Join(defaultLocalStorage, "stats-log", k) + //key := path.Join(defaultLocalStorage, "stats-log", k) + key := path.Join(paramtable.Get().MinioCfg.RootPath.GetValue(), "stats-log", k) kvs[key] = blob.Value[:] statsBinlog = append(statsBinlog, &datapb.FieldBinlog{ FieldID: fieldID, @@ -828,8 +830,8 @@ func SaveDeltaLog(collectionID int64, fieldBinlog := make([]*datapb.FieldBinlog, 0) log.Debug("[query node unittest] save delta log", zap.Int64("fieldID", pkFieldID)) key := JoinIDPath(collectionID, partitionID, segmentID, pkFieldID) - key += "delta" // append suffix 'delta' to avoid conflicts against binlog - keyPath := path.Join(defaultLocalStorage, key) + //keyPath := path.Join(defaultLocalStorage, "delta-log", key) + keyPath := path.Join(paramtable.Get().MinioCfg.RootPath.GetValue(), "delta-log", key) kvs[keyPath] = blob.Value[:] fieldBinlog = append(fieldBinlog, &datapb.FieldBinlog{ FieldID: pkFieldID, @@ -843,7 +845,7 @@ func SaveDeltaLog(collectionID int64, func GenAndSaveIndex(collectionID, partitionID, segmentID, fieldID int64, msgLength int, indexType, metricType string, cm storage.ChunkManager) (*querypb.FieldIndexInfo, error) { typeParams, indexParams := genIndexParams(indexType, metricType) - index, err := indexcgowrapper.NewCgoIndex(schemapb.DataType_FloatVector, typeParams, indexParams, genStorageConfig()) + index, err := indexcgowrapper.NewCgoIndex(schemapb.DataType_FloatVector, typeParams, indexParams) if err != nil { return nil, err } @@ -880,7 +882,9 @@ func GenAndSaveIndex(collectionID, partitionID, segmentID, fieldID int64, msgLen indexPaths := make([]string, 0) for _, index := range serializedIndexBlobs { - indexPath := filepath.Join(defaultLocalStorage, strconv.Itoa(int(segmentID)), index.Key) + //indexPath := filepath.Join(defaultLocalStorage, strconv.Itoa(int(segmentID)), index.Key) + indexPath := filepath.Join(paramtable.Get().MinioCfg.RootPath.GetValue(), "index_files", + 
strconv.Itoa(int(segmentID)), index.Key) indexPaths = append(indexPaths, indexPath) err := cm.Write(context.Background(), indexPath, index.Value) if err != nil { @@ -942,6 +946,7 @@ func genStorageConfig() *indexpb.StorageConfig { IAMEndpoint: paramtable.Get().MinioCfg.IAMEndpoint.GetValue(), UseSSL: paramtable.Get().MinioCfg.UseSSL.GetAsBool(), UseIAM: paramtable.Get().MinioCfg.UseIAM.GetAsBool(), + StorageType: paramtable.Get().CommonCfg.StorageType.GetValue(), } } diff --git a/internal/querynodev2/segments/reduce_test.go b/internal/querynodev2/segments/reduce_test.go index cc52db3804..db30997224 100644 --- a/internal/querynodev2/segments/reduce_test.go +++ b/internal/querynodev2/segments/reduce_test.go @@ -29,6 +29,7 @@ import ( "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus/internal/proto/querypb" storage "github.com/milvus-io/milvus/internal/storage" + "github.com/milvus-io/milvus/internal/util/initcore" "github.com/milvus-io/milvus/pkg/common" "github.com/milvus-io/milvus/pkg/util/funcutil" "github.com/milvus-io/milvus/pkg/util/paramtable" @@ -36,6 +37,7 @@ import ( type ReduceSuite struct { suite.Suite + chunkManager storage.ChunkManager collectionID int64 partitionID int64 @@ -50,6 +52,13 @@ func (suite *ReduceSuite) SetupSuite() { func (suite *ReduceSuite) SetupTest() { var err error + ctx := context.Background() + msgLength := 100 + + chunkManagerFactory := storage.NewChunkManagerFactoryWithParam(paramtable.Get()) + suite.chunkManager, _ = chunkManagerFactory.NewPersistentStorageChunkManager(ctx) + initcore.InitRemoteChunkManager(paramtable.Get()) + suite.collectionID = 100 suite.partitionID = 10 suite.segmentID = 1 @@ -71,14 +80,17 @@ func (suite *ReduceSuite) SetupTest() { ) suite.Require().NoError(err) - insertData, err := genInsertData(100, suite.collection.Schema()) + binlogs, _, err := SaveBinLog(ctx, + suite.collectionID, + suite.partitionID, + suite.segmentID, + msgLength, + schema, + suite.chunkManager, + ) suite.Require().NoError(err) - - insertRecord, err := storage.TransferInsertDataToInsertRecord(insertData) - suite.Require().NoError(err) - numRows := insertRecord.NumRows - for _, fieldData := range insertRecord.FieldsData { - err = suite.segment.LoadField(numRows, fieldData) + for _, binlog := range binlogs { + err = suite.segment.LoadFieldData(binlog.FieldID, int64(msgLength), binlog) suite.Require().NoError(err) } } @@ -86,6 +98,8 @@ func (suite *ReduceSuite) SetupTest() { func (suite *ReduceSuite) TearDownTest() { DeleteSegment(suite.segment) DeleteCollection(suite.collection) + ctx := context.Background() + suite.chunkManager.RemoveWithPrefix(ctx, paramtable.Get().MinioCfg.RootPath.GetValue()) } func (suite *ReduceSuite) TestReduceParseSliceInfo() { diff --git a/internal/querynodev2/segments/retrieve_test.go b/internal/querynodev2/segments/retrieve_test.go index bab52c88b1..2048a9574f 100644 --- a/internal/querynodev2/segments/retrieve_test.go +++ b/internal/querynodev2/segments/retrieve_test.go @@ -25,6 +25,7 @@ import ( "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/storage" + "github.com/milvus-io/milvus/internal/util/initcore" "github.com/milvus-io/milvus/pkg/util/paramtable" ) @@ -32,7 +33,7 @@ type RetrieveSuite struct { suite.Suite // Dependencies - // chunkManager storage.ChunkManager + chunkManager storage.ChunkManager // Data manager *Manager @@ -50,6 +51,13 @@ func (suite *RetrieveSuite) SetupSuite() { func (suite 
*RetrieveSuite) SetupTest() { var err error + ctx := context.Background() + msgLength := 100 + + chunkManagerFactory := storage.NewChunkManagerFactoryWithParam(paramtable.Get()) + suite.chunkManager, _ = chunkManagerFactory.NewPersistentStorageChunkManager(ctx) + initcore.InitRemoteChunkManager(paramtable.Get()) + suite.collectionID = 100 suite.partitionID = 10 suite.segmentID = 1 @@ -80,6 +88,20 @@ func (suite *RetrieveSuite) SetupTest() { ) suite.Require().NoError(err) + binlogs, _, err := SaveBinLog(ctx, + suite.collectionID, + suite.partitionID, + suite.segmentID, + msgLength, + schema, + suite.chunkManager, + ) + suite.Require().NoError(err) + for _, binlog := range binlogs { + err = suite.sealed.LoadFieldData(binlog.FieldID, int64(msgLength), binlog) + suite.Require().NoError(err) + } + suite.growing, err = NewSegment(suite.collection, suite.segmentID+1, suite.partitionID, @@ -92,19 +114,9 @@ func (suite *RetrieveSuite) SetupTest() { ) suite.Require().NoError(err) - insertData, err := genInsertData(100, suite.collection.Schema()) + insertMsg, err := genInsertMsg(suite.collection, suite.partitionID, suite.growing.segmentID, msgLength) suite.Require().NoError(err) - insertRecord, err := storage.TransferInsertDataToInsertRecord(insertData) - suite.Require().NoError(err) - numRows := insertRecord.NumRows - for _, fieldData := range insertRecord.FieldsData { - err = suite.sealed.LoadField(numRows, fieldData) - suite.Require().NoError(err) - } - - insertMsg, err := genInsertMsg(suite.collection, suite.partitionID, suite.growing.segmentID, 100) - suite.Require().NoError(err) - insertRecord, err = storage.TransferInsertMsgToInsertRecord(suite.collection.Schema(), insertMsg) + insertRecord, err := storage.TransferInsertMsgToInsertRecord(suite.collection.Schema(), insertMsg) suite.Require().NoError(err) err = suite.growing.Insert(insertMsg.RowIDs, insertMsg.Timestamps, insertRecord) suite.Require().NoError(err) @@ -117,6 +129,8 @@ func (suite *RetrieveSuite) TearDownTest() { DeleteSegment(suite.sealed) DeleteSegment(suite.growing) DeleteCollection(suite.collection) + ctx := context.Background() + suite.chunkManager.RemoveWithPrefix(ctx, paramtable.Get().MinioCfg.RootPath.GetValue()) } func (suite *RetrieveSuite) TestRetrieveSealed() { diff --git a/internal/querynodev2/segments/search_test.go b/internal/querynodev2/segments/search_test.go index d4066e8656..8322b0de9a 100644 --- a/internal/querynodev2/segments/search_test.go +++ b/internal/querynodev2/segments/search_test.go @@ -25,11 +25,13 @@ import ( "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus/internal/proto/querypb" storage "github.com/milvus-io/milvus/internal/storage" + "github.com/milvus-io/milvus/internal/util/initcore" "github.com/milvus-io/milvus/pkg/util/paramtable" ) type SearchSuite struct { suite.Suite + chunkManager storage.ChunkManager manager *Manager collectionID int64 @@ -46,6 +48,13 @@ func (suite *SearchSuite) SetupSuite() { func (suite *SearchSuite) SetupTest() { var err error + ctx := context.Background() + msgLength := 100 + + chunkManagerFactory := storage.NewChunkManagerFactoryWithParam(paramtable.Get()) + suite.chunkManager, _ = chunkManagerFactory.NewPersistentStorageChunkManager(ctx) + initcore.InitRemoteChunkManager(paramtable.Get()) + suite.collectionID = 100 suite.partitionID = 10 suite.segmentID = 1 @@ -76,6 +85,20 @@ func (suite *SearchSuite) SetupTest() { ) suite.Require().NoError(err) + binlogs, _, err := SaveBinLog(ctx, + suite.collectionID, + suite.partitionID, + 
suite.segmentID, + msgLength, + schema, + suite.chunkManager, + ) + suite.Require().NoError(err) + for _, binlog := range binlogs { + err = suite.sealed.LoadFieldData(binlog.FieldID, int64(msgLength), binlog) + suite.Require().NoError(err) + } + suite.growing, err = NewSegment(suite.collection, suite.segmentID+1, suite.partitionID, @@ -88,19 +111,9 @@ func (suite *SearchSuite) SetupTest() { ) suite.Require().NoError(err) - insertData, err := genInsertData(100, suite.collection.Schema()) + insertMsg, err := genInsertMsg(suite.collection, suite.partitionID, suite.growing.segmentID, msgLength) suite.Require().NoError(err) - insertRecord, err := storage.TransferInsertDataToInsertRecord(insertData) - suite.Require().NoError(err) - numRows := insertRecord.NumRows - for _, fieldData := range insertRecord.FieldsData { - err = suite.sealed.LoadField(numRows, fieldData) - suite.Require().NoError(err) - } - - insertMsg, err := genInsertMsg(suite.collection, suite.partitionID, suite.growing.segmentID, 100) - suite.Require().NoError(err) - insertRecord, err = storage.TransferInsertMsgToInsertRecord(suite.collection.Schema(), insertMsg) + insertRecord, err := storage.TransferInsertMsgToInsertRecord(suite.collection.Schema(), insertMsg) suite.Require().NoError(err) suite.growing.Insert(insertMsg.RowIDs, insertMsg.Timestamps, insertRecord) @@ -111,6 +124,8 @@ func (suite *SearchSuite) SetupTest() { func (suite *SearchSuite) TearDownTest() { DeleteSegment(suite.sealed) DeleteCollection(suite.collection) + ctx := context.Background() + suite.chunkManager.RemoveWithPrefix(ctx, paramtable.Get().MinioCfg.RootPath.GetValue()) } func (suite *SearchSuite) TestSearchSealed() { diff --git a/internal/querynodev2/segments/segment.go b/internal/querynodev2/segments/segment.go index 406c20ce53..685febf8ef 100644 --- a/internal/querynodev2/segments/segment.go +++ b/internal/querynodev2/segments/segment.go @@ -652,17 +652,7 @@ func (s *LocalSegment) Delete(primaryKeys []storage.PrimaryKey, timestamps []typ } // -------------------------------------------------------------------------------------- interfaces for sealed segment -func (s *LocalSegment) LoadField(rowCount int64, data *schemapb.FieldData) error { - /* - CStatus - LoadFieldData(CSegmentInterface c_segment, CLoadFieldDataInfo load_field_data_info); - */ - if s.Type() != SegmentTypeSealed { - return fmt.Errorf("segmentLoadFieldData failed, illegal segment type=%s, segmentID=%d", - s.Type().String(), - s.ID(), - ) - } +func (s *LocalSegment) LoadMultiFieldData(rowCount int64, fields []*datapb.FieldBinlog) error { s.mut.RLock() defer s.mut.RUnlock() @@ -676,30 +666,81 @@ func (s *LocalSegment) LoadField(rowCount int64, data *schemapb.FieldData) error zap.Int64("segmentID", s.ID()), ) - fieldID := data.GetFieldId() - dataBlob, err := proto.Marshal(data) + loadFieldDataInfo, err := newLoadFieldDataInfo() + defer deleteFieldDataInfo(loadFieldDataInfo) if err != nil { return err } - var mmapDirPath *C.char = nil - path := paramtable.Get().QueryNodeCfg.MmapDirPath.GetValue() - if len(path) > 0 { - mmapDirPath = C.CString(path) - defer C.free(unsafe.Pointer(mmapDirPath)) - } + for _, field := range fields { + fieldID := field.FieldID + err = loadFieldDataInfo.appendLoadFieldInfo(fieldID, rowCount) + if err != nil { + return err + } - loadInfo := C.CLoadFieldDataInfo{ - field_id: C.int64_t(fieldID), - blob: (*C.uint8_t)(unsafe.Pointer(&dataBlob[0])), - blob_size: C.uint64_t(len(dataBlob)), - row_count: C.int64_t(rowCount), - mmap_dir_path: mmapDirPath, + for _, binlog := 
range field.Binlogs { + err = loadFieldDataInfo.appendLoadFieldDataPath(fieldID, binlog.GetLogPath()) + if err != nil { + return err + } + } + + loadFieldDataInfo.appendMMapDirPath(paramtable.Get().QueryNodeCfg.MmapDirPath.GetValue()) } var status C.CStatus GetPool().Submit(func() (any, error) { - status = C.LoadFieldData(s.ptr, loadInfo) + status = C.LoadFieldData(s.ptr, loadFieldDataInfo.cLoadFieldDataInfo) + return nil, nil + }).Await() + if err := HandleCStatus(&status, "LoadMultiFieldData failed"); err != nil { + return err + } + + log.Info("load multi field done", + zap.Int64("row count", rowCount), + zap.Int64("segmentID", s.ID())) + + return nil +} + +func (s *LocalSegment) LoadFieldData(fieldID int64, rowCount int64, field *datapb.FieldBinlog) error { + s.mut.RLock() + defer s.mut.RUnlock() + + if s.ptr == nil { + return WrapSegmentReleased(s.segmentID) + } + + log := log.With( + zap.Int64("collectionID", s.Collection()), + zap.Int64("partitionID", s.Partition()), + zap.Int64("segmentID", s.ID()), + ) + + loadFieldDataInfo, err := newLoadFieldDataInfo() + defer deleteFieldDataInfo(loadFieldDataInfo) + if err != nil { + return err + } + + err = loadFieldDataInfo.appendLoadFieldInfo(fieldID, rowCount) + if err != nil { + return err + } + + for _, binlog := range field.Binlogs { + err = loadFieldDataInfo.appendLoadFieldDataPath(fieldID, binlog.GetLogPath()) + if err != nil { + return err + } + } + loadFieldDataInfo.appendMMapDirPath(paramtable.Get().QueryNodeCfg.MmapDirPath.GetValue()) + + var status C.CStatus + GetPool().Submit(func() (any, error) { + status = C.LoadFieldData(s.ptr, loadFieldDataInfo.cLoadFieldDataInfo) return nil, nil }).Await() if err := HandleCStatus(&status, "LoadFieldData failed"); err != nil { @@ -708,8 +749,8 @@ func (s *LocalSegment) LoadField(rowCount int64, data *schemapb.FieldData) error log.Info("load field done", zap.Int64("fieldID", fieldID), - zap.Int64("rowCount", rowCount), - ) + zap.Int64("row count", rowCount), + zap.Int64("segmentID", s.ID())) return nil } @@ -836,3 +877,51 @@ func (s *LocalSegment) LoadIndex(bytesIndex [][]byte, indexInfo *querypb.FieldIn return nil } + +func (s *LocalSegment) LoadIndexData(indexInfo *querypb.FieldIndexInfo, fieldType schemapb.DataType) error { + loadIndexInfo, err := newLoadIndexInfo() + defer deleteLoadIndexInfo(loadIndexInfo) + if err != nil { + return err + } + + err = loadIndexInfo.appendLoadIndexInfo(nil, indexInfo, s.collectionID, s.partitionID, s.segmentID, fieldType) + if err != nil { + if loadIndexInfo.cleanLocalData() != nil { + log.Warn("failed to clean cached data on disk after append index failed", + zap.Int64("buildID", indexInfo.BuildID), + zap.Int64("index version", indexInfo.IndexVersion)) + } + return err + } + if s.Type() != SegmentTypeSealed { + errMsg := fmt.Sprintln("updateSegmentIndex failed, illegal segment type ", s.typ, "segmentID = ", s.ID()) + return errors.New(errMsg) + } + s.mut.RLock() + defer s.mut.RUnlock() + + if s.ptr == nil { + return WrapSegmentReleased(s.segmentID) + } + + log := log.With( + zap.Int64("collectionID", s.Collection()), + zap.Int64("partitionID", s.Partition()), + zap.Int64("segmentID", s.ID()), + ) + + var status C.CStatus + GetPool().Submit(func() (any, error) { + status = C.UpdateSealedSegmentIndex(s.ptr, loadIndexInfo.cLoadIndexInfo) + return nil, nil + }).Await() + + if err := HandleCStatus(&status, "UpdateSealedSegmentIndex failed"); err != nil { + return err + } + + log.Info("updateSegmentIndex done", zap.Int64("fieldID", indexInfo.FieldID)) + + return
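With this change the query node no longer marshals a schemapb.FieldData blob across cgo; LoadFieldData and LoadMultiFieldData only pass the row count and the binlog log paths, and segcore fetches the bytes itself through its chunk managers, honoring QueryNodeCfg.MmapDirPath when it is set. A hedged caller-side sketch, assuming it lives in the segments package where these types are defined:

// Illustrative: load every field of a sealed segment from its binlog metadata.
func loadAllSealedFields(segment *LocalSegment, rowCount int64, fields []*datapb.FieldBinlog) error {
	for _, field := range fields {
		// Sealed segments are loaded field by field in this diff;
		// growing segments use segment.LoadMultiFieldData(rowCount, fields) instead.
		if err := segment.LoadFieldData(field.FieldID, rowCount, field); err != nil {
			return err
		}
	}
	return nil
}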
nil +} diff --git a/internal/querynodev2/segments/segment_loader.go b/internal/querynodev2/segments/segment_loader.go index 7821a941e2..1922efccdb 100644 --- a/internal/querynodev2/segments/segment_loader.go +++ b/internal/querynodev2/segments/segment_loader.go @@ -441,7 +441,7 @@ func (loader *segmentLoader) loadSegment(ctx context.Context, if err := loader.loadFieldsIndex(ctx, segment, indexedFieldInfos); err != nil { return err } - if err := loader.loadSealedSegmentFields(ctx, segment, fieldBinlogs, loadInfo); err != nil { + if err := loader.loadSealedSegmentFields(ctx, segment, fieldBinlogs, loadInfo.GetNumOfRows()); err != nil { return err } // https://github.com/milvus-io/milvus/23654 @@ -450,7 +450,7 @@ func (loader *segmentLoader) loadSegment(ctx context.Context, return err } } else { - if err := loader.loadGrowingSegmentFields(ctx, segment, loadInfo.BinlogPaths); err != nil { + if err := segment.LoadMultiFieldData(loadInfo.GetNumOfRows(), loadInfo.BinlogPaths); err != nil { return err } } @@ -489,72 +489,13 @@ func (loader *segmentLoader) filterPKStatsBinlogs(fieldBinlogs []*datapb.FieldBi return result, storage.DefaultStatsType } -func (loader *segmentLoader) loadGrowingSegmentFields(ctx context.Context, segment *LocalSegment, fieldBinlogs []*datapb.FieldBinlog) error { - if len(fieldBinlogs) <= 0 { - return nil - } - - segmentType := segment.Type() - iCodec := storage.InsertCodec{} - - // change all field bin log loading into concurrent - loadFutures := make([]*conc.Future[*storage.Blob], 0, len(fieldBinlogs)) - for _, fieldBinlog := range fieldBinlogs { - futures := loader.loadFieldBinlogsAsync(ctx, fieldBinlog) - loadFutures = append(loadFutures, futures...) - } - - // wait for async load results - blobs := make([]*storage.Blob, len(loadFutures)) - for index, future := range loadFutures { - if !future.OK() { - return future.Err() - } - - blobs[index] = future.Value() - } - log.Info("log field binlogs done", - zap.Int64("collection", segment.collectionID), - zap.Int64("segment", segment.segmentID), - zap.Any("field", fieldBinlogs), - zap.String("segmentType", segmentType.String())) - - _, _, insertData, err := iCodec.Deserialize(blobs) - if err != nil { - log.Warn("failed to deserialize", zap.Int64("segment", segment.segmentID), zap.Error(err)) - return err - } - - switch segmentType { - case SegmentTypeGrowing: - tsData, ok := insertData.Data[common.TimeStampField] - if !ok { - return errors.New("cannot get timestamps from insert data") - } - utss := make([]uint64, tsData.RowNum()) - for i := 0; i < tsData.RowNum(); i++ { - utss[i] = uint64(tsData.GetRow(i).(int64)) - } - - rowIDData, ok := insertData.Data[common.RowIDField] - if !ok { - return errors.New("cannot get row ids from insert data") - } - - return loader.insertIntoSegment(segment, rowIDData.(*storage.Int64FieldData).Data, utss, insertData) - - default: - err := fmt.Errorf("illegal segmentType=%s when load segment, collectionID=%v", segmentType.String(), segment.collectionID) - return err - } -} - -func (loader *segmentLoader) loadSealedSegmentFields(ctx context.Context, segment *LocalSegment, fields []*datapb.FieldBinlog, loadInfo *querypb.SegmentLoadInfo) error { - runningGroup, groupCtx := errgroup.WithContext(ctx) +func (loader *segmentLoader) loadSealedSegmentFields(ctx context.Context, segment *LocalSegment, fields []*datapb.FieldBinlog, rowCount int64) error { + runningGroup, _ := errgroup.WithContext(ctx) for _, field := range fields { fieldBinLog := field + fieldID := field.FieldID runningGroup.Go(func() 
error { - return loader.loadSealedField(groupCtx, segment, fieldBinLog, loadInfo) + return segment.LoadFieldData(fieldID, rowCount, fieldBinLog) }) } err := runningGroup.Wait() @@ -571,36 +512,6 @@ func (loader *segmentLoader) loadSealedSegmentFields(ctx context.Context, segmen return nil } -// async load field of sealed segment -func (loader *segmentLoader) loadSealedField(ctx context.Context, segment *LocalSegment, field *datapb.FieldBinlog, loadInfo *querypb.SegmentLoadInfo) error { - iCodec := storage.InsertCodec{} - - // Avoid consuming too much memory if no CPU worker ready, - // acquire a CPU worker before load field binlogs - futures := loader.loadFieldBinlogsAsync(ctx, field) - - err := conc.AwaitAll(futures...) - if err != nil { - return err - } - - blobs := make([]*storage.Blob, len(futures)) - for index, future := range futures { - blobs[index] = future.Value() - } - - insertData := storage.InsertData{ - Data: make(map[int64]storage.FieldData), - } - _, _, _, err = iCodec.DeserializeInto(blobs, int(loadInfo.GetNumOfRows()), &insertData) - if err != nil { - log.Warn("failed to load sealed field", zap.Int64("SegmentId", segment.segmentID), zap.Error(err)) - return err - } - - return loader.loadSealedSegments(segment, &insertData) -} - // Load binlogs concurrently into memory from KV storage asyncly func (loader *segmentLoader) loadFieldBinlogsAsync(ctx context.Context, field *datapb.FieldBinlog) []*conc.Future[*storage.Blob] { futures := make([]*conc.Future[*storage.Blob], 0, len(field.Binlogs)) @@ -647,80 +558,22 @@ func (loader *segmentLoader) loadFieldsIndex(ctx context.Context, segment *Local } func (loader *segmentLoader) loadFieldIndex(ctx context.Context, segment *LocalSegment, indexInfo *querypb.FieldIndexInfo) error { - indexBuffer := make([][]byte, 0, len(indexInfo.IndexFilePaths)) filteredPaths := make([]string, 0, len(indexInfo.IndexFilePaths)) - futures := make([]*conc.Future[*storage.Blob], 0, len(indexInfo.IndexFilePaths)) - indexCodec := storage.NewIndexFileBinlogCodec() - // TODO, remove the load index info froam for _, indexPath := range indexInfo.IndexFilePaths { - // get index params when detecting indexParamPrefix - if path.Base(indexPath) == storage.IndexParamsKey { - log.Info("load index params file", zap.String("path", indexPath)) - indexParamsBlob, err := loader.cm.Read(ctx, indexPath) - if err != nil { - return err - } - - _, indexParams, _, _, err := indexCodec.Deserialize([]*storage.Blob{{Key: storage.IndexParamsKey, Value: indexParamsBlob}}) - if err != nil { - return err - } - - // update index params(dim...) - newIndexParams := funcutil.KeyValuePair2Map(indexInfo.IndexParams) - for key, value := range indexParams { - newIndexParams[key] = value - } - indexInfo.IndexParams = funcutil.Map2KeyValuePair(newIndexParams) - continue + if path.Base(indexPath) != storage.IndexParamsKey { + filteredPaths = append(filteredPaths, indexPath) } - - filteredPaths = append(filteredPaths, indexPath) } - // 2. use index bytes and index path to update segment + // 2. 
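loadSealedSegmentFields keeps its errgroup fan-out, but each goroutine now issues a single LoadFieldData call instead of downloading and deserializing binlogs in Go. A minimal sketch of that fan-out (the per-iteration copies keep each closure on its own field, mirroring the diff):

// Illustrative fan-out: one goroutine per field binlog.
func loadFieldsConcurrently(ctx context.Context, segment *LocalSegment,
	fields []*datapb.FieldBinlog, rowCount int64) error {
	// golang.org/x/sync/errgroup: Wait returns the first non-nil error.
	g, _ := errgroup.WithContext(ctx)
	for _, field := range fields {
		field := field // copy for the closure
		fieldID := field.FieldID
		g.Go(func() error {
			return segment.LoadFieldData(fieldID, rowCount, field)
		})
	}
	return g.Wait()
}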
use index path to update segment indexInfo.IndexFilePaths = filteredPaths fieldType, err := loader.getFieldType(segment, indexInfo.FieldID) if err != nil { return err } - indexParams := funcutil.KeyValuePair2Map(indexInfo.IndexParams) - // load on disk index - if indexParams[common.IndexTypeKey] == indexparamcheck.IndexDISKANN { - return segment.LoadIndex(nil, indexInfo, fieldType) - } - // load in memory index - for _, p := range indexInfo.IndexFilePaths { - indexPath := p - indexFuture := loader.ioPool.Submit(func() (*storage.Blob, error) { - log.Info("load index file", zap.String("path", indexPath)) - data, err := loader.cm.Read(ctx, indexPath) - if err != nil { - log.Warn("failed to load index file", zap.String("path", indexPath), zap.Error(err)) - return nil, err - } - blobs, _, _, _, err := indexCodec.Deserialize([]*storage.Blob{{Key: path.Base(indexPath), Value: data}}) - if err != nil { - return nil, err - } - return blobs[0], nil - }) - - futures = append(futures, indexFuture) - } - - err = conc.AwaitAll(futures...) - if err != nil { - return err - } - - for _, index := range futures { - indexBuffer = append(indexBuffer, index.Value().GetValue()) - } - - return segment.LoadIndex(indexBuffer, indexInfo, fieldType) + return segment.LoadIndexData(indexInfo, fieldType) } func (loader *segmentLoader) insertIntoSegment(segment *LocalSegment, @@ -775,22 +628,6 @@ func (loader *segmentLoader) insertIntoSegment(segment *LocalSegment, return nil } -func (loader *segmentLoader) loadSealedSegments(segment *LocalSegment, insertData *storage.InsertData) error { - insertRecord, err := storage.TransferInsertDataToInsertRecord(insertData) - if err != nil { - return err - } - numRows := insertRecord.NumRows - for _, fieldData := range insertRecord.FieldsData { - err := segment.LoadField(numRows, fieldData) - if err != nil { - // TODO: return or continue? 
- return err - } - } - return nil -} - func (loader *segmentLoader) loadBloomFilter(ctx context.Context, segmentID int64, bfs *pkoracle.BloomFilterSet, binlogPaths []string, logType storage.StatsLogType) error { @@ -983,7 +820,7 @@ func (loader *segmentLoader) checkSegmentSize(segmentLoadInfos []*querypb.Segmen usedMemAfterLoad := usedMem maxSegmentSize := uint64(0) - localUsedSize, err := GetLocalUsedSize() + localUsedSize, err := GetLocalUsedSize(paramtable.Get().LocalStorageCfg.Path.GetValue()) if err != nil { return 0, 0, errors.Wrap(err, "get local used size failed") } diff --git a/internal/querynodev2/segments/segment_loader_test.go b/internal/querynodev2/segments/segment_loader_test.go index d4c423ddcf..a05154a93c 100644 --- a/internal/querynodev2/segments/segment_loader_test.go +++ b/internal/querynodev2/segments/segment_loader_test.go @@ -18,7 +18,6 @@ package segments import ( "context" - "fmt" "math/rand" "testing" @@ -27,6 +26,7 @@ import ( "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/storage" + "github.com/milvus-io/milvus/internal/util/initcore" "github.com/milvus-io/milvus/pkg/util/funcutil" "github.com/milvus-io/milvus/pkg/util/paramtable" ) @@ -44,6 +44,7 @@ type SegmentLoaderSuite struct { partitionID int64 segmentID int64 schema *schemapb.CollectionSchema + segmentNum int } func (suite *SegmentLoaderSuite) SetupSuite() { @@ -51,16 +52,21 @@ func (suite *SegmentLoaderSuite) SetupSuite() { suite.collectionID = rand.Int63() suite.partitionID = rand.Int63() suite.segmentID = rand.Int63() + suite.segmentNum = 5 suite.schema = GenTestCollectionSchema("test", schemapb.DataType_Int64) } func (suite *SegmentLoaderSuite) SetupTest() { // Dependencies suite.manager = NewManager() - suite.chunkManager = storage.NewLocalChunkManager(storage.RootPath( - fmt.Sprintf("/tmp/milvus-ut/%d", rand.Int63()))) - + ctx := context.Background() + // TODO:: cpp chunk manager not support local chunk manager + //suite.chunkManager = storage.NewLocalChunkManager(storage.RootPath( + // fmt.Sprintf("/tmp/milvus-ut/%d", rand.Int63()))) + chunkManagerFactory := storage.NewChunkManagerFactoryWithParam(paramtable.Get()) + suite.chunkManager, _ = chunkManagerFactory.NewPersistentStorageChunkManager(ctx) suite.loader = NewLoader(suite.manager, suite.chunkManager) + initcore.InitRemoteChunkManager(paramtable.Get()) // Data schema := GenTestCollectionSchema("test", schemapb.DataType_Int64) @@ -73,15 +79,25 @@ func (suite *SegmentLoaderSuite) SetupTest() { suite.manager.Collection.Put(suite.collectionID, schema, indexMeta, loadMeta) } +func (suite *SegmentLoaderSuite) TearDownTest() { + ctx := context.Background() + for i := 0; i < suite.segmentNum; i++ { + suite.manager.Segment.Remove(suite.segmentID+int64(i), querypb.DataScope_All) + } + suite.chunkManager.RemoveWithPrefix(ctx, paramtable.Get().MinioCfg.RootPath.GetValue()) +} + func (suite *SegmentLoaderSuite) TestLoad() { ctx := context.Background() + msgLength := 4 + // Load sealed binlogs, statsLogs, err := SaveBinLog(ctx, suite.collectionID, suite.partitionID, suite.segmentID, - 4, + msgLength, suite.schema, suite.chunkManager, ) @@ -93,6 +109,7 @@ func (suite *SegmentLoaderSuite) TestLoad() { CollectionID: suite.collectionID, BinlogPaths: binlogs, Statslogs: statsLogs, + NumOfRows: int64(msgLength), }) suite.NoError(err) @@ -101,7 +118,7 @@ func (suite *SegmentLoaderSuite) TestLoad() { suite.collectionID, suite.partitionID, suite.segmentID+1, - 4, + 
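Because the loader no longer deserializes binlogs in Go, it cannot count rows itself; the row count now travels in querypb.SegmentLoadInfo.NumOfRows, which is why every test load info below gains that field (and why checkSegmentSize passes the configured local path into GetLocalUsedSize explicitly). An illustrative fragment in the style of these tests:

loadInfos = append(loadInfos, &querypb.SegmentLoadInfo{
	SegmentID:    segmentID,
	PartitionID:  suite.partitionID,
	CollectionID: suite.collectionID,
	BinlogPaths:  binlogs,
	Statslogs:    statsLogs,
	NumOfRows:    int64(msgLength), // required: rows are no longer counted from deserialized binlogs
})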
msgLength, suite.schema, suite.chunkManager, ) @@ -113,23 +130,24 @@ func (suite *SegmentLoaderSuite) TestLoad() { CollectionID: suite.collectionID, BinlogPaths: binlogs, Statslogs: statsLogs, + NumOfRows: int64(msgLength), }) suite.NoError(err) } func (suite *SegmentLoaderSuite) TestLoadMultipleSegments() { ctx := context.Background() - const SegmentNum = 5 - loadInfos := make([]*querypb.SegmentLoadInfo, 0, SegmentNum) + loadInfos := make([]*querypb.SegmentLoadInfo, 0, suite.segmentNum) + msgLength := 100 // Load sealed - for i := 0; i < SegmentNum; i++ { + for i := 0; i < suite.segmentNum; i++ { segmentID := suite.segmentID + int64(i) binlogs, statsLogs, err := SaveBinLog(ctx, suite.collectionID, suite.partitionID, segmentID, - 100, + msgLength, suite.schema, suite.chunkManager, ) @@ -140,6 +158,7 @@ func (suite *SegmentLoaderSuite) TestLoadMultipleSegments() { CollectionID: suite.collectionID, BinlogPaths: binlogs, Statslogs: statsLogs, + NumOfRows: int64(msgLength), }) } @@ -156,13 +175,13 @@ func (suite *SegmentLoaderSuite) TestLoadMultipleSegments() { // Load growing loadInfos = loadInfos[:0] - for i := 0; i < SegmentNum; i++ { - segmentID := suite.segmentID + SegmentNum + int64(i) + for i := 0; i < suite.segmentNum; i++ { + segmentID := suite.segmentID + int64(suite.segmentNum) + int64(i) binlogs, statsLogs, err := SaveBinLog(ctx, suite.collectionID, suite.partitionID, segmentID, - 100, + msgLength, suite.schema, suite.chunkManager, ) @@ -173,6 +192,7 @@ func (suite *SegmentLoaderSuite) TestLoadMultipleSegments() { CollectionID: suite.collectionID, BinlogPaths: binlogs, Statslogs: statsLogs, + NumOfRows: int64(msgLength), }) } @@ -190,17 +210,17 @@ func (suite *SegmentLoaderSuite) TestLoadMultipleSegments() { func (suite *SegmentLoaderSuite) TestLoadWithIndex() { ctx := context.Background() - const SegmentNum = 5 - loadInfos := make([]*querypb.SegmentLoadInfo, 0, SegmentNum) + loadInfos := make([]*querypb.SegmentLoadInfo, 0, suite.segmentNum) + msgLength := 100 // Load sealed - for i := 0; i < SegmentNum; i++ { + for i := 0; i < suite.segmentNum; i++ { segmentID := suite.segmentID + int64(i) binlogs, statsLogs, err := SaveBinLog(ctx, suite.collectionID, suite.partitionID, segmentID, - 100, + msgLength, suite.schema, suite.chunkManager, ) @@ -212,7 +232,7 @@ func (suite *SegmentLoaderSuite) TestLoadWithIndex() { suite.partitionID, segmentID, vecFields[0], - 100, + msgLength, IndexFaissIVFFlat, L2, suite.chunkManager, @@ -225,6 +245,7 @@ func (suite *SegmentLoaderSuite) TestLoadWithIndex() { BinlogPaths: binlogs, Statslogs: statsLogs, IndexInfos: []*querypb.FieldIndexInfo{indexInfo}, + NumOfRows: int64(msgLength), }) } @@ -239,17 +260,17 @@ func (suite *SegmentLoaderSuite) TestLoadWithIndex() { func (suite *SegmentLoaderSuite) TestLoadBloomFilter() { ctx := context.Background() - const SegmentNum = 5 - loadInfos := make([]*querypb.SegmentLoadInfo, 0, SegmentNum) + loadInfos := make([]*querypb.SegmentLoadInfo, 0, suite.segmentNum) + msgLength := 100 // Load sealed - for i := 0; i < SegmentNum; i++ { + for i := 0; i < suite.segmentNum; i++ { segmentID := suite.segmentID + int64(i) binlogs, statsLogs, err := SaveBinLog(ctx, suite.collectionID, suite.partitionID, segmentID, - 100, + msgLength, suite.schema, suite.chunkManager, ) @@ -261,6 +282,7 @@ func (suite *SegmentLoaderSuite) TestLoadBloomFilter() { CollectionID: suite.collectionID, BinlogPaths: binlogs, Statslogs: statsLogs, + NumOfRows: int64(msgLength), }) } @@ -277,17 +299,17 @@ func (suite *SegmentLoaderSuite) 
TestLoadBloomFilter() { func (suite *SegmentLoaderSuite) TestLoadDeltaLogs() { ctx := context.Background() - const SegmentNum = 5 - loadInfos := make([]*querypb.SegmentLoadInfo, 0, SegmentNum) + loadInfos := make([]*querypb.SegmentLoadInfo, 0, suite.segmentNum) + msgLength := 100 // Load sealed - for i := 0; i < SegmentNum; i++ { + for i := 0; i < suite.segmentNum; i++ { segmentID := suite.segmentID + int64(i) binlogs, statsLogs, err := SaveBinLog(ctx, suite.collectionID, suite.partitionID, segmentID, - 100, + msgLength, suite.schema, suite.chunkManager, ) @@ -308,6 +330,7 @@ func (suite *SegmentLoaderSuite) TestLoadDeltaLogs() { BinlogPaths: binlogs, Statslogs: statsLogs, Deltalogs: deltaLogs, + NumOfRows: int64(msgLength), }) } @@ -332,12 +355,13 @@ func (suite *SegmentLoaderSuite) TestLoadWithMmap() { defer paramtable.Get().Reset(key) ctx := context.Background() + msgLength := 100 // Load sealed binlogs, statsLogs, err := SaveBinLog(ctx, suite.collectionID, suite.partitionID, suite.segmentID, - 100, + msgLength, suite.schema, suite.chunkManager, ) @@ -349,6 +373,7 @@ func (suite *SegmentLoaderSuite) TestLoadWithMmap() { CollectionID: suite.collectionID, BinlogPaths: binlogs, Statslogs: statsLogs, + NumOfRows: int64(msgLength), }) suite.NoError(err) } @@ -356,12 +381,13 @@ func (suite *SegmentLoaderSuite) TestLoadWithMmap() { func (suite *SegmentLoaderSuite) TestPatchEntryNum() { ctx := context.Background() + msgLength := 100 segmentID := suite.segmentID binlogs, statsLogs, err := SaveBinLog(ctx, suite.collectionID, suite.partitionID, segmentID, - 100, + msgLength, suite.schema, suite.chunkManager, ) @@ -373,7 +399,7 @@ func (suite *SegmentLoaderSuite) TestPatchEntryNum() { suite.partitionID, segmentID, vecFields[0], - 100, + msgLength, IndexFaissIVFFlat, L2, suite.chunkManager, @@ -386,6 +412,7 @@ func (suite *SegmentLoaderSuite) TestPatchEntryNum() { BinlogPaths: binlogs, Statslogs: statsLogs, IndexInfos: []*querypb.FieldIndexInfo{indexInfo}, + NumOfRows: int64(msgLength), } // mock legacy binlog entry num is zero case diff --git a/internal/querynodev2/segments/segment_test.go b/internal/querynodev2/segments/segment_test.go index e4e1900c63..808dd05bc1 100644 --- a/internal/querynodev2/segments/segment_test.go +++ b/internal/querynodev2/segments/segment_test.go @@ -10,12 +10,14 @@ import ( "github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/proto/segcorepb" storage "github.com/milvus-io/milvus/internal/storage" + "github.com/milvus-io/milvus/internal/util/initcore" "github.com/milvus-io/milvus/pkg/util/funcutil" "github.com/milvus-io/milvus/pkg/util/paramtable" ) type SegmentSuite struct { suite.Suite + chunkManager storage.ChunkManager // Data manager *Manager @@ -33,6 +35,13 @@ func (suite *SegmentSuite) SetupSuite() { func (suite *SegmentSuite) SetupTest() { var err error + ctx := context.Background() + msgLength := 100 + + chunkManagerFactory := storage.NewChunkManagerFactoryWithParam(paramtable.Get()) + suite.chunkManager, _ = chunkManagerFactory.NewPersistentStorageChunkManager(ctx) + initcore.InitRemoteChunkManager(paramtable.Get()) + suite.collectionID = 100 suite.partitionID = 10 suite.segmentID = 1 @@ -63,6 +72,20 @@ func (suite *SegmentSuite) SetupTest() { ) suite.Require().NoError(err) + binlogs, _, err := SaveBinLog(ctx, + suite.collectionID, + suite.partitionID, + suite.segmentID, + msgLength, + schema, + suite.chunkManager, + ) + suite.Require().NoError(err) + for _, binlog := range binlogs { + err = 
suite.sealed.LoadFieldData(binlog.FieldID, int64(msgLength), binlog) + suite.Require().NoError(err) + } + suite.growing, err = NewSegment(suite.collection, suite.segmentID+1, suite.partitionID, @@ -75,19 +98,9 @@ func (suite *SegmentSuite) SetupTest() { ) suite.Require().NoError(err) - insertData, err := genInsertData(100, suite.collection.Schema()) + insertMsg, err := genInsertMsg(suite.collection, suite.partitionID, suite.growing.segmentID, msgLength) suite.Require().NoError(err) - insertRecord, err := storage.TransferInsertDataToInsertRecord(insertData) - suite.Require().NoError(err) - numRows := insertRecord.NumRows - for _, fieldData := range insertRecord.FieldsData { - err = suite.sealed.LoadField(numRows, fieldData) - suite.Require().NoError(err) - } - - insertMsg, err := genInsertMsg(suite.collection, suite.partitionID, suite.growing.segmentID, 100) - suite.Require().NoError(err) - insertRecord, err = storage.TransferInsertMsgToInsertRecord(suite.collection.Schema(), insertMsg) + insertRecord, err := storage.TransferInsertMsgToInsertRecord(suite.collection.Schema(), insertMsg) suite.Require().NoError(err) err = suite.growing.Insert(insertMsg.RowIDs, insertMsg.Timestamps, insertRecord) suite.Require().NoError(err) @@ -97,9 +110,11 @@ func (suite *SegmentSuite) SetupTest() { } func (suite *SegmentSuite) TearDownTest() { + ctx := context.Background() DeleteSegment(suite.sealed) DeleteSegment(suite.growing) DeleteCollection(suite.collection) + suite.chunkManager.RemoveWithPrefix(ctx, paramtable.Get().MinioCfg.RootPath.GetValue()) } func (suite *SegmentSuite) TestDelete() { diff --git a/internal/querynodev2/server.go b/internal/querynodev2/server.go index 9386e8d051..98744f9120 100644 --- a/internal/querynodev2/server.go +++ b/internal/querynodev2/server.go @@ -174,7 +174,7 @@ func (node *QueryNode) Register() error { } // InitSegcore set init params of segCore, such as chunckRows, SIMD type... 
-func (node *QueryNode) InitSegcore() { +func (node *QueryNode) InitSegcore() error { cEasyloggingYaml := C.CString(path.Join(paramtable.Get().BaseTable.GetConfigDir(), paramtable.DefaultEasyloggingYaml)) C.SegcoreInit(cEasyloggingYaml) C.free(unsafe.Pointer(cEasyloggingYaml)) @@ -211,7 +211,7 @@ func (node *QueryNode) InitSegcore() { C.InitCpuNum(cCPUNum) localDataRootPath := filepath.Join(paramtable.Get().LocalStorageCfg.Path.GetValue(), typeutil.QueryNodeRole) - initcore.InitLocalStorageConfig(localDataRootPath) + initcore.InitLocalChunkManager(localDataRootPath) mmapDirPath := paramtable.Get().QueryNodeCfg.MmapDirPath.GetValue() if len(mmapDirPath) > 0 { @@ -219,6 +219,7 @@ func (node *QueryNode) InitSegcore() { } initcore.InitTraceConfig(paramtable.Get()) + return initcore.InitRemoteChunkManager(paramtable.Get()) } // Init function init historical and streaming module to manage segments @@ -309,7 +310,12 @@ func (node *QueryNode) Init() error { // init pipeline manager node.pipelineManager = pipeline.NewManager(node.manager, node.tSafeManager, node.dispClient, node.delegators) - node.InitSegcore() + err = node.InitSegcore() + if err != nil { + log.Error("QueryNode init segcore failed", zap.Error(err)) + initError = err + return + } if paramtable.Get().QueryNodeCfg.GCEnabled.GetAsBool() { if paramtable.Get().QueryNodeCfg.GCHelperEnabled.GetAsBool() { action := func(GOGC uint32) { @@ -364,6 +370,9 @@ func (node *QueryNode) Stop() error { if node.dispClient != nil { node.dispClient.Close() } + + // safe stop + initcore.CleanRemoteChunkManager() }) return nil } diff --git a/internal/querynodev2/services_test.go b/internal/querynodev2/services_test.go index bf6edc346b..faa3996482 100644 --- a/internal/querynodev2/services_test.go +++ b/internal/querynodev2/services_test.go @@ -19,7 +19,6 @@ import ( "context" "encoding/json" "math/rand" - "os" "testing" "github.com/cockroachdb/errors" @@ -105,7 +104,9 @@ func (suite *ServiceSuite) SetupTest() { // init mock suite.factory = dependency.NewMockFactory(suite.T()) suite.msgStream = msgstream.NewMockMsgStream(suite.T()) - suite.chunkManagerFactory = storage.NewChunkManagerFactory("local", storage.RootPath("/tmp/milvus-test")) + // TODO:: cpp chunk manager not support local chunk manager + //suite.chunkManagerFactory = storage.NewChunkManagerFactory("local", storage.RootPath("/tmp/milvus-test")) + suite.chunkManagerFactory = storage.NewChunkManagerFactoryWithParam(paramtable.Get()) suite.factory.EXPECT().Init(mock.Anything).Return() suite.factory.EXPECT().NewPersistentStorageChunkManager(mock.Anything).Return(suite.chunkManagerFactory.NewPersistentStorageChunkManager(ctx)) @@ -131,9 +132,27 @@ func (suite *ServiceSuite) SetupTest() { } func (suite *ServiceSuite) TearDownTest() { + suite.node.UpdateStateCode(commonpb.StateCode_Healthy) + ctx := context.Background() + // ReleaseSegment, avoid throwing an instance of 'std::system_error' when stop node + resp, err := suite.node.ReleaseSegments(ctx, &querypb.ReleaseSegmentsRequest{ + Base: &commonpb.MsgBase{ + MsgType: commonpb.MsgType_ReleaseSegments, + TargetID: suite.node.session.ServerID, + }, + CollectionID: suite.collectionID, + PartitionIDs: suite.partitionIDs, + SegmentIDs: suite.validSegmentIDs, + NodeID: suite.node.session.ServerID, + Scope: querypb.DataScope_All, + Shard: suite.vchannel, + }) + suite.NoError(err) + suite.Equal(commonpb.ErrorCode_Success, resp.ErrorCode) + suite.node.vectorStorage.RemoveWithPrefix(ctx, paramtable.Get().MinioCfg.RootPath.GetValue()) + suite.node.Stop() 
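InitSegcore now returns an error because initializing segcore's remote chunk manager singleton can fail (for example when the object storage configuration is invalid), and Init aborts on that error instead of surfacing the problem at the first segment load; Stop pairs it with CleanRemoteChunkManager. A minimal sketch of that pairing, using the initcore helpers added in this diff:

// Illustrative start/stop pairing for the segcore chunk-manager singletons.
func startSegcoreStorage(localRootPath string) error {
	initcore.InitLocalChunkManager(localRootPath)
	return initcore.InitRemoteChunkManager(paramtable.Get())
}

func stopSegcoreStorage() {
	initcore.CleanRemoteChunkManager()
}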
suite.etcdClient.Close() - os.RemoveAll("/tmp/milvus-test") } func (suite *ServiceSuite) TestGetComponentStatesNormal() { @@ -467,6 +486,7 @@ func (suite *ServiceSuite) genSegmentLoadInfos(schema *schemapb.CollectionSchema PartitionID: suite.partitionIDs[i%partNum], CollectionID: suite.collectionID, InsertChannel: suite.vchannel, + NumOfRows: 1000, BinlogPaths: binlogs, Statslogs: statslogs, IndexInfos: []*querypb.FieldIndexInfo{indexes}, diff --git a/internal/util/dependency/factory.go b/internal/util/dependency/factory.go index 430856d718..fb6274e71f 100644 --- a/internal/util/dependency/factory.go +++ b/internal/util/dependency/factory.go @@ -44,6 +44,15 @@ func NewDefaultFactory(standAlone bool) *DefaultFactory { } } +// Only for test +func MockDefaultFactory(standAlone bool, params *paramtable.ComponentParam) *DefaultFactory { + return &DefaultFactory{ + standAlone: standAlone, + msgStreamFactory: smsgstream.NewRocksmqFactory("/tmp/milvus/rocksmq/"), + chunkManagerFactory: storage.NewChunkManagerFactoryWithParam(params), + } +} + // NewFactory creates a new instance of the DefaultFactory type. // If standAlone is true, the factory will operate in standalone mode. func NewFactory(standAlone bool) *DefaultFactory { diff --git a/internal/util/indexcgowrapper/build_index_info.go b/internal/util/indexcgowrapper/build_index_info.go new file mode 100644 index 0000000000..ae282b067c --- /dev/null +++ b/internal/util/indexcgowrapper/build_index_info.go @@ -0,0 +1,145 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package indexcgowrapper + +/* +#cgo pkg-config: milvus_indexbuilder +#include // free +#include "indexbuilder/index_c.h" +*/ +import "C" + +import ( + "fmt" + "unsafe" + + "github.com/golang/protobuf/proto" + + "github.com/milvus-io/milvus-proto/go-api/v2/commonpb" + "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" + "github.com/milvus-io/milvus/internal/proto/indexcgopb" + "github.com/milvus-io/milvus/internal/proto/indexpb" +) + +type BuildIndexInfo struct { + cBuildIndexInfo C.CBuildIndexInfo +} + +func NewBuildIndexInfo(config *indexpb.StorageConfig) (*BuildIndexInfo, error) { + var cBuildIndexInfo C.CBuildIndexInfo + + cAddress := C.CString(config.Address) + cBucketName := C.CString(config.BucketName) + cAccessKey := C.CString(config.AccessKeyID) + cAccessValue := C.CString(config.SecretAccessKey) + cRootPath := C.CString(config.RootPath) + cStorageType := C.CString(config.StorageType) + cIamEndPoint := C.CString(config.IAMEndpoint) + defer C.free(unsafe.Pointer(cAddress)) + defer C.free(unsafe.Pointer(cBucketName)) + defer C.free(unsafe.Pointer(cAccessKey)) + defer C.free(unsafe.Pointer(cAccessValue)) + defer C.free(unsafe.Pointer(cRootPath)) + defer C.free(unsafe.Pointer(cStorageType)) + defer C.free(unsafe.Pointer(cIamEndPoint)) + storageConfig := C.CStorageConfig{ + address: cAddress, + bucket_name: cBucketName, + access_key_id: cAccessKey, + access_key_value: cAccessValue, + root_path: cRootPath, + storage_type: cStorageType, + iam_endpoint: cIamEndPoint, + useSSL: C.bool(config.UseSSL), + useIAM: C.bool(config.UseIAM), + } + + status := C.NewBuildIndexInfo(&cBuildIndexInfo, storageConfig) + if err := HandleCStatus(&status, "NewBuildIndexInfo failed"); err != nil { + return nil, err + } + return &BuildIndexInfo{cBuildIndexInfo: cBuildIndexInfo}, nil +} + +func DeleteBuildIndexInfo(info *BuildIndexInfo) { + C.DeleteBuildIndexInfo(info.cBuildIndexInfo) +} + +func (bi *BuildIndexInfo) AppendFieldMetaInfo(collectionID int64, partitionID int64, segmentID int64, fieldID int64, fieldType schemapb.DataType) error { + cColID := C.int64_t(collectionID) + cParID := C.int64_t(partitionID) + cSegID := C.int64_t(segmentID) + cFieldID := C.int64_t(fieldID) + cintDType := uint32(fieldType) + status := C.AppendFieldMetaInfo(bi.cBuildIndexInfo, cColID, cParID, cSegID, cFieldID, cintDType) + return HandleCStatus(&status, "appendFieldMetaInfo failed") +} + +func (bi *BuildIndexInfo) AppendIndexMetaInfo(indexID int64, buildID int64, indexVersion int64) error { + cIndexID := C.int64_t(indexID) + cBuildID := C.int64_t(buildID) + cIndexVersion := C.int64_t(indexVersion) + + status := C.AppendIndexMetaInfo(bi.cBuildIndexInfo, cIndexID, cBuildID, cIndexVersion) + return HandleCStatus(&status, "appendIndexMetaInfo failed") +} + +func (bi *BuildIndexInfo) AppendBuildIndexParam(indexParams map[string]string) error { + if len(indexParams) == 0 { + return nil + } + protoIndexParams := &indexcgopb.IndexParams{ + Params: make([]*commonpb.KeyValuePair, 0), + } + for key, value := range indexParams { + protoIndexParams.Params = append(protoIndexParams.Params, &commonpb.KeyValuePair{Key: key, Value: value}) + } + indexParamsBlob, err := proto.Marshal(protoIndexParams) + if err != nil { + return fmt.Errorf("failed to marshal index params: %s", err) + } + + status := C.AppendBuildIndexParam(bi.cBuildIndexInfo, (*C.uint8_t)(unsafe.Pointer(&indexParamsBlob[0])), (C.uint64_t)(len(indexParamsBlob))) + return HandleCStatus(&status, "appendBuildIndexParam failed") +} + +func (bi *BuildIndexInfo) 
AppendBuildTypeParam(typeParams map[string]string) error { + if len(typeParams) == 0 { + return nil + } + protoTypeParams := &indexcgopb.TypeParams{ + Params: make([]*commonpb.KeyValuePair, 0), + } + for key, value := range typeParams { + protoTypeParams.Params = append(protoTypeParams.Params, &commonpb.KeyValuePair{Key: key, Value: value}) + } + typeParamsBlob, err := proto.Marshal(protoTypeParams) + if err != nil { + return fmt.Errorf("failed to marshal type params: %s", err) + } + + status := C.AppendBuildTypeParam(bi.cBuildIndexInfo, (*C.uint8_t)(unsafe.Pointer(&typeParamsBlob[0])), (C.uint64_t)(len(typeParamsBlob))) + return HandleCStatus(&status, "appendBuildTypeParam failed") +} + +func (bi *BuildIndexInfo) AppendInsertFile(filePath string) error { + cInsertFilePath := C.CString(filePath) + defer C.free(unsafe.Pointer(cInsertFilePath)) + + status := C.AppendInsertFilePath(bi.cBuildIndexInfo, cInsertFilePath) + return HandleCStatus(&status, "appendInsertFile failed") +} diff --git a/internal/util/indexcgowrapper/codec_index_test.go b/internal/util/indexcgowrapper/codec_index_test.go index 659ddecfb0..86c8b8414e 100644 --- a/internal/util/indexcgowrapper/codec_index_test.go +++ b/internal/util/indexcgowrapper/codec_index_test.go @@ -304,7 +304,7 @@ func genStorageConfig() *indexpb.StorageConfig { func TestCgoIndex(t *testing.T) { for _, testCase := range genIndexCase() { - index, err := NewCgoIndex(testCase.dtype, testCase.typeParams, testCase.indexParams, genStorageConfig()) + index, err := NewCgoIndex(testCase.dtype, testCase.typeParams, testCase.indexParams) assert.NoError(t, err, testCase) dataset := GenDataset(genFieldData(testCase.dtype, nb, dim)) @@ -313,7 +313,7 @@ func TestCgoIndex(t *testing.T) { blobs, err := index.Serialize() assert.NoError(t, err, testCase) - copyIndex, err := NewCgoIndex(testCase.dtype, testCase.typeParams, testCase.indexParams, genStorageConfig()) + copyIndex, err := NewCgoIndex(testCase.dtype, testCase.typeParams, testCase.indexParams) assert.NoError(t, err, testCase) assert.NoError(t, copyIndex.Load(blobs), testCase) diff --git a/internal/util/indexcgowrapper/helper.go b/internal/util/indexcgowrapper/helper.go index f167ac849e..5de29422f4 100644 --- a/internal/util/indexcgowrapper/helper.go +++ b/internal/util/indexcgowrapper/helper.go @@ -79,11 +79,13 @@ func HandleCStatus(status *C.CStatus, extraInfo string) error { return errors.New(finalMsg) } -func GetLocalUsedSize() (int64, error) { +func GetLocalUsedSize(path string) (int64, error) { var availableSize int64 cSize := (*C.int64_t)(&availableSize) + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) - status := C.GetLocalUsedSize(cSize) + status := C.GetLocalUsedSize(cPath, cSize) err := HandleCStatus(&status, "get local used size failed") if err != nil { return 0, err diff --git a/internal/util/indexcgowrapper/index.go b/internal/util/indexcgowrapper/index.go index 1180ccae3b..b9b12fa35a 100644 --- a/internal/util/indexcgowrapper/index.go +++ b/internal/util/indexcgowrapper/index.go @@ -8,13 +8,12 @@ package indexcgowrapper */ import "C" import ( + "context" "fmt" "path/filepath" "runtime" "unsafe" - "github.com/milvus-io/milvus/internal/proto/indexpb" - "github.com/golang/protobuf/proto" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb" @@ -39,6 +38,7 @@ type CodecIndex interface { Load([]*Blob) error Delete() error CleanLocalData() error + UpLoad() (map[string]int64, error) } var ( @@ -51,7 +51,7 @@ type CgoIndex struct { } // TODO: use proto.Marshal instead of 
proto.MarshalTextString for better compatibility. -func NewCgoIndex(dtype schemapb.DataType, typeParams, indexParams map[string]string, config *indexpb.StorageConfig) (CodecIndex, error) { +func NewCgoIndex(dtype schemapb.DataType, typeParams, indexParams map[string]string) (CodecIndex, error) { protoTypeParams := &indexcgopb.TypeParams{ Params: make([]*commonpb.KeyValuePair, 0), } @@ -73,37 +73,9 @@ func NewCgoIndex(dtype schemapb.DataType, typeParams, indexParams map[string]str defer C.free(unsafe.Pointer(typeParamsPointer)) defer C.free(unsafe.Pointer(indexParamsPointer)) - // TODO::xige-16 support embedded milvus - storageType := "minio" - cAddress := C.CString(config.Address) - cBucketName := C.CString(config.GetBucketName()) - cAccessKey := C.CString(config.GetAccessKeyID()) - cAccessValue := C.CString(config.GetSecretAccessKey()) - cRootPath := C.CString(config.GetRootPath()) - cStorageType := C.CString(storageType) - cIamEndPoint := C.CString(config.GetIAMEndpoint()) - defer C.free(unsafe.Pointer(cAddress)) - defer C.free(unsafe.Pointer(cBucketName)) - defer C.free(unsafe.Pointer(cAccessKey)) - defer C.free(unsafe.Pointer(cAccessValue)) - defer C.free(unsafe.Pointer(cRootPath)) - defer C.free(unsafe.Pointer(cStorageType)) - defer C.free(unsafe.Pointer(cIamEndPoint)) - storageConfig := C.CStorageConfig{ - address: cAddress, - bucket_name: cBucketName, - access_key_id: cAccessKey, - access_key_value: cAccessValue, - remote_root_path: cRootPath, - storage_type: cStorageType, - iam_endpoint: cIamEndPoint, - useSSL: C.bool(config.GetUseSSL()), - useIAM: C.bool(config.GetUseIAM()), - } - var indexPtr C.CIndex cintDType := uint32(dtype) - status := C.CreateIndex(cintDType, typeParamsPointer, indexParamsPointer, &indexPtr, storageConfig) + status := C.CreateIndex(cintDType, typeParamsPointer, indexParamsPointer, &indexPtr) if err := HandleCStatus(&status, "failed to create index"); err != nil { return nil, err } @@ -122,6 +94,21 @@ func NewCgoIndex(dtype schemapb.DataType, typeParams, indexParams map[string]str return index, nil } +func CreateIndex(ctx context.Context, buildIndexInfo *BuildIndexInfo) (CodecIndex, error) { + var indexPtr C.CIndex + status := C.CreateIndexV2(&indexPtr, buildIndexInfo.cBuildIndexInfo) + if err := HandleCStatus(&status, "failed to create index"); err != nil { + return nil, err + } + + index := &CgoIndex{ + indexPtr: indexPtr, + close: false, + } + + return index, nil +} + func (index *CgoIndex) Build(dataset *Dataset) error { switch dataset.DType { case schemapb.DataType_None: @@ -339,3 +326,38 @@ func (index *CgoIndex) CleanLocalData() error { status := C.CleanLocalData(index.indexPtr) return HandleCStatus(&status, "failed to clean cached data on disk") } + +func (index *CgoIndex) UpLoad() (map[string]int64, error) { + var cBinarySet C.CBinarySet + + status := C.SerializeIndexAndUpLoad(index.indexPtr, &cBinarySet) + defer func() { + if cBinarySet != nil { + C.DeleteBinarySet(cBinarySet) + } + }() + if err := HandleCStatus(&status, "failed to serialize index and upload index"); err != nil { + return nil, err + } + + res := make(map[string]int64) + indexFilePaths, err := GetBinarySetKeys(cBinarySet) + if err != nil { + return nil, err + } + for _, path := range indexFilePaths { + size, err := GetBinarySetSize(cBinarySet, path) + if err != nil { + return nil, err + } + res[path] = size + } + + runtime.SetFinalizer(index, func(index *CgoIndex) { + if index != nil && !index.close { + log.Error("there is leakage in index object, please check.") + } + }) + + return 
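On the index side, storage configuration moves out of NewCgoIndex and into BuildIndexInfo, and the serialized index is written back by segcore itself through UpLoad, which returns the map from index file path to size. A hedged sketch of the resulting build flow with the wrappers from this diff, assuming CreateIndex consumes the appended insert files inside segcore as the new wrapper suggests; the IDs, params, and storage config are placeholders supplied by the caller:

// Illustrative index build flow with BuildIndexInfo, CreateIndex and UpLoad.
func buildAndUpload(ctx context.Context, storageCfg *indexpb.StorageConfig,
	collectionID, partitionID, segmentID, fieldID, indexID, buildID, indexVersion int64,
	fieldType schemapb.DataType, typeParams, indexParams map[string]string,
	insertFiles []string) (map[string]int64, error) {

	info, err := NewBuildIndexInfo(storageCfg)
	if err != nil {
		return nil, err
	}
	defer DeleteBuildIndexInfo(info)

	if err := info.AppendFieldMetaInfo(collectionID, partitionID, segmentID, fieldID, fieldType); err != nil {
		return nil, err
	}
	if err := info.AppendIndexMetaInfo(indexID, buildID, indexVersion); err != nil {
		return nil, err
	}
	if err := info.AppendBuildTypeParam(typeParams); err != nil {
		return nil, err
	}
	if err := info.AppendBuildIndexParam(indexParams); err != nil {
		return nil, err
	}
	for _, f := range insertFiles {
		if err := info.AppendInsertFile(f); err != nil {
			return nil, err
		}
	}

	// Build inside segcore from the appended insert files.
	index, err := CreateIndex(ctx, info)
	if err != nil {
		return nil, err
	}
	defer index.Delete()

	// Serialize and upload through segcore's chunk manager; returns path -> size.
	return index.UpLoad()
}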
res, nil +} diff --git a/internal/util/indexcgowrapper/index_test.go b/internal/util/indexcgowrapper/index_test.go index a1088ba074..03c87f1658 100644 --- a/internal/util/indexcgowrapper/index_test.go +++ b/internal/util/indexcgowrapper/index_test.go @@ -119,7 +119,7 @@ func TestCIndex_New(t *testing.T) { for _, c := range generateTestCases() { typeParams, indexParams := generateParams(c.indexType, c.metricType) - index, err := NewCgoIndex(c.dtype, typeParams, indexParams, genStorageConfig()) + index, err := NewCgoIndex(c.dtype, typeParams, indexParams) assert.Equal(t, err, nil) assert.NotEqual(t, index, nil) @@ -132,7 +132,7 @@ func TestCIndex_BuildFloatVecIndex(t *testing.T) { for _, c := range generateFloatVectorTestCases() { typeParams, indexParams := generateParams(c.indexType, c.metricType) - index, err := NewCgoIndex(c.dtype, typeParams, indexParams, genStorageConfig()) + index, err := NewCgoIndex(c.dtype, typeParams, indexParams) assert.Equal(t, err, nil) assert.NotEqual(t, index, nil) @@ -149,7 +149,7 @@ func TestCIndex_BuildBinaryVecIndex(t *testing.T) { for _, c := range generateBinaryVectorTestCases() { typeParams, indexParams := generateParams(c.indexType, c.metricType) - index, err := NewCgoIndex(c.dtype, typeParams, indexParams, genStorageConfig()) + index, err := NewCgoIndex(c.dtype, typeParams, indexParams) assert.Equal(t, err, nil) assert.NotEqual(t, index, nil) @@ -166,7 +166,7 @@ func TestCIndex_Codec(t *testing.T) { for _, c := range generateTestCases() { typeParams, indexParams := generateParams(c.indexType, c.metricType) - index, err := NewCgoIndex(c.dtype, typeParams, indexParams, genStorageConfig()) + index, err := NewCgoIndex(c.dtype, typeParams, indexParams) assert.Equal(t, err, nil) assert.NotEqual(t, index, nil) @@ -183,7 +183,7 @@ func TestCIndex_Codec(t *testing.T) { blobs, err := index.Serialize() assert.Equal(t, err, nil) - copyIndex, err := NewCgoIndex(c.dtype, typeParams, indexParams, genStorageConfig()) + copyIndex, err := NewCgoIndex(c.dtype, typeParams, indexParams) assert.NotEqual(t, copyIndex, nil) assert.Equal(t, err, nil) err = copyIndex.Load(blobs) @@ -205,7 +205,7 @@ func TestCIndex_Delete(t *testing.T) { for _, c := range generateTestCases() { typeParams, indexParams := generateParams(c.indexType, c.metricType) - index, err := NewCgoIndex(c.dtype, typeParams, indexParams, genStorageConfig()) + index, err := NewCgoIndex(c.dtype, typeParams, indexParams) assert.Equal(t, err, nil) assert.NotEqual(t, index, nil) @@ -218,7 +218,7 @@ func TestCIndex_Error(t *testing.T) { indexParams := make(map[string]string) indexParams[common.IndexTypeKey] = "IVF_FLAT" indexParams[common.MetricTypeKey] = "L2" - indexPtr, err := NewCgoIndex(schemapb.DataType_FloatVector, nil, indexParams, genStorageConfig()) + indexPtr, err := NewCgoIndex(schemapb.DataType_FloatVector, nil, indexParams) assert.NoError(t, err) t.Run("Serialize error", func(t *testing.T) { diff --git a/internal/util/initcore/init_core.go b/internal/util/initcore/init_core.go index 0c601532d0..241f8ef5a4 100644 --- a/internal/util/initcore/init_core.go +++ b/internal/util/initcore/init_core.go @@ -17,24 +17,30 @@ package initcore /* -#cgo pkg-config: milvus_common +#cgo pkg-config: milvus_common milvus_storage #include #include #include "common/init_c.h" +#include "storage/storage_c.h" */ import "C" import ( + "fmt" "unsafe" + "github.com/cockroachdb/errors" + + "github.com/milvus-io/milvus-proto/go-api/v2/commonpb" + "github.com/milvus-io/milvus/pkg/log" 
"github.com/milvus-io/milvus/pkg/util/paramtable" ) -func InitLocalStorageConfig(path string) { +func InitLocalChunkManager(path string) { CLocalRootPath := C.CString(path) - C.InitLocalRootPath(CLocalRootPath) - C.free(unsafe.Pointer(CLocalRootPath)) + defer C.free(unsafe.Pointer(CLocalRootPath)) + C.InitLocalChunkManagerSingleton(CLocalRootPath) } func InitTraceConfig(params *paramtable.ComponentParam) { @@ -47,3 +53,57 @@ func InitTraceConfig(params *paramtable.ComponentParam) { } C.InitTrace(&config) } + +func InitRemoteChunkManager(params *paramtable.ComponentParam) error { + cAddress := C.CString(params.MinioCfg.Address.GetValue()) + cBucketName := C.CString(params.MinioCfg.BucketName.GetValue()) + cAccessKey := C.CString(params.MinioCfg.AccessKeyID.GetValue()) + cAccessValue := C.CString(params.MinioCfg.SecretAccessKey.GetValue()) + cRootPath := C.CString(params.MinioCfg.RootPath.GetValue()) + cStorageType := C.CString(params.CommonCfg.StorageType.GetValue()) + cIamEndPoint := C.CString(params.MinioCfg.IAMEndpoint.GetValue()) + defer C.free(unsafe.Pointer(cAddress)) + defer C.free(unsafe.Pointer(cBucketName)) + defer C.free(unsafe.Pointer(cAccessKey)) + defer C.free(unsafe.Pointer(cAccessValue)) + defer C.free(unsafe.Pointer(cRootPath)) + defer C.free(unsafe.Pointer(cStorageType)) + defer C.free(unsafe.Pointer(cIamEndPoint)) + storageConfig := C.CStorageConfig{ + address: cAddress, + bucket_name: cBucketName, + access_key_id: cAccessKey, + access_key_value: cAccessValue, + root_path: cRootPath, + storage_type: cStorageType, + iam_endpoint: cIamEndPoint, + useSSL: C.bool(params.MinioCfg.UseSSL.GetAsBool()), + useIAM: C.bool(params.MinioCfg.UseIAM.GetAsBool()), + } + + status := C.InitRemoteChunkManagerSingleton(storageConfig) + return HandleCStatus(&status, "InitRemoteChunkManagerSingleton failed") +} + +func CleanRemoteChunkManager() { + C.CleanRemoteChunkManagerSingleton() +} + +// HandleCStatus deals with the error returned from CGO +func HandleCStatus(status *C.CStatus, extraInfo string) error { + if status.error_code == 0 { + return nil + } + errorCode := status.error_code + errorName, ok := commonpb.ErrorCode_name[int32(errorCode)] + if !ok { + errorName = "UnknownError" + } + errorMsg := C.GoString(status.error_msg) + defer C.free(unsafe.Pointer(status.error_msg)) + + finalMsg := fmt.Sprintf("[%s] %s", errorName, errorMsg) + logMsg := fmt.Sprintf("%s, C Runtime Exception: %s\n", extraInfo, finalMsg) + log.Warn(logMsg) + return errors.New(finalMsg) +} diff --git a/pkg/util/indexparams/disk_index_params.go b/pkg/util/indexparams/disk_index_params.go index a06e27bb08..d9dee23d6c 100644 --- a/pkg/util/indexparams/disk_index_params.go +++ b/pkg/util/indexparams/disk_index_params.go @@ -189,17 +189,7 @@ func FillDiskIndexParams(params *paramtable.ComponentParam, indexParams map[stri // SetDiskIndexBuildParams set index build params with ratio params on indexNode // IndexNode cal build param with ratio params and cpu count, memory count... 
-func SetDiskIndexBuildParams(indexParams map[string]string, numRows int64) error { - dimStr, ok := indexParams[common.DimKey] - if !ok { - // type param dim has been put into index params before build index - return fmt.Errorf("type param dim not exist") - } - dim, err := strconv.ParseInt(dimStr, 10, 64) - if err != nil { - return err - } - +func SetDiskIndexBuildParams(indexParams map[string]string, fieldDataSize int64) error { pqCodeBudgetGBRatioStr, ok := indexParams[PQCodeBudgetRatioKey] if !ok { return fmt.Errorf("index param pqCodeBudgetGBRatio not exist") @@ -216,6 +206,7 @@ func SetDiskIndexBuildParams(indexParams map[string]string, numRows int64) error if err != nil { return err } + searchCacheBudgetGBRatioStr, ok := indexParams[SearchCacheBudgetRatioKey] if !ok { return fmt.Errorf("index param searchCacheBudgetGBRatio not exist") @@ -224,12 +215,10 @@ func SetDiskIndexBuildParams(indexParams map[string]string, numRows int64) error if err != nil { return err } - indexParams[PQCodeBudgetKey] = fmt.Sprintf("%f", - float32(getRowDataSizeOfFloatVector(numRows, dim))*float32(pqCodeBudgetGBRatio)/(1<<30)) + indexParams[PQCodeBudgetKey] = fmt.Sprintf("%f", float32(fieldDataSize)*float32(pqCodeBudgetGBRatio)/(1<<30)) indexParams[NumBuildThreadKey] = strconv.Itoa(int(float32(hardware.GetCPUNum()) * float32(buildNumThreadsRatio))) indexParams[BuildDramBudgetKey] = fmt.Sprintf("%f", float32(hardware.GetFreeMemoryCount())/(1<<30)) - indexParams[SearchCacheBudgetKey] = fmt.Sprintf("%f", - float32(getRowDataSizeOfFloatVector(numRows, dim))*float32(SearchCacheBudgetGBRatio)/(1<<30)) + indexParams[SearchCacheBudgetKey] = fmt.Sprintf("%f", float32(fieldDataSize)*float32(SearchCacheBudgetGBRatio)/(1<<30)) return nil } diff --git a/pkg/util/indexparams/disk_index_params_test.go b/pkg/util/indexparams/disk_index_params_test.go index 07f08ad802..4f7f9af2ce 100644 --- a/pkg/util/indexparams/disk_index_params_test.go +++ b/pkg/util/indexparams/disk_index_params_test.go @@ -128,15 +128,8 @@ func TestDiskIndexParams(t *testing.T) { indexParams[PQCodeBudgetRatioKey] = "0.125" indexParams[NumBuildThreadRatioKey] = "1.0" - err := SetDiskIndexBuildParams(indexParams, 100) - assert.Error(t, err) - - indexParams[common.DimKey] = "128" - err = SetDiskIndexBuildParams(indexParams, 100) - assert.Error(t, err) - indexParams[SearchCacheBudgetRatioKey] = "0.125" - err = SetDiskIndexBuildParams(indexParams, 100) + err := SetDiskIndexBuildParams(indexParams, 100) assert.NoError(t, err) indexParams[SearchCacheBudgetRatioKey] = "aabb" diff --git a/tests/integration/minicluster.go b/tests/integration/minicluster.go index 1f326c0ba2..04c37d9e2e 100644 --- a/tests/integration/minicluster.go +++ b/tests/integration/minicluster.go @@ -140,6 +140,9 @@ func StartMiniCluster(ctx context.Context, opts ...Option) (cluster *MiniCluster })) if cluster.factory == nil { + params.Save(params.LocalStorageCfg.Path.Key, "/tmp/milvus/") + params.Save(params.CommonCfg.StorageType.Key, "local") + params.Save(params.MinioCfg.RootPath.Key, "/tmp/milvus/") cluster.factory = dependency.NewDefaultFactory(true) chunkManager, err := cluster.factory.NewPersistentStorageChunkManager(cluster.ctx) if err != nil {
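SetDiskIndexBuildParams now takes the field's raw data size in bytes instead of a row count plus a dim type param, so the DiskANN budgets no longer assume a float vector of known dimension. The resulting arithmetic, shown as an illustrative fragment with placeholder values (the ratios come from the index params, as in the function above):

// e.g. 1,000,000 rows of dim-128 float vectors = 512,000,000 bytes of field data
fieldDataSize := int64(1000000) * 128 * 4
pqCodeBudgetGB := float32(fieldDataSize) * float32(pqCodeBudgetGBRatio) / (1 << 30)
searchCacheBudgetGB := float32(fieldDataSize) * float32(searchCacheBudgetGBRatio) / (1 << 30)
numBuildThreads := int(float32(hardware.GetCPUNum()) * float32(buildNumThreadsRatio))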