Migrate the ability to upload and download binlog to cpp (#22984)

Signed-off-by: xige-16 <xi.ge@zilliz.com>
This commit is contained in:
xige-16 2023-06-25 14:38:44 +08:00 committed by GitHub
parent 6a1eff3487
commit 04082b3de2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
150 changed files with 4648 additions and 3095 deletions

View File

@ -19,27 +19,27 @@
namespace milvus {
int64_t index_file_slice_size = DEFAULT_INDEX_FILE_SLICE_SIZE;
int64_t thread_core_coefficient = DEFAULT_THREAD_CORE_COEFFICIENT;
int cpu_num = DEFAULT_CPU_NUM;
int64_t FILE_SLICE_SIZE = DEFAULT_INDEX_FILE_SLICE_SIZE;
int64_t THREAD_CORE_COEFFICIENT = DEFAULT_THREAD_CORE_COEFFICIENT;
int CPU_NUM = DEFAULT_CPU_NUM;
// Sets the global index-file slice size from a megabyte count.
// NOTE(review): this span interleaves the pre- and post-change lines of a
// diff — the first assignment/log pair (MB-based `index_file_slice_size`)
// is the old code, the second (`FILE_SLICE_SIZE = size << 20`, bytes) is
// the new code. Only one pair should survive; reconcile before use.
void
SetIndexSliceSize(const int64_t size) {
index_file_slice_size = size;
LOG_SEGCORE_DEBUG_ << "set config index slice size: "
<< index_file_slice_size;
// New form stores bytes: `size` megabytes shifted left by 20.
FILE_SLICE_SIZE = size << 20;
LOG_SEGCORE_DEBUG_ << "set config index slice size (byte): "
<< FILE_SLICE_SIZE;
}
// Sets the global thread-pool core coefficient.
// NOTE(review): diff residue — both the old lowercase global
// (`thread_core_coefficient`) and the new uppercase one
// (`THREAD_CORE_COEFFICIENT`) appear here, and the log statement carries
// two `<<` continuation lines (old and new stream argument). Keep one.
void
SetThreadCoreCoefficient(const int64_t coefficient) {
thread_core_coefficient = coefficient;
THREAD_CORE_COEFFICIENT = coefficient;
LOG_SEGCORE_DEBUG_ << "set thread pool core coefficient: "
<< thread_core_coefficient;
<< THREAD_CORE_COEFFICIENT;
}
// Sets the global CPU count used for sizing thread pools.
// NOTE(review): diff residue — `cpu_num` (old) and `CPU_NUM` (new) are the
// same assignment before/after the rename; only one should remain.
void
SetCpuNum(const int num) {
cpu_num = num;
CPU_NUM = num;
}
} // namespace milvus

View File

@ -21,9 +21,9 @@
namespace milvus {
extern int64_t index_file_slice_size;
extern int64_t thread_core_coefficient;
extern int cpu_num;
extern int64_t FILE_SLICE_SIZE;
extern int64_t THREAD_CORE_COEFFICIENT;
extern int CPU_NUM;
void
SetIndexSliceSize(const int64_t size);

View File

@ -39,10 +39,10 @@ const char INDEX_BUILD_ID_KEY[] = "indexBuildID";
const char INDEX_ROOT_PATH[] = "index_files";
const char RAWDATA_ROOT_PATH[] = "raw_datas";
const int64_t DEFAULT_DISK_INDEX_MAX_MEMORY_LIMIT = 67108864; // bytes
const int64_t DEFAULT_FIELD_MAX_MEMORY_LIMIT = 67108864; // bytes
const int64_t DEFAULT_THREAD_CORE_COEFFICIENT = 50;
const int64_t DEFAULT_INDEX_FILE_SLICE_SIZE = 4; // megabytes
const int64_t DEFAULT_INDEX_FILE_SLICE_SIZE = 4194304; // bytes
const int DEFAULT_CPU_NUM = 1;

View File

@ -18,19 +18,24 @@
#include <map>
#include <string>
#include <vector>
#include "Types.h"
#include "common/CDataType.h"
// NOTE: field_id can be system field
// NOTE: Refer to common/SystemProperty.cpp for details
// TODO: use arrow to pass field data instead of proto
struct LoadFieldDataInfo {
struct FieldBinlogInfo {
int64_t field_id;
// const void* blob = nullptr;
const milvus::DataArray* field_data;
int64_t row_count{-1};
const char* mmap_dir_path{nullptr};
int64_t row_count = -1;
std::vector<std::string> insert_files;
};
struct LoadFieldDataInfo {
std::map<int64_t, FieldBinlogInfo> field_infos;
// Set null to disable mmap,
// mmap file path will be {mmap_dir_path}/{segment_id}/{field_id}
std::string mmap_dir_path = "";
};
struct LoadDeletedRecordInfo {

View File

@ -20,11 +20,10 @@
namespace milvus {
static const char* INDEX_FILE_SLICE_META = "SLICE_META";
static const char* META = "meta";
static const char* NAME = "name";
static const char* SLICE_NUM = "slice_num";
static const char* TOTAL_LEN = "total_len";
// Builds the canonical name of one slice of a sliced index file:
// "<prefix>_<slice_num>". Must stay in sync with Slice()/Assemble(),
// which generate and look up these keys.
std::string
GenSlicedFileName(const std::string& prefix, size_t slice_num) {
    std::string name = prefix;
    name += '_';
    name += std::to_string(slice_num);
    return name;
}
void
Slice(const std::string& prefix,
@ -42,8 +41,7 @@ Slice(const std::string& prefix,
auto size = static_cast<size_t>(ri - i);
auto slice_i = std::shared_ptr<uint8_t[]>(new uint8_t[size]);
memcpy(slice_i.get(), data_src->data.get() + i, size);
binarySet.Append(
prefix + "_" + std::to_string(slice_num), slice_i, ri - i);
binarySet.Append(GenSlicedFileName(prefix, slice_num), slice_i, ri - i);
i = ri;
}
ret[NAME] = prefix;
@ -68,7 +66,7 @@ Assemble(BinarySet& binarySet) {
auto p_data = std::shared_ptr<uint8_t[]>(new uint8_t[total_len]);
int64_t pos = 0;
for (auto i = 0; i < slice_num; ++i) {
auto slice_i_sp = binarySet.Erase(prefix + "_" + std::to_string(i));
auto slice_i_sp = binarySet.Erase(GenSlicedFileName(prefix, i));
memcpy(p_data.get() + pos,
slice_i_sp->data.get(),
static_cast<size_t>(slice_i_sp->size));
@ -90,17 +88,15 @@ Disassemble(BinarySet& binarySet) {
}
}
const int64_t slice_size_in_byte = index_file_slice_size << 20;
std::vector<std::string> slice_key_list;
for (auto& kv : binarySet.binary_map_) {
if (kv.second->size > slice_size_in_byte) {
if (kv.second->size > FILE_SLICE_SIZE) {
slice_key_list.push_back(kv.first);
}
}
for (auto& key : slice_key_list) {
Config slice_i;
Slice(
key, binarySet.Erase(key), slice_size_in_byte, binarySet, slice_i);
Slice(key, binarySet.Erase(key), FILE_SLICE_SIZE, binarySet, slice_i);
meta_info[META].emplace_back(slice_i);
}
if (!slice_key_list.empty()) {

View File

@ -20,6 +20,16 @@
namespace milvus {
// used for disassemble and assemble index data
const char INDEX_FILE_SLICE_META[] = "SLICE_META";
const char META[] = "meta";
const char NAME[] = "name";
const char SLICE_NUM[] = "slice_num";
const char TOTAL_LEN[] = "total_len";
std::string
GenSlicedFileName(const std::string& prefix, size_t slice_num);
void
Assemble(BinarySet& binarySet);

View File

@ -28,7 +28,6 @@
#include "common/FieldMeta.h"
#include "common/LoadInfo.h"
#include "common/Types.h"
#include "config/ConfigChunkManager.h"
#include "exceptions/EasyAssert.h"
#include "knowhere/dataset.h"
#include "knowhere/expected.h"
@ -209,263 +208,24 @@ MatchKnowhereError(knowhere::Status status) {
}
}
inline size_t
GetDataSize(const FieldMeta& field, size_t row_count, const DataArray* data) {
auto data_type = field.get_data_type();
if (datatype_is_variable(data_type)) {
switch (data_type) {
case DataType::VARCHAR:
case DataType::STRING: {
ssize_t size{};
for (auto& data : FIELD_DATA(data, string)) {
size += data.size();
}
return size;
}
case DataType::JSON: {
ssize_t size{};
for (auto& data : FIELD_DATA(data, json)) {
size += data.size();
}
return size;
}
default:
PanicInfo(fmt::format("not supported data type {}",
datatype_name(data_type)));
}
}
return field.get_sizeof() * row_count;
// Index types whose payload is managed on disk rather than fully in
// memory (currently only DiskANN). Returned by value; callers receive a
// copy of the static list.
inline std::vector<IndexType>
DISK_INDEX_LIST() {
    static const std::vector<IndexType> disk_index_types = {
        knowhere::IndexEnum::INDEX_DISKANN,
    };
    return disk_index_types;
}
inline void*
FillField(DataType data_type,
size_t size,
const LoadFieldDataInfo& info,
void* dst) {
auto data = info.field_data;
switch (data_type) {
case DataType::BOOL: {
return memcpy(dst, FIELD_DATA(data, bool).data(), size);
}
case DataType::INT8: {
auto src_data = FIELD_DATA(data, int);
std::vector<int8_t> data_raw(src_data.size());
std::copy_n(src_data.data(), src_data.size(), data_raw.data());
return memcpy(dst, data_raw.data(), size);
}
case DataType::INT16: {
auto src_data = FIELD_DATA(data, int);
std::vector<int16_t> data_raw(src_data.size());
std::copy_n(src_data.data(), src_data.size(), data_raw.data());
return memcpy(dst, data_raw.data(), size);
}
case DataType::INT32: {
return memcpy(dst, FIELD_DATA(data, int).data(), size);
}
case DataType::INT64: {
return memcpy(dst, FIELD_DATA(data, long).data(), size);
}
case DataType::FLOAT: {
return memcpy(dst, FIELD_DATA(data, float).data(), size);
}
case DataType::DOUBLE: {
return memcpy(dst, FIELD_DATA(data, double).data(), size);
}
case DataType::VARCHAR: {
char* dest = reinterpret_cast<char*>(dst);
for (auto& data : FIELD_DATA(data, string)) {
memcpy(dest, data.data(), data.size());
dest += data.size();
}
return dst;
}
case DataType::JSON: {
char* dest = reinterpret_cast<char*>(dst);
for (auto& data : FIELD_DATA(data, json)) {
memcpy(dest, data.data(), data.size());
dest += data.size();
}
return dst;
}
case DataType::VECTOR_FLOAT:
return memcpy(dst, VEC_FIELD_DATA(data, float).data(), size);
case DataType::VECTOR_BINARY:
return memcpy(dst, VEC_FIELD_DATA(data, binary), size);
default: {
PanicInfo("unsupported");
}
}
// Returns true iff `t` compares equal (operator==) to some element of the
// vector produced by invoking `list_func`.
template <typename T>
inline bool
is_in_list(const T& t, std::function<std::vector<T>()> list_func) {
    const std::vector<T> candidates = list_func();
    for (const auto& candidate : candidates) {
        if (candidate == t) {
            return true;
        }
    }
    return false;
}
// Writes one field's payload from `data` to file descriptor `fd`, returning
// the number of bytes written (the raw write(2) result for fixed-width
// types, an accumulated total for variable-width types).
// Fixed-width numeric/vector types are written in one write() of `size`
// bytes; INT8/INT16 are first narrowed from the proto's int storage into a
// temporary buffer. VARCHAR/JSON write each element back to back.
// NOTE(review): in the VARCHAR/JSON loops, `written < str.size()` compares
// signed ssize_t against unsigned size_t — a write() failure (-1) converts
// to a huge unsigned value, so the short-write check never fires on error
// and -1 can be folded into total_written. The `break` also swallows the
// error silently. Consider an explicit `written < 0 ||
// static_cast<size_t>(written) < str.size()` check that reports failure.
inline ssize_t
WriteFieldData(int fd, DataType data_type, const DataArray* data, size_t size) {
switch (data_type) {
case DataType::BOOL: {
return write(fd, FIELD_DATA(data, bool).data(), size);
}
case DataType::INT8: {
// Proto stores small ints as int32; narrow to int8 before writing.
auto src_data = FIELD_DATA(data, int);
std::vector<int8_t> data_raw(src_data.size());
std::copy_n(src_data.data(), src_data.size(), data_raw.data());
return write(fd, data_raw.data(), size);
}
case DataType::INT16: {
// Same narrowing dance as INT8, into int16.
auto src_data = FIELD_DATA(data, int);
std::vector<int16_t> data_raw(src_data.size());
std::copy_n(src_data.data(), src_data.size(), data_raw.data());
return write(fd, data_raw.data(), size);
}
case DataType::INT32: {
return write(fd, FIELD_DATA(data, int).data(), size);
}
case DataType::INT64: {
return write(fd, FIELD_DATA(data, long).data(), size);
}
case DataType::FLOAT: {
return write(fd, FIELD_DATA(data, float).data(), size);
}
case DataType::DOUBLE: {
return write(fd, FIELD_DATA(data, double).data(), size);
}
case DataType::VARCHAR: {
// Variable-width: write each string contiguously, no separators.
ssize_t total_written{0};
for (auto& str : FIELD_DATA(data, string)) {
ssize_t written = write(fd, str.data(), str.size());
if (written < str.size()) {
break;
}
total_written += written;
}
return total_written;
}
case DataType::JSON: {
// Same contiguous layout as VARCHAR, for serialized JSON blobs.
ssize_t total_written{0};
for (auto& json : FIELD_DATA(data, json)) {
ssize_t written = write(fd, json.data(), json.size());
if (written < json.size()) {
break;
}
total_written += written;
}
return total_written;
}
case DataType::VECTOR_FLOAT:
return write(fd, VEC_FIELD_DATA(data, float).data(), size);
case DataType::VECTOR_BINARY:
return write(fd, VEC_FIELD_DATA(data, binary), size);
default: {
PanicInfo("unsupported");
}
}
}
// CreateMap creates a memory mapping,
// if mmap enabled, this writes field data to disk and create a map to the file,
// otherwise this just alloc memory
inline void*
CreateMap(int64_t segment_id,
const FieldMeta& field_meta,
const LoadFieldDataInfo& info) {
static int mmap_flags = MAP_PRIVATE;
#ifdef MAP_POPULATE
// macOS doesn't support MAP_POPULATE
mmap_flags |= MAP_POPULATE;
#endif
// simdjson requires a padding following the json data
size_t padding = field_meta.get_data_type() == DataType::JSON
? simdjson::SIMDJSON_PADDING
: 0;
// Allocate memory
if (info.mmap_dir_path == nullptr) {
auto data_type = field_meta.get_data_type();
auto data_size =
GetDataSize(field_meta, info.row_count, info.field_data);
if (data_size == 0)
return nullptr;
// Use anon mapping so we are able to free these memory with munmap only
void* map = mmap(nullptr,
data_size + padding,
PROT_READ | PROT_WRITE,
mmap_flags | MAP_ANON,
-1,
0);
AssertInfo(
map != MAP_FAILED,
fmt::format("failed to create anon map, err: {}", strerror(errno)));
FillField(data_type, data_size, info, map);
return map;
}
auto filepath = std::filesystem::path(info.mmap_dir_path) /
std::to_string(segment_id) / std::to_string(info.field_id);
auto dir = filepath.parent_path();
std::filesystem::create_directories(dir);
int fd =
open(filepath.c_str(), O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR);
AssertInfo(fd != -1,
fmt::format("failed to create mmap file {}", filepath.c_str()));
auto data_type = field_meta.get_data_type();
size_t size = field_meta.get_sizeof() * info.row_count;
auto written = WriteFieldData(fd, data_type, info.field_data, size);
AssertInfo(
written == size ||
written != -1 && datatype_is_variable(field_meta.get_data_type()),
fmt::format(
"failed to write data file {}, written {} but total {}, err: {}",
filepath.c_str(),
written,
size,
strerror(errno)));
int ok = fsync(fd);
AssertInfo(ok == 0,
fmt::format("failed to fsync mmap data file {}, err: {}",
filepath.c_str(),
strerror(errno)));
// Empty field
if (written == 0) {
return nullptr;
}
auto map = mmap(nullptr, written + padding, PROT_READ, mmap_flags, fd, 0);
AssertInfo(map != MAP_FAILED,
fmt::format("failed to create map for data file {}, err: {}",
filepath.c_str(),
strerror(errno)));
#ifndef MAP_POPULATE
// Manually access the mapping to populate it
const size_t page_size = getpagesize();
char* begin = (char*)map;
char* end = begin + written;
for (char* page = begin; page < end; page += page_size) {
char value = page[0];
}
#endif
// unlink this data file so
// then it will be auto removed after we don't need it again
ok = unlink(filepath.c_str());
AssertInfo(ok == 0,
fmt::format("failed to unlink mmap data file {}, err: {}",
filepath.c_str(),
strerror(errno)));
ok = close(fd);
AssertInfo(ok == 0,
fmt::format("failed to close data file {}, err: {}",
filepath.c_str(),
strerror(errno)));
return map;
// True iff `index_type` is a disk-based index type, i.e. appears in the
// list produced by DISK_INDEX_LIST() (currently only DiskANN).
inline bool
is_in_disk_list(const IndexType& index_type) {
return is_in_list<IndexType>(index_type, DISK_INDEX_LIST);
}
} // namespace milvus

View File

@ -20,36 +20,24 @@
#include "common/init_c.h"
#include <string>
#include "config/ConfigChunkManager.h"
#include "common/Slice.h"
#include "common/Common.h"
#include "common/Tracer.h"
#include "log/Log.h"
std::once_flag flag1, flag2, flag3, flag4;
std::once_flag flag1, flag2, flag3;
std::once_flag traceFlag;
// One-time initialization of the local storage root path; repeat calls are
// no-ops because of the std::call_once guard on flag1.
// NOTE(review): "ChunkMangerConfig" (sic) is the existing project spelling —
// do not "fix" it here without renaming the class itself.
void
InitLocalRootPath(const char* root_path) {
std::string local_path_root(root_path);
std::call_once(
flag1,
[](std::string path) {
milvus::ChunkMangerConfig::SetLocalRootPath(path);
},
local_path_root);
}
void
InitIndexSliceSize(const int64_t size) {
std::call_once(
flag2, [](int64_t size) { milvus::SetIndexSliceSize(size); }, size);
flag1, [](int64_t size) { milvus::SetIndexSliceSize(size); }, size);
}
void
InitThreadCoreCoefficient(const int64_t value) {
std::call_once(
flag3,
flag2,
[](int64_t value) { milvus::SetThreadCoreCoefficient(value); },
value);
}
@ -57,7 +45,7 @@ InitThreadCoreCoefficient(const int64_t value) {
void
InitCpuNum(const int value) {
std::call_once(
flag4, [](int value) { milvus::SetCpuNum(value); }, value);
flag3, [](int value) { milvus::SetCpuNum(value); }, value);
}
void

View File

@ -33,9 +33,6 @@ InitThreadCoreCoefficient(const int64_t);
void
InitCpuNum(const int);
void
InitLocalRootPath(const char*);
void
InitTrace(CTraceConfig* config);

View File

@ -69,16 +69,6 @@ typedef struct CProto {
int64_t proto_size;
} CProto;
typedef struct CLoadFieldDataInfo {
int64_t field_id;
const uint8_t* blob;
uint64_t blob_size;
int64_t row_count;
// Set null to disable mmap,
// mmap file path will be {mmap_dir_path}/{segment_id}/{field_id}
const char* mmap_dir_path;
} CLoadFieldDataInfo;
typedef struct CLoadDeletedRecordInfo {
void* timestamps;
const uint8_t* primary_keys;
@ -91,7 +81,7 @@ typedef struct CStorageConfig {
const char* bucket_name;
const char* access_key_id;
const char* access_key_value;
const char* remote_root_path;
const char* root_path;
const char* storage_type;
const char* iam_endpoint;
bool useSSL;

View File

@ -22,7 +22,6 @@ endif()
set(CONFIG_SRC
ConfigKnowhere.cpp
ConfigChunkManager.cpp
)
add_library(milvus_config STATIC ${CONFIG_SRC})

View File

@ -22,12 +22,10 @@
namespace milvus::index {
//// TODO: optimize here.
class BoolIndex : public ScalarIndexSort<bool> {};
using BoolIndexPtr = std::shared_ptr<BoolIndex>;
using BoolIndexPtr = std::shared_ptr<ScalarIndexSort<bool>>;
inline BoolIndexPtr
CreateBoolIndex() {
return std::make_unique<BoolIndex>();
CreateBoolIndex(storage::FileManagerImplPtr file_manager = nullptr) {
return std::make_unique<ScalarIndexSort<bool>>(file_manager);
}
} // namespace milvus::index

View File

@ -33,6 +33,9 @@ class IndexBase {
virtual void
Load(const BinarySet& binary_set, const Config& config = {}) = 0;
virtual void
Load(const Config& config = {}) = 0;
virtual void
BuildWithRawData(size_t n,
const void* values,
@ -41,9 +44,15 @@ class IndexBase {
virtual void
BuildWithDataset(const DatasetPtr& dataset, const Config& config = {}) = 0;
virtual void
Build(const Config& config = {}) = 0;
virtual int64_t
Count() = 0;
virtual BinarySet
Upload(const Config& config = {}) = 0;
protected:
IndexType index_type_ = "";
};

View File

@ -23,8 +23,9 @@ namespace milvus::index {
template <typename T>
inline ScalarIndexPtr<T>
IndexFactory::CreateScalarIndex(const IndexType& index_type) {
return CreateScalarIndexSort<T>();
IndexFactory::CreateScalarIndex(const IndexType& index_type,
storage::FileManagerImplPtr file_manager) {
return CreateScalarIndexSort<T>(file_manager);
}
// template <>
@ -35,9 +36,10 @@ IndexFactory::CreateScalarIndex(const IndexType& index_type) {
template <>
inline ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex(const IndexType& index_type) {
IndexFactory::CreateScalarIndex(const IndexType& index_type,
storage::FileManagerImplPtr file_manager) {
#if defined(__linux__) || defined(__APPLE__)
return CreateStringIndexMarisa();
return CreateStringIndexMarisa(file_manager);
#else
throw std::runtime_error("unsupported platform");
#endif

View File

@ -33,35 +33,36 @@ IndexFactory::CreateIndex(const CreateIndexInfo& create_index_info,
return CreateVectorIndex(create_index_info, file_manager);
}
return CreateScalarIndex(create_index_info);
return CreateScalarIndex(create_index_info, file_manager);
}
IndexBasePtr
IndexFactory::CreateScalarIndex(const CreateIndexInfo& create_index_info) {
IndexFactory::CreateScalarIndex(const CreateIndexInfo& create_index_info,
storage::FileManagerImplPtr file_manager) {
auto data_type = create_index_info.field_type;
auto index_type = create_index_info.index_type;
switch (data_type) {
// create scalar index
case DataType::BOOL:
return CreateScalarIndex<bool>(index_type);
return CreateScalarIndex<bool>(index_type, file_manager);
case DataType::INT8:
return CreateScalarIndex<int8_t>(index_type);
return CreateScalarIndex<int8_t>(index_type, file_manager);
case DataType::INT16:
return CreateScalarIndex<int16_t>(index_type);
return CreateScalarIndex<int16_t>(index_type, file_manager);
case DataType::INT32:
return CreateScalarIndex<int32_t>(index_type);
return CreateScalarIndex<int32_t>(index_type, file_manager);
case DataType::INT64:
return CreateScalarIndex<int64_t>(index_type);
return CreateScalarIndex<int64_t>(index_type, file_manager);
case DataType::FLOAT:
return CreateScalarIndex<float>(index_type);
return CreateScalarIndex<float>(index_type, file_manager);
case DataType::DOUBLE:
return CreateScalarIndex<double>(index_type);
return CreateScalarIndex<double>(index_type, file_manager);
// create string index
case DataType::STRING:
case DataType::VARCHAR:
return CreateScalarIndex<std::string>(index_type);
return CreateScalarIndex<std::string>(index_type, file_manager);
default:
throw std::invalid_argument(
std::string("invalid data type to build index: ") +
@ -93,10 +94,12 @@ IndexFactory::CreateVectorIndex(const CreateIndexInfo& create_index_info,
#endif
if (is_in_nm_list(index_type)) {
return std::make_unique<VectorMemNMIndex>(index_type, metric_type);
return std::make_unique<VectorMemNMIndex>(
index_type, metric_type, file_manager);
}
// create mem index
return std::make_unique<VectorMemIndex>(index_type, metric_type);
return std::make_unique<VectorMemIndex>(
index_type, metric_type, file_manager);
}
} // namespace milvus::index

View File

@ -21,7 +21,6 @@
#include <shared_mutex>
#include "common/type_c.h"
#include "config/ConfigChunkManager.h"
#include "index/Index.h"
#include "index/ScalarIndex.h"
#include "index/VectorIndex.h"
@ -29,11 +28,6 @@
#include "storage/Types.h"
#include "storage/FileManager.h"
#ifdef BUILD_DISK_ANN
#include "storage/LocalChunkManager.h"
#include "storage/MinioChunkManager.h"
#endif
namespace milvus::index {
class IndexFactory {
@ -61,14 +55,16 @@ class IndexFactory {
storage::FileManagerImplPtr file_manager);
IndexBasePtr
CreateScalarIndex(const CreateIndexInfo& create_index_info);
CreateScalarIndex(const CreateIndexInfo& create_index_info,
storage::FileManagerImplPtr file_manager = nullptr);
// IndexBasePtr
// CreateIndex(DataType dtype, const IndexType& index_type);
private:
template <typename T>
ScalarIndexPtr<T>
CreateScalarIndex(const IndexType& index_type);
CreateScalarIndex(const IndexType& index_type,
storage::FileManagerImplPtr file_manager = nullptr);
};
} // namespace milvus::index

View File

@ -24,22 +24,64 @@
#include "Meta.h"
#include "common/Utils.h"
#include "common/Slice.h"
#include "index/Utils.h"
namespace milvus::index {
template <typename T>
inline ScalarIndexSort<T>::ScalarIndexSort() : is_built_(false), data_() {
}
template <typename T>
inline ScalarIndexSort<T>::ScalarIndexSort(const size_t n, const T* values)
: is_built_(false) {
ScalarIndexSort<T>::BuildWithDataset(n, values);
inline ScalarIndexSort<T>::ScalarIndexSort(
storage::FileManagerImplPtr file_manager)
: is_built_(false), data_() {
if (file_manager != nullptr) {
file_manager_ = std::dynamic_pointer_cast<storage::MemFileManagerImpl>(
file_manager);
}
}
template <typename T>
inline void
ScalarIndexSort<T>::Build(const size_t n, const T* values) {
ScalarIndexSort<T>::Build(const Config& config) {
if (is_built_)
return;
auto insert_files =
GetValueFromConfig<std::vector<std::string>>(config, "insert_files");
AssertInfo(insert_files.has_value(),
"insert file paths is empty when build index");
auto field_datas =
file_manager_->CacheRawDataToMemory(insert_files.value());
int64_t total_num_rows = 0;
for (auto data : field_datas) {
total_num_rows += data->get_num_rows();
}
if (total_num_rows == 0) {
// todo: throw an exception
throw std::invalid_argument(
"ScalarIndexSort cannot build null values!");
}
data_.reserve(total_num_rows);
int64_t offset = 0;
for (auto data : field_datas) {
auto slice_num = data->get_num_rows();
for (size_t i = 0; i < slice_num; ++i) {
auto value = reinterpret_cast<const T*>(data->RawValue(i));
data_.emplace_back(IndexStructure(*value, offset));
offset++;
}
}
std::sort(data_.begin(), data_.end());
idx_to_offsets_.resize(total_num_rows);
for (size_t i = 0; i < total_num_rows; ++i) {
idx_to_offsets_[data_[i].idx_] = i;
}
is_built_ = true;
}
template <typename T>
inline void
ScalarIndexSort<T>::Build(size_t n, const T* values) {
if (is_built_)
return;
if (n == 0) {
@ -82,11 +124,26 @@ ScalarIndexSort<T>::Serialize(const Config& config) {
return res_set;
}
// Serializes the index and hands the binaries to the file manager for
// upload to remote storage. Returns a BinarySet that describes the remote
// layout only: one entry per remote path with its file size and a null
// data pointer (the bytes live remotely, not in this set).
template <typename T>
inline BinarySet
ScalarIndexSort<T>::Upload(const Config& config) {
auto binary_set = Serialize(config);
file_manager_->AddFile(binary_set);
auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize();
BinarySet ret;
for (auto& file : remote_paths_to_size) {
ret.Append(file.first, nullptr, file.second);
}
return ret;
}
template <typename T>
inline void
ScalarIndexSort<T>::Load(const BinarySet& index_binary, const Config& config) {
ScalarIndexSort<T>::LoadWithoutAssemble(const BinarySet& index_binary,
const Config& config) {
size_t index_size;
milvus::Assemble(const_cast<BinarySet&>(index_binary));
auto index_length = index_binary.GetByName("index_length");
memcpy(&index_size, index_length->data.get(), (size_t)index_length->size);
@ -100,6 +157,34 @@ ScalarIndexSort<T>::Load(const BinarySet& index_binary, const Config& config) {
is_built_ = true;
}
// Loads the index from an in-memory BinarySet: first reassembles any
// sliced binaries, then delegates to LoadWithoutAssemble.
// NOTE(review): Assemble mutates the set in place despite the const
// reference parameter (hence the const_cast) — the "const" in this
// signature is not honored.
template <typename T>
inline void
ScalarIndexSort<T>::Load(const BinarySet& index_binary, const Config& config) {
milvus::Assemble(const_cast<BinarySet&>(index_binary));
LoadWithoutAssemble(index_binary, config);
}
// Loads the index from remote storage: downloads the files listed under
// config["index_files"] via the file manager, reassembles sliced data,
// wraps each buffer in a BinarySet, and delegates to LoadWithoutAssemble.
// NOTE(review): the assertion message says "disk ann index" but this is
// ScalarIndexSort — looks copy-pasted; consider correcting the message.
template <typename T>
inline void
ScalarIndexSort<T>::Load(const Config& config) {
auto index_files =
GetValueFromConfig<std::vector<std::string>>(config, "index_files");
AssertInfo(index_files.has_value(),
"index file paths is empty when load disk ann index");
auto index_datas = file_manager_->LoadIndexToMemory(index_files.value());
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->Size();
// Non-owning view: a no-op deleter so the shared_ptr never frees the
// buffer, which remains owned by index_datas for this scope.
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
binary_set.Append(key, buf, size);
}
LoadWithoutAssemble(binary_set, config);
}
template <typename T>
inline const TargetBitmap
ScalarIndexSort<T>::In(const size_t n, const T* values) {

View File

@ -21,16 +21,19 @@
#include <utility>
#include <vector>
#include <string>
#include <map>
#include "index/IndexStructure.h"
#include "index/ScalarIndex.h"
#include "storage/MemFileManagerImpl.h"
namespace milvus::index {
template <typename T>
class ScalarIndexSort : public ScalarIndex<T> {
public:
ScalarIndexSort();
ScalarIndexSort(size_t n, const T* values);
explicit ScalarIndexSort(
storage::FileManagerImplPtr file_manager = nullptr);
BinarySet
Serialize(const Config& config) override;
@ -38,6 +41,9 @@ class ScalarIndexSort : public ScalarIndex<T> {
void
Load(const BinarySet& index_binary, const Config& config = {}) override;
void
Load(const Config& config = {}) override;
int64_t
Count() override {
return data_.size();
@ -46,6 +52,9 @@ class ScalarIndexSort : public ScalarIndex<T> {
void
Build(size_t n, const T* values) override;
void
Build(const Config& config = {}) override;
const TargetBitmap
In(size_t n, const T* values) override;
@ -69,6 +78,9 @@ class ScalarIndexSort : public ScalarIndex<T> {
return (int64_t)data_.size();
}
BinarySet
Upload(const Config& config = {}) override;
public:
const std::vector<IndexStructure<T>>&
GetData() {
@ -80,11 +92,15 @@ class ScalarIndexSort : public ScalarIndex<T> {
return is_built_;
}
void
LoadWithoutAssemble(const BinarySet& binary_set, const Config& config);
private:
bool is_built_;
Config config_;
std::vector<int32_t> idx_to_offsets_; // used to retrieve.
std::vector<IndexStructure<T>> data_;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
};
template <typename T>
@ -97,7 +113,7 @@ using ScalarIndexSortPtr = std::unique_ptr<ScalarIndexSort<T>>;
namespace milvus::index {
template <typename T>
inline ScalarIndexSortPtr<T>
CreateScalarIndexSort() {
return std::make_unique<ScalarIndexSort<T>>();
CreateScalarIndexSort(storage::FileManagerImplPtr file_manager = nullptr) {
return std::make_unique<ScalarIndexSort<T>>(file_manager);
}
} // namespace milvus::index

View File

@ -31,11 +31,77 @@ namespace milvus::index {
#if defined(__linux__) || defined(__APPLE__)
// Exception thrown when a POSIX (unistd) read/write call fails while the
// marisa trie is serialized through a temporary file.
class UnistdException : public std::runtime_error {
 public:
    explicit UnistdException(const std::string& msg)
        : std::runtime_error(msg) {
    }

    // Defaulted instead of an empty user-provided body; `override` makes
    // the virtual relationship to std::runtime_error's destructor explicit.
    ~UnistdException() override = default;
};
// Constructs the index, optionally binding a file manager used for
// uploading/downloading binlog and index files.
// NOTE(review): if `file_manager` is non-null but not actually a
// MemFileManagerImpl, dynamic_pointer_cast yields nullptr and
// file_manager_ silently stays empty — confirm callers guarantee the
// concrete type, or assert on the cast result.
StringIndexMarisa::StringIndexMarisa(storage::FileManagerImplPtr file_manager) {
if (file_manager != nullptr) {
file_manager_ = std::dynamic_pointer_cast<storage::MemFileManagerImpl>(
file_manager);
}
}
// Size of the underlying marisa trie (trie_.size(); presumably its key
// count — see the marisa API for exact semantics).
int64_t
StringIndexMarisa::Size() {
return trie_.size();
}
// Returns true when `str_id` is a usable marisa key id.
// The former `str_id >= 0` clause was removed: size_t is unsigned, so the
// comparison was always true (and trips -Wtype-limits); the only invalid
// value is the MARISA_INVALID_KEY_ID sentinel.
bool
valid_str_id(size_t str_id) {
    return str_id != MARISA_INVALID_KEY_ID;
}
// Builds the string index from raw binlog data: downloads the files listed
// under config["insert_files"] through the file manager, builds the marisa
// trie over all strings, then records each row's trie key id so row
// offsets can be mapped to keys. Throws if the index was already built.
void
StringIndexMarisa::Build(const Config& config) {
if (built_) {
throw std::runtime_error("index has been built");
}
auto insert_files =
GetValueFromConfig<std::vector<std::string>>(config, "insert_files");
AssertInfo(insert_files.has_value(),
"insert file paths is empty when build index");
auto field_datas =
file_manager_->CacheRawDataToMemory(insert_files.value());
int64_t total_num_rows = 0;
// fill key set.
marisa::Keyset keyset;
for (auto data : field_datas) {
auto slice_num = data->get_num_rows();
for (size_t i = 0; i < slice_num; ++i) {
keyset.push_back(
(*static_cast<const std::string*>(data->RawValue(i))).c_str());
}
total_num_rows += slice_num;
}
trie_.build(keyset);
// fill str_ids_
// Second pass: resolve each row's string to the key id the built trie
// assigned it; every string came from the keyset, so lookup must succeed.
str_ids_.resize(total_num_rows);
int64_t offset = 0;
for (auto data : field_datas) {
auto slice_num = data->get_num_rows();
for (size_t i = 0; i < slice_num; ++i) {
auto str_id =
lookup(*static_cast<const std::string*>(data->RawValue(i)));
AssertInfo(valid_str_id(str_id), "invalid marisa key");
str_ids_[offset++] = str_id;
}
}
// fill str_ids_to_offsets_
fill_offsets();
built_ = true;
}
void
StringIndexMarisa::Build(size_t n, const std::string* values) {
if (built_) {
@ -68,15 +134,17 @@ StringIndexMarisa::Serialize(const Config& config) {
trie_.write(fd);
auto size = get_file_size(fd);
auto buf = new uint8_t[size];
auto index_data = std::shared_ptr<uint8_t[]>(new uint8_t[size]);
while (read(fd, buf, size) != size) {
lseek(fd, 0, SEEK_SET);
}
std::shared_ptr<uint8_t[]> index_data(buf);
auto status = read(fd, index_data.get(), size);
close(fd);
remove(file.c_str());
if (status != size) {
throw UnistdException("read index from fd error, errorCode is " +
std::to_string(status));
}
auto str_ids_len = str_ids_.size() * sizeof(size_t);
std::shared_ptr<uint8_t[]> str_ids(new uint8_t[str_ids_len]);
@ -86,15 +154,28 @@ StringIndexMarisa::Serialize(const Config& config) {
res_set.Append(MARISA_TRIE_INDEX, index_data, size);
res_set.Append(MARISA_STR_IDS, str_ids, str_ids_len);
milvus::Disassemble(res_set);
Disassemble(res_set);
return res_set;
}
void
StringIndexMarisa::Load(const BinarySet& set, const Config& config) {
milvus::Assemble(const_cast<BinarySet&>(set));
// Serializes the index and hands the binaries to the file manager for
// upload to remote storage. Returns a BinarySet describing the remote
// layout only: one entry per remote path with its file size and a null
// data pointer (the bytes live remotely, not in this set).
BinarySet
StringIndexMarisa::Upload(const Config& config) {
auto binary_set = Serialize(config);
file_manager_->AddFile(binary_set);
auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize();
BinarySet ret;
for (auto& file : remote_paths_to_size) {
ret.Append(file.first, nullptr, file.second);
}
return ret;
}
void
StringIndexMarisa::LoadWithoutAssemble(const BinarySet& set,
const Config& config) {
auto uuid = boost::uuids::random_generator()();
auto uuid_string = boost::uuids::to_string(uuid);
auto file = std::string("/tmp/") + uuid_string;
@ -105,8 +186,13 @@ StringIndexMarisa::Load(const BinarySet& set, const Config& config) {
auto fd = open(
file.c_str(), O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IXUSR);
lseek(fd, 0, SEEK_SET);
while (write(fd, index->data.get(), len) != len) {
lseek(fd, 0, SEEK_SET);
auto status = write(fd, index->data.get(), len);
if (status != len) {
close(fd);
remove(file.c_str());
throw UnistdException("write index to fd error, errorCode is " +
std::to_string(status));
}
lseek(fd, 0, SEEK_SET);
@ -122,9 +208,30 @@ StringIndexMarisa::Load(const BinarySet& set, const Config& config) {
fill_offsets();
}
bool
valid_str_id(size_t str_id) {
return str_id >= 0 && str_id != MARISA_INVALID_KEY_ID;
// Loads the index from an in-memory BinarySet: reassembles sliced
// binaries, then delegates to LoadWithoutAssemble.
// NOTE(review): Assemble mutates the set in place despite the const
// reference parameter (hence the const_cast).
void
StringIndexMarisa::Load(const BinarySet& set, const Config& config) {
milvus::Assemble(const_cast<BinarySet&>(set));
LoadWithoutAssemble(set, config);
}
// Loads the index from remote storage: downloads the files listed under
// config["index_files"] via the file manager, reassembles sliced data,
// wraps each buffer in a BinarySet, and delegates to LoadWithoutAssemble.
void
StringIndexMarisa::Load(const Config& config) {
auto index_files =
GetValueFromConfig<std::vector<std::string>>(config, "index_files");
AssertInfo(index_files.has_value(),
"index file paths is empty when load index");
auto index_datas = file_manager_->LoadIndexToMemory(index_files.value());
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->Size();
// Non-owning view: a no-op deleter so the shared_ptr never frees the
// buffer, which remains owned by index_datas for this scope.
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
binary_set.Append(key, buf, size);
}
LoadWithoutAssemble(binary_set, config);
}
const TargetBitmap
@ -248,7 +355,7 @@ StringIndexMarisa::fill_str_ids(size_t n, const std::string* values) {
for (size_t i = 0; i < n; i++) {
auto str = values[i];
auto str_id = lookup(str);
assert(valid_str_id(str_id));
AssertInfo(valid_str_id(str_id), "invalid marisa key");
str_ids_[i] = str_id;
}
}

View File

@ -24,12 +24,14 @@
#include <vector>
#include <map>
#include <memory>
#include "storage/MemFileManagerImpl.h"
namespace milvus::index {
class StringIndexMarisa : public StringIndex {
public:
StringIndexMarisa() = default;
explicit StringIndexMarisa(
storage::FileManagerImplPtr file_manager = nullptr);
int64_t
Size() override;
@ -40,6 +42,9 @@ class StringIndexMarisa : public StringIndex {
void
Load(const BinarySet& set, const Config& config = {}) override;
void
Load(const Config& config = {}) override;
int64_t
Count() override {
return str_ids_.size();
@ -48,6 +53,9 @@ class StringIndexMarisa : public StringIndex {
void
Build(size_t n, const std::string* values) override;
void
Build(const Config& config = {}) override;
const TargetBitmap
In(size_t n, const std::string* values) override;
@ -69,6 +77,9 @@ class StringIndexMarisa : public StringIndex {
std::string
Reverse_Lookup(size_t offset) const override;
BinarySet
Upload(const Config& config = {}) override;
private:
void
fill_str_ids(size_t n, const std::string* values);
@ -83,19 +94,23 @@ class StringIndexMarisa : public StringIndex {
std::vector<size_t>
prefix_match(const std::string_view prefix);
void
LoadWithoutAssemble(const BinarySet& binary_set, const Config& config);
private:
Config config_;
marisa::Trie trie_;
std::vector<size_t> str_ids_; // used to retrieve.
std::map<size_t, std::vector<size_t>> str_ids_to_offsets_;
bool built_ = false;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
};
using StringIndexMarisaPtr = std::unique_ptr<StringIndexMarisa>;
inline StringIndexPtr
CreateStringIndexMarisa() {
return std::make_unique<StringIndexMarisa>();
CreateStringIndexMarisa(storage::FileManagerImplPtr file_manager = nullptr) {
return std::make_unique<StringIndexMarisa>(file_manager);
}
} // namespace milvus::index

View File

@ -25,6 +25,9 @@
#include <google/protobuf/text_format.h>
#include "exceptions/EasyAssert.h"
#include "knowhere/comp/index_param.h"
#include "common/Slice.h"
#include "storage/Util.h"
namespace milvus::index {
size_t
@ -51,14 +54,6 @@ BIN_List() {
return ret;
}
std::vector<IndexType>
DISK_LIST() {
static std::vector<IndexType> ret{
knowhere::IndexEnum::INDEX_DISKANN,
};
return ret;
}
std::vector<std::tuple<IndexType, MetricType>>
unsupported_index_combinations() {
static std::vector<std::tuple<IndexType, MetricType>> ret{
@ -78,11 +73,6 @@ is_in_nm_list(const IndexType& index_type) {
return is_in_list<IndexType>(index_type, NM_List);
}
bool
is_in_disk_list(const IndexType& index_type) {
return is_in_list<IndexType>(index_type, DISK_LIST);
}
bool
is_unsupported(const IndexType& index_type, const MetricType& metric_type) {
return is_in_list<std::tuple<IndexType, MetricType>>(
@ -197,4 +187,36 @@ ParseConfigFromIndexParams(
return config;
}
void
AssembleIndexDatas(std::map<std::string, storage::FieldDataPtr>& index_datas) {
    // Reassemble sliced index blobs in place. Small index files are stored
    // whole; only when a slice-meta entry is present is there anything to do.
    auto meta_iter = index_datas.find(INDEX_FILE_SLICE_META);
    if (meta_iter == index_datas.end()) {
        return;
    }

    auto slice_meta = meta_iter->second;
    Config meta_data = Config::parse(std::string(
        static_cast<const char*>(slice_meta->Data()), slice_meta->Size()));

    for (auto& item : meta_data[META]) {
        std::string prefix = item[NAME];
        int slice_num = item[SLICE_NUM];
        auto total_len = static_cast<size_t>(item[TOTAL_LEN]);

        // Allocate one contiguous buffer and splice every slice back into it
        // in order, dropping each slice entry once it has been consumed.
        auto assembled =
            storage::CreateFieldData(DataType::INT8, 1, total_len);
        for (int slice_idx = 0; slice_idx < slice_num; ++slice_idx) {
            auto slice_name = GenSlicedFileName(prefix, slice_idx);
            auto slice_iter = index_datas.find(slice_name);
            AssertInfo(slice_iter != index_datas.end(),
                       "lost index slice data");
            auto slice_data = slice_iter->second;
            assembled->FillFieldData(slice_data->Data(), slice_data->Size());
            index_datas.erase(slice_iter);
        }
        AssertInfo(
            assembled->IsFull(),
            "index len is inconsistent after disassemble and assemble");
        index_datas[prefix] = assembled;
    }
}
} // namespace milvus::index

View File

@ -29,6 +29,7 @@
#include "common/Types.h"
#include "index/IndexInfo.h"
#include "storage/Types.h"
#include "storage/FieldData.h"
namespace milvus::index {
@ -44,22 +45,12 @@ BIN_List();
std::vector<std::tuple<IndexType, MetricType>>
unsupported_index_combinations();
template <typename T>
inline bool
is_in_list(const T& t, std::function<std::vector<T>()> list_func) {
auto l = list_func();
return std::find(l.begin(), l.end(), t) != l.end();
}
bool
is_in_bin_list(const IndexType& index_type);
bool
is_in_nm_list(const IndexType& index_type);
bool
is_in_disk_list(const IndexType& index_type);
bool
is_unsupported(const IndexType& index_type, const MetricType& metric_type);
@ -118,4 +109,7 @@ Config
ParseConfigFromIndexParams(
const std::map<std::string, std::string>& index_params);
void
AssembleIndexDatas(std::map<std::string, storage::FieldDataPtr>& index_datas);
} // namespace milvus::index

View File

@ -20,7 +20,7 @@
#include "config/ConfigKnowhere.h"
#include "index/Meta.h"
#include "index/Utils.h"
#include "storage/LocalChunkManager.h"
#include "storage/LocalChunkManagerSingleton.h"
#include "storage/Util.h"
#include "common/Consts.h"
#include "common/RangeSearchHelper.h"
@ -42,17 +42,18 @@ VectorDiskAnnIndex<T>::VectorDiskAnnIndex(
: VectorIndex(index_type, metric_type) {
file_manager_ =
std::dynamic_pointer_cast<storage::DiskFileManagerImpl>(file_manager);
auto& local_chunk_manager = storage::LocalChunkManager::GetInstance();
auto local_chunk_manager =
storage::LocalChunkManagerSingleton::GetInstance().GetChunkManager();
auto local_index_path_prefix = file_manager_->GetLocalIndexObjectPrefix();
// As we have guarded dup-load in QueryNode,
// this assertion failed only if the Milvus rebooted in the same pod,
// need to remove these files then re-load the segment
if (local_chunk_manager.Exist(local_index_path_prefix)) {
local_chunk_manager.RemoveDir(local_index_path_prefix);
if (local_chunk_manager->Exist(local_index_path_prefix)) {
local_chunk_manager->RemoveDir(local_index_path_prefix);
}
local_chunk_manager.CreateDir(local_index_path_prefix);
local_chunk_manager->CreateDir(local_index_path_prefix);
auto diskann_index_pack =
knowhere::Pack(std::shared_ptr<knowhere::FileManager>(file_manager));
index_ = knowhere::IndexFactory::Instance().Create(GetIndexType(),
@ -63,6 +64,12 @@ template <typename T>
void
VectorDiskAnnIndex<T>::Load(const BinarySet& binary_set /* not used */,
                            const Config& config) {
    // Legacy entry point kept for interface compatibility: the binary set is
    // ignored and loading is driven entirely by the config-based overload.
    Load(config);
}
template <typename T>
void
VectorDiskAnnIndex<T>::Load(const Config& config) {
knowhere::Json load_config = update_load_json(config);
auto index_files =
@ -80,18 +87,65 @@ VectorDiskAnnIndex<T>::Load(const BinarySet& binary_set /* not used */,
SetDim(index_.Dim());
}
template <typename T>
BinarySet
VectorDiskAnnIndex<T>::Upload(const Config& config) {
    // DiskANN writes its index files to remote storage through the file
    // manager, so Upload only reports each remote path and its file size;
    // no binary payload is attached to the returned set.
    BinarySet remote_files;
    for (auto& [path, size] : file_manager_->GetRemotePathsToFileSize()) {
        remote_files.Append(path, nullptr, size);
    }
    return remote_files;
}
template <typename T>
void
VectorDiskAnnIndex<T>::Build(const Config& config) {
    // Build a DiskANN index from raw insert binlogs: the binlogs are first
    // cached to a local file, knowhere builds the index from that file, and
    // the cached raw data is removed once the build completes.
    auto local_chunk_manager =
        storage::LocalChunkManagerSingleton::GetInstance().GetChunkManager();
    knowhere::Json build_config;
    build_config.update(config);

    auto segment_id = file_manager_->GetFieldDataMeta().segment_id;
    auto insert_files =
        GetValueFromConfig<std::vector<std::string>>(config, "insert_files");
    AssertInfo(insert_files.has_value(),
               "insert file paths is empty when build disk ann index");
    auto local_data_path =
        file_manager_->CacheRawDataToDisk(insert_files.value());
    build_config[DISK_ANN_RAW_DATA_PATH] = local_data_path;

    auto local_index_path_prefix = file_manager_->GetLocalIndexObjectPrefix();
    build_config[DISK_ANN_PREFIX_PATH] = local_index_path_prefix;

    auto num_threads = GetValueFromConfig<std::string>(
        build_config, DISK_ANN_BUILD_THREAD_NUM);
    AssertInfo(num_threads.has_value(),
               "param " + std::string(DISK_ANN_BUILD_THREAD_NUM) +
                   " is empty");
    build_config[DISK_ANN_THREADS_NUM] =
        std::atoi(num_threads.value().c_str());

    build_config.erase("insert_files");
    // DiskANN reads its input from DISK_ANN_RAW_DATA_PATH, so the dataset
    // argument is unused here (same pattern as BuildWithDataset).
    knowhere::DataSet* ds_ptr = nullptr;
    // Check the build status like BuildWithDataset does; silently ignoring
    // a failed build would leave a broken index behind.
    auto stat = index_.Build(*ds_ptr, build_config);
    if (stat != knowhere::Status::success)
        PanicCodeInfo(ErrorCodeEnum::BuildIndexError,
                      "failed to build index, " + MatchKnowhereError(stat));

    local_chunk_manager->RemoveDir(
        storage::GetSegmentRawDataPathPrefix(local_chunk_manager, segment_id));
}
template <typename T>
void
VectorDiskAnnIndex<T>::BuildWithDataset(const DatasetPtr& dataset,
const Config& config) {
auto& local_chunk_manager = storage::LocalChunkManager::GetInstance();
auto local_chunk_manager =
storage::LocalChunkManagerSingleton::GetInstance().GetChunkManager();
knowhere::Json build_config;
build_config.update(config);
// set data path
auto segment_id = file_manager_->GetFileDataMeta().segment_id;
auto field_id = file_manager_->GetFileDataMeta().field_id;
auto local_data_path =
storage::GenFieldRawDataPathPrefix(segment_id, field_id) + "raw_data";
auto segment_id = file_manager_->GetFieldDataMeta().segment_id;
auto field_id = file_manager_->GetFieldDataMeta().field_id;
auto local_data_path = storage::GenFieldRawDataPathPrefix(
local_chunk_manager, segment_id, field_id) +
"raw_data";
build_config[DISK_ANN_RAW_DATA_PATH] = local_data_path;
auto local_index_path_prefix = file_manager_->GetLocalIndexObjectPrefix();
@ -103,30 +157,31 @@ VectorDiskAnnIndex<T>::BuildWithDataset(const DatasetPtr& dataset,
"param " + std::string(DISK_ANN_BUILD_THREAD_NUM) + "is empty");
build_config[DISK_ANN_THREADS_NUM] = std::atoi(num_threads.value().c_str());
if (!local_chunk_manager.Exist(local_data_path)) {
local_chunk_manager.CreateFile(local_data_path);
if (!local_chunk_manager->Exist(local_data_path)) {
local_chunk_manager->CreateFile(local_data_path);
}
int64_t offset = 0;
auto num = uint32_t(milvus::GetDatasetRows(dataset));
local_chunk_manager.Write(local_data_path, offset, &num, sizeof(num));
local_chunk_manager->Write(local_data_path, offset, &num, sizeof(num));
offset += sizeof(num);
auto dim = uint32_t(milvus::GetDatasetDim(dataset));
local_chunk_manager.Write(local_data_path, offset, &dim, sizeof(dim));
local_chunk_manager->Write(local_data_path, offset, &dim, sizeof(dim));
offset += sizeof(dim);
auto data_size = num * dim * sizeof(float);
auto raw_data = const_cast<void*>(milvus::GetDatasetTensor(dataset));
local_chunk_manager.Write(local_data_path, offset, raw_data, data_size);
local_chunk_manager->Write(local_data_path, offset, raw_data, data_size);
knowhere::DataSet* ds_ptr = nullptr;
auto stat = index_.Build(*ds_ptr, build_config);
if (stat != knowhere::Status::success)
PanicCodeInfo(ErrorCodeEnum::BuildIndexError,
"failed to build index, " + MatchKnowhereError(stat));
local_chunk_manager.RemoveDir(
storage::GetSegmentRawDataPathPrefix(segment_id));
local_chunk_manager->RemoveDir(
storage::GetSegmentRawDataPathPrefix(local_chunk_manager, segment_id));
// TODO ::
// SetDim(index_->Dim());
}
@ -263,9 +318,11 @@ VectorDiskAnnIndex<T>::GetVector(const DatasetPtr dataset) const {
template <typename T>
void
VectorDiskAnnIndex<T>::CleanLocalData() {
auto& local_chunk_manager = storage::LocalChunkManager::GetInstance();
local_chunk_manager.RemoveDir(file_manager_->GetLocalIndexObjectPrefix());
local_chunk_manager.RemoveDir(file_manager_->GetLocalRawDataObjectPrefix());
auto local_chunk_manager =
storage::LocalChunkManagerSingleton::GetInstance().GetChunkManager();
local_chunk_manager->RemoveDir(file_manager_->GetLocalIndexObjectPrefix());
local_chunk_manager->RemoveDir(
file_manager_->GetLocalRawDataObjectPrefix());
}
template <typename T>

View File

@ -33,7 +33,7 @@ class VectorDiskAnnIndex : public VectorIndex {
const MetricType& metric_type,
storage::FileManagerImplPtr file_manager);
BinarySet
Serialize(const Config& config) override {
Serialize(const Config& config) override { // deprecated
auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize();
BinarySet binary_set;
for (auto& file : remote_paths_to_size) {
@ -43,6 +43,9 @@ class VectorDiskAnnIndex : public VectorIndex {
return binary_set;
}
BinarySet
Upload(const Config& config = {}) override;
int64_t
Count() override {
return index_.Count();
@ -52,10 +55,16 @@ class VectorDiskAnnIndex : public VectorIndex {
Load(const BinarySet& binary_set /* not used */,
const Config& config = {}) override;
void
Load(const Config& config = {}) override;
void
BuildWithDataset(const DatasetPtr& dataset,
const Config& config = {}) override;
void
Build(const Config& config = {}) override;
std::unique_ptr<SearchResult>
Query(const DatasetPtr dataset,
const SearchInfo& search_info,

View File

@ -33,14 +33,32 @@
namespace milvus::index {
VectorMemIndex::VectorMemIndex(const IndexType& index_type,
const MetricType& metric_type)
const MetricType& metric_type,
storage::FileManagerImplPtr file_manager)
: VectorIndex(index_type, metric_type) {
AssertInfo(!is_unsupported(index_type, metric_type),
index_type + " doesn't support metric: " + metric_type);
if (file_manager != nullptr) {
file_manager_ = std::dynamic_pointer_cast<storage::MemFileManagerImpl>(
file_manager);
}
index_ = knowhere::IndexFactory::Instance().Create(GetIndexType());
}
BinarySet
VectorMemIndex::Upload(const Config& config) {
    // Serialize the in-memory index, push the binaries to remote storage via
    // the file manager, and return the remote paths with their file sizes
    // (no payload — callers only need the path list).
    auto serialized = Serialize(config);
    file_manager_->AddFile(serialized);

    BinarySet remote_files;
    for (auto& [path, size] : file_manager_->GetRemotePathsToFileSize()) {
        remote_files.Append(path, nullptr, size);
    }
    return remote_files;
}
BinarySet
VectorMemIndex::Serialize(const Config& config) {
knowhere::BinarySet ret;
@ -48,14 +66,14 @@ VectorMemIndex::Serialize(const Config& config) {
if (stat != knowhere::Status::success)
PanicCodeInfo(ErrorCodeEnum::UnexpectedError,
"failed to serialize index, " + MatchKnowhereError(stat));
milvus::Disassemble(ret);
Disassemble(ret);
return ret;
}
void
VectorMemIndex::Load(const BinarySet& binary_set, const Config& config) {
milvus::Assemble(const_cast<BinarySet&>(binary_set));
VectorMemIndex::LoadWithoutAssemble(const BinarySet& binary_set,
const Config& config) {
auto stat = index_.Deserialize(binary_set);
if (stat != knowhere::Status::success)
PanicCodeInfo(
@ -64,6 +82,31 @@ VectorMemIndex::Load(const BinarySet& binary_set, const Config& config) {
SetDim(index_.Dim());
}
void
VectorMemIndex::Load(const BinarySet& binary_set, const Config& config) {
milvus::Assemble(const_cast<BinarySet&>(binary_set));
LoadWithoutAssemble(binary_set, config);
}
void
VectorMemIndex::Load(const Config& config) {
    // Remote-storage load path: fetch every index file listed in the config
    // into memory, reassemble sliced blobs, then hand the result to the
    // knowhere deserializer via LoadWithoutAssemble.
    auto index_files =
        GetValueFromConfig<std::vector<std::string>>(config, "index_files");
    AssertInfo(index_files.has_value(),
               "index file paths is empty when load index");

    auto index_datas = file_manager_->LoadIndexToMemory(index_files.value());
    AssembleIndexDatas(index_datas);

    BinarySet binary_set;
    for (auto& [key, data] : index_datas) {
        // Wrap the field-data buffer without taking ownership: a no-op
        // deleter prevents a double free, and `data` (still held by
        // index_datas) keeps the memory alive for the duration of the load.
        auto no_op_deleter = [](uint8_t*) {};
        auto buf = std::shared_ptr<uint8_t[]>(
            (uint8_t*)const_cast<void*>(data->Data()), no_op_deleter);
        binary_set.Append(key, buf, data->Size());
    }
    LoadWithoutAssemble(binary_set, config);
}
void
VectorMemIndex::BuildWithDataset(const DatasetPtr& dataset,
const Config& config) {
@ -81,6 +124,43 @@ VectorMemIndex::BuildWithDataset(const DatasetPtr& dataset,
SetDim(index_.Dim());
}
void
VectorMemIndex::Build(const Config& config) {
    // Build an in-memory index from raw insert binlogs: download the field
    // data, concatenate it into one contiguous buffer, then delegate to
    // BuildWithDataset.
    auto insert_files =
        GetValueFromConfig<std::vector<std::string>>(config, "insert_files");
    AssertInfo(insert_files.has_value(),
               "insert file paths is empty when build index");
    auto field_datas =
        file_manager_->CacheRawDataToMemory(insert_files.value());

    // First pass: total byte size / row count, and validate that every chunk
    // reports the same vector dimension. Iterate by const reference to avoid
    // a shared_ptr refcount bump per chunk.
    int64_t total_size = 0;
    int64_t total_num_rows = 0;
    int64_t dim = 0;
    for (const auto& data : field_datas) {
        total_size += data->Size();
        total_num_rows += data->get_num_rows();
        AssertInfo(dim == 0 || dim == data->get_dim(),
                   "inconsistent dim value between field datas!");
        dim = data->get_dim();
    }

    // Second pass: copy the chunks into one contiguous buffer. Iterate by
    // reference so reset() releases the actual element (resetting a loop
    // copy would be a no-op), bounding peak memory usage during the copy.
    auto buf = std::shared_ptr<uint8_t[]>(new uint8_t[total_size]);
    int64_t offset = 0;
    for (auto& data : field_datas) {
        std::memcpy(buf.get() + offset, data->Data(), data->Size());
        offset += data->Size();
        data.reset();
    }
    field_datas.clear();

    Config build_config;
    build_config.update(config);
    build_config.erase("insert_files");

    auto dataset = GenDataset(total_num_rows, dim, buf.get());
    BuildWithDataset(dataset, build_config);
}
void
VectorMemIndex::AddWithDataset(const DatasetPtr& dataset,
const Config& config) {

View File

@ -23,13 +23,15 @@
#include <boost/dynamic_bitset.hpp>
#include "knowhere/factory.h"
#include "index/VectorIndex.h"
#include "storage/MemFileManagerImpl.h"
namespace milvus::index {
class VectorMemIndex : public VectorIndex {
public:
explicit VectorMemIndex(const IndexType& index_type,
const MetricType& metric_type);
const MetricType& metric_type,
storage::FileManagerImplPtr file_manager = nullptr);
BinarySet
Serialize(const Config& config) override;
@ -37,10 +39,16 @@ class VectorMemIndex : public VectorIndex {
void
Load(const BinarySet& binary_set, const Config& config = {}) override;
void
Load(const Config& config = {}) override;
void
BuildWithDataset(const DatasetPtr& dataset,
const Config& config = {}) override;
void
Build(const Config& config = {}) override;
void
AddWithDataset(const DatasetPtr& dataset, const Config& config) override;
@ -60,9 +68,17 @@ class VectorMemIndex : public VectorIndex {
const std::vector<uint8_t>
GetVector(const DatasetPtr dataset) const override;
BinarySet
Upload(const Config& config = {}) override;
protected:
virtual void
LoadWithoutAssemble(const BinarySet& binary_set, const Config& config);
protected:
Config config_;
knowhere::Index<knowhere::IndexNode> index_;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
};
using VectorMemIndexPtr = std::unique_ptr<VectorMemIndex>;

View File

@ -38,7 +38,7 @@ VectorMemNMIndex::Serialize(const Config& config) {
auto raw_data = std::shared_ptr<uint8_t[]>(
static_cast<uint8_t*>(raw_data_.data()), deleter);
ret.Append(RAW_DATA, raw_data, raw_data_.size());
milvus::Disassemble(ret);
Disassemble(ret);
return ret;
}
@ -52,6 +52,17 @@ VectorMemNMIndex::BuildWithDataset(const DatasetPtr& dataset,
rc.ElapseFromBegin("Done");
}
void
VectorMemNMIndex::LoadWithoutAssemble(const BinarySet& binary_set,
                                      const Config& config) {
    // Deserialize via the base class first, then record (once) that the raw
    // data blob arrived with this binary set.
    VectorMemIndex::LoadWithoutAssemble(binary_set, config);
    if (!binary_set.Contains(RAW_DATA)) {
        return;
    }
    std::call_once(raw_data_loaded_, [&]() {
        LOG_SEGCORE_INFO_ << "NM index load raw data done!";
    });
}
void
VectorMemNMIndex::AddWithDataset(const DatasetPtr& /*dataset*/,
const Config& /*config*/) {

View File

@ -28,9 +28,11 @@ namespace milvus::index {
class VectorMemNMIndex : public VectorMemIndex {
public:
explicit VectorMemNMIndex(const IndexType& index_type,
const MetricType& metric_type)
: VectorMemIndex(index_type, metric_type) {
explicit VectorMemNMIndex(
const IndexType& index_type,
const MetricType& metric_type,
storage::FileManagerImplPtr file_manager = nullptr)
: VectorMemIndex(index_type, metric_type, file_manager) {
AssertInfo(is_in_nm_list(index_type), "not valid nm index type");
}
@ -52,6 +54,10 @@ class VectorMemNMIndex : public VectorMemIndex {
const SearchInfo& search_info,
const BitsetView& bitset) override;
void
LoadWithoutAssemble(const BinarySet& binary_set,
const Config& config) override;
private:
void
store_raw_data(const DatasetPtr& dataset);

View File

@ -13,6 +13,7 @@
#include <memory>
#include "common/Types.h"
#include "storage/FileManager.h"
namespace milvus::indexbuilder {
class IndexCreatorBase {
@ -22,12 +23,18 @@ class IndexCreatorBase {
virtual void
Build(const milvus::DatasetPtr& dataset) = 0;
virtual void
Build() = 0;
virtual milvus::BinarySet
Serialize() = 0;
// used for test.
virtual void
Load(const milvus::BinarySet&) = 0;
virtual BinarySet
Upload() = 0;
};
using IndexCreatorBasePtr = std::unique_ptr<IndexCreatorBase>;

View File

@ -13,13 +13,15 @@
#include <pb/schema.pb.h>
#include <cmath>
#include <memory>
#include <string>
#include "indexbuilder/IndexCreatorBase.h"
#include "indexbuilder/ScalarIndexCreator.h"
#include "indexbuilder/VecIndexCreator.h"
#include "indexbuilder/type_c.h"
#include "storage/Types.h"
#include <memory>
#include <string>
#include "storage/FileManager.h"
namespace milvus::indexbuilder {
@ -40,15 +42,13 @@ class IndexFactory {
}
IndexCreatorBasePtr
CreateIndex(CDataType dtype,
const char* type_params,
const char* index_params,
const storage::StorageConfig& storage_config) {
auto real_dtype = DataType(dtype);
auto invalid_dtype_msg = std::string("invalid data type: ") +
std::to_string(int(real_dtype));
CreateIndex(DataType type,
Config& config,
storage::FileManagerImplPtr file_manager) {
auto invalid_dtype_msg =
std::string("invalid data type: ") + std::to_string(int(type));
switch (real_dtype) {
switch (type) {
case DataType::BOOL:
case DataType::INT8:
case DataType::INT16:
@ -58,12 +58,12 @@ class IndexFactory {
case DataType::DOUBLE:
case DataType::VARCHAR:
case DataType::STRING:
return CreateScalarIndex(real_dtype, type_params, index_params);
return CreateScalarIndex(type, config, file_manager);
case DataType::VECTOR_FLOAT:
case DataType::VECTOR_BINARY:
return std::make_unique<VecIndexCreator>(
real_dtype, type_params, index_params, storage_config);
type, config, file_manager);
default:
throw std::invalid_argument(invalid_dtype_msg);
}

View File

@ -21,30 +21,14 @@
namespace milvus::indexbuilder {
ScalarIndexCreator::ScalarIndexCreator(DataType dtype,
const char* type_params,
const char* index_params)
: dtype_(dtype) {
// TODO: move parse-related logic to a common interface.
proto::indexcgo::TypeParams type_params_;
proto::indexcgo::IndexParams index_params_;
milvus::index::ParseFromString(type_params_, std::string(type_params));
milvus::index::ParseFromString(index_params_, std::string(index_params));
for (auto i = 0; i < type_params_.params_size(); ++i) {
const auto& param = type_params_.params(i);
config_[param.key()] = param.value();
}
for (auto i = 0; i < index_params_.params_size(); ++i) {
const auto& param = index_params_.params(i);
config_[param.key()] = param.value();
}
Config& config,
storage::FileManagerImplPtr file_manager)
: dtype_(dtype), config_(config) {
milvus::index::CreateIndexInfo index_info;
index_info.field_type = dtype_;
index_info.index_type = index_type();
index_ =
index::IndexFactory::GetInstance().CreateIndex(index_info, nullptr);
index_ = index::IndexFactory::GetInstance().CreateIndex(index_info,
file_manager);
}
void
@ -54,6 +38,11 @@ ScalarIndexCreator::Build(const milvus::DatasetPtr& dataset) {
index_->BuildWithRawData(size, data);
}
void
ScalarIndexCreator::Build() {
    // Build the scalar index from the config captured at construction time
    // (e.g. the "insert_files" list used to fetch raw data from storage).
    index_->Build(config_);
}
milvus::BinarySet
ScalarIndexCreator::Serialize() {
return index_->Serialize(config_);
@ -70,4 +59,9 @@ ScalarIndexCreator::index_type() {
return "sort";
}
BinarySet
ScalarIndexCreator::Upload() {
    // Delegate to the underlying index's Upload, which writes the serialized
    // index to remote storage and returns the remote file list (paths and
    // sizes) rather than the raw binaries.
    return index_->Upload();
}
} // namespace milvus::indexbuilder

View File

@ -23,18 +23,24 @@ namespace milvus::indexbuilder {
class ScalarIndexCreator : public IndexCreatorBase {
public:
ScalarIndexCreator(DataType data_type,
const char* type_params,
const char* index_params);
Config& config,
storage::FileManagerImplPtr file_manager);
void
Build(const milvus::DatasetPtr& dataset) override;
void
Build() override;
milvus::BinarySet
Serialize() override;
void
Load(const milvus::BinarySet&) override;
BinarySet
Upload() override;
private:
std::string
index_type();
@ -49,10 +55,9 @@ using ScalarIndexCreatorPtr = std::unique_ptr<ScalarIndexCreator>;
inline ScalarIndexCreatorPtr
CreateScalarIndex(DataType dtype,
const char* type_params,
const char* index_params) {
return std::make_unique<ScalarIndexCreator>(
dtype, type_params, index_params);
Config& config,
storage::FileManagerImplPtr file_manager) {
return std::make_unique<ScalarIndexCreator>(dtype, config, file_manager);
}
} // namespace milvus::indexbuilder

View File

@ -17,50 +17,17 @@
#include "index/IndexFactory.h"
#include "pb/index_cgo_msg.pb.h"
#ifdef BUILD_DISK_ANN
#include "storage/DiskFileManagerImpl.h"
#endif
namespace milvus::indexbuilder {
VecIndexCreator::VecIndexCreator(DataType data_type,
const char* serialized_type_params,
const char* serialized_index_params,
const storage::StorageConfig& storage_config)
: data_type_(data_type) {
proto::indexcgo::TypeParams type_params_;
proto::indexcgo::IndexParams index_params_;
milvus::index::ParseFromString(type_params_,
std::string(serialized_type_params));
milvus::index::ParseFromString(index_params_,
std::string(serialized_index_params));
for (auto i = 0; i < type_params_.params_size(); ++i) {
const auto& param = type_params_.params(i);
config_[param.key()] = param.value();
}
for (auto i = 0; i < index_params_.params_size(); ++i) {
const auto& param = index_params_.params(i);
config_[param.key()] = param.value();
}
Config& config,
storage::FileManagerImplPtr file_manager)
: data_type_(data_type), config_(config) {
index::CreateIndexInfo index_info;
index_info.field_type = data_type_;
index_info.index_type = index::GetIndexTypeFromConfig(config_);
index_info.metric_type = index::GetMetricTypeFromConfig(config_);
std::shared_ptr<storage::FileManagerImpl> file_manager = nullptr;
#ifdef BUILD_DISK_ANN
if (index::is_in_disk_list(index_info.index_type)) {
// For now, only support diskann index
file_manager = std::make_shared<storage::DiskFileManagerImpl>(
index::GetFieldDataMetaFromConfig(config_),
index::GetIndexMetaFromConfig(config_),
storage_config);
}
#endif
index_ = index::IndexFactory::GetInstance().CreateIndex(index_info,
file_manager);
AssertInfo(index_ != nullptr,
@ -77,6 +44,11 @@ VecIndexCreator::Build(const milvus::DatasetPtr& dataset) {
index_->BuildWithDataset(dataset, config_);
}
void
VecIndexCreator::Build() {
    // Build the vector index from the config captured at construction time
    // (index/type params plus, e.g., the "insert_files" list).
    index_->Build(config_);
}
milvus::BinarySet
VecIndexCreator::Serialize() {
return index_->Serialize(config_);
@ -95,6 +67,11 @@ VecIndexCreator::Query(const milvus::DatasetPtr& dataset,
return vector_index->Query(dataset, search_info, bitset);
}
BinarySet
VecIndexCreator::Upload() {
    // Delegate to the underlying index's Upload, which writes the serialized
    // index to remote storage and returns the remote file list (paths and
    // sizes) rather than the raw binaries.
    return index_->Upload();
}
void
VecIndexCreator::CleanLocalData() {
auto vector_index = dynamic_cast<index::VectorIndex*>(index_.get());

View File

@ -27,13 +27,15 @@ namespace milvus::indexbuilder {
class VecIndexCreator : public IndexCreatorBase {
public:
explicit VecIndexCreator(DataType data_type,
const char* serialized_type_params,
const char* serialized_index_params,
const storage::StorageConfig& storage_config);
Config& config,
storage::FileManagerImplPtr file_manager);
void
Build(const milvus::DatasetPtr& dataset) override;
void
Build() override;
milvus::BinarySet
Serialize() override;
@ -48,6 +50,9 @@ class VecIndexCreator : public IndexCreatorBase {
const SearchInfo& search_info,
const BitsetView& bitset);
BinarySet
Upload() override;
public:
void
CleanLocalData();

View File

@ -21,41 +21,40 @@
#include "indexbuilder/IndexFactory.h"
#include "common/type_c.h"
#include "storage/Types.h"
#include "indexbuilder/types.h"
#include "index/Utils.h"
#include "pb/index_cgo_msg.pb.h"
#include "storage/Util.h"
CStatus
CreateIndex(enum CDataType dtype,
const char* serialized_type_params,
const char* serialized_index_params,
CIndex* res_index,
CStorageConfig c_storage_config) {
CIndex* res_index) {
auto status = CStatus();
try {
AssertInfo(res_index, "failed to create index, passed index was null");
std::string address(c_storage_config.address);
std::string bucket_name(c_storage_config.bucket_name);
std::string access_key(c_storage_config.access_key_id);
std::string access_value(c_storage_config.access_key_value);
std::string remote_root_path(c_storage_config.remote_root_path);
std::string storage_type(c_storage_config.storage_type);
std::string iam_endpoint(c_storage_config.iam_endpoint);
auto storage_config =
milvus::storage::StorageConfig{address,
bucket_name,
access_key,
access_value,
remote_root_path,
storage_type,
iam_endpoint,
c_storage_config.useSSL,
c_storage_config.useIAM};
milvus::proto::indexcgo::TypeParams type_params;
milvus::proto::indexcgo::IndexParams index_params;
milvus::index::ParseFromString(type_params, serialized_type_params);
milvus::index::ParseFromString(index_params, serialized_index_params);
milvus::Config config;
for (auto i = 0; i < type_params.params_size(); ++i) {
const auto& param = type_params.params(i);
config[param.key()] = param.value();
}
for (auto i = 0; i < index_params.params_size(); ++i) {
const auto& param = index_params.params(i);
config[param.key()] = param.value();
}
auto& index_factory = milvus::indexbuilder::IndexFactory::GetInstance();
auto index =
milvus::indexbuilder::IndexFactory::GetInstance().CreateIndex(
dtype,
serialized_type_params,
serialized_index_params,
storage_config);
index_factory.CreateIndex(milvus::DataType(dtype), config, nullptr);
*res_index = index.release();
status.error_code = Success;
status.error_msg = "";
@ -66,6 +65,65 @@ CreateIndex(enum CDataType dtype,
return status;
}
CStatus
CreateIndexV2(CIndex* res_index, CBuildIndexInfo c_build_index_info) {
    // C entry point for the binlog-driven build path: constructs the index
    // creator from a BuildIndexInfo handle, runs the build (download raw
    // data, build, per-creator Upload happens later), and hands the creator
    // back through res_index. Exceptions are converted into CStatus so they
    // never cross the C ABI boundary.
    try {
        auto build_index_info = (BuildIndexInfo*)c_build_index_info;
        auto field_type = build_index_info->field_type;

        milvus::index::CreateIndexInfo index_info;
        index_info.field_type = build_index_info->field_type;

        // The insert binlog paths ride along inside the build config so the
        // concrete index's Build(config) can fetch its own raw data.
        auto& config = build_index_info->config;
        config["insert_files"] = build_index_info->insert_files;

        // get index type
        auto index_type = milvus::index::GetValueFromConfig<std::string>(
            config, "index_type");
        AssertInfo(index_type.has_value(), "index type is empty");
        index_info.index_type = index_type.value();

        // get metric type (only meaningful for vector fields)
        if (milvus::datatype_is_vector(field_type)) {
            auto metric_type = milvus::index::GetValueFromConfig<std::string>(
                config, "metric_type");
            AssertInfo(metric_type.has_value(), "metric type is empty");
            index_info.metric_type = metric_type.value();
        }

        // init file manager: binds the index build to its segment/field and
        // to the remote storage described by storage_config.
        milvus::storage::FieldDataMeta field_meta{
            build_index_info->collection_id,
            build_index_info->partition_id,
            build_index_info->segment_id,
            build_index_info->field_id};

        milvus::storage::IndexMeta index_meta{build_index_info->segment_id,
                                              build_index_info->field_id,
                                              build_index_info->index_build_id,
                                              build_index_info->index_version};
        auto chunk_manager = milvus::storage::CreateChunkManager(
            build_index_info->storage_config);

        auto file_manager = milvus::storage::CreateFileManager(
            index_info.index_type, field_meta, index_meta, chunk_manager);
        AssertInfo(file_manager != nullptr, "create file manager failed!");

        auto index =
            milvus::indexbuilder::IndexFactory::GetInstance().CreateIndex(
                build_index_info->field_type, config, file_manager);
        index->Build();
        // Caller takes ownership of the creator; freed via DeleteIndex.
        *res_index = index.release();
        auto status = CStatus();
        status.error_code = Success;
        status.error_msg = "";
        return status;
    } catch (std::exception& e) {
        auto status = CStatus();
        status.error_code = UnexpectedError;
        // strdup: the message must outlive this frame; Go side frees it.
        // NOTE(review): presumably the caller frees error_msg — confirm.
        status.error_msg = strdup(e.what());
        return status;
    }
}
CStatus
DeleteIndex(CIndex index) {
auto status = CStatus();
@ -219,3 +277,187 @@ CleanLocalData(CIndex index) {
}
return status;
}
CStatus
NewBuildIndexInfo(CBuildIndexInfo* c_build_index_info,
                  CStorageConfig c_storage_config) {
    // Allocate a BuildIndexInfo and copy the C storage config into it. The
    // caller owns the returned handle and must release it with
    // DeleteBuildIndexInfo.
    try {
        auto info = std::make_unique<BuildIndexInfo>();
        auto& conf = info->storage_config;
        conf.address = std::string(c_storage_config.address);
        conf.bucket_name = std::string(c_storage_config.bucket_name);
        conf.access_key_id = std::string(c_storage_config.access_key_id);
        conf.access_key_value =
            std::string(c_storage_config.access_key_value);
        conf.root_path = std::string(c_storage_config.root_path);
        conf.storage_type = std::string(c_storage_config.storage_type);
        conf.iam_endpoint = std::string(c_storage_config.iam_endpoint);
        conf.useSSL = c_storage_config.useSSL;
        conf.useIAM = c_storage_config.useIAM;

        *c_build_index_info = info.release();

        auto status = CStatus();
        status.error_code = Success;
        status.error_msg = "";
        return status;
    } catch (std::exception& e) {
        auto status = CStatus();
        status.error_code = UnexpectedError;
        status.error_msg = strdup(e.what());
        return status;
    }
}
void
DeleteBuildIndexInfo(CBuildIndexInfo c_build_index_info) {
    // Reclaim the BuildIndexInfo allocated by NewBuildIndexInfo.
    auto info = (BuildIndexInfo*)c_build_index_info;
    delete info;
}
CStatus
AppendBuildIndexParam(CBuildIndexInfo c_build_index_info,
                      const uint8_t* serialized_index_params,
                      const uint64_t len) {
    // Deserialize the protobuf-encoded IndexParams and merge every key/value
    // pair into the build config on the BuildIndexInfo handle.
    try {
        auto build_index_info = (BuildIndexInfo*)c_build_index_info;
        milvus::proto::indexcgo::IndexParams index_params;
        auto parsed =
            index_params.ParseFromArray(serialized_index_params, len);
        AssertInfo(parsed, "Unmarshall index params failed");
        for (const auto& param : index_params.params()) {
            build_index_info->config[param.key()] = param.value();
        }

        auto status = CStatus();
        status.error_code = Success;
        status.error_msg = "";
        return status;
    } catch (std::exception& e) {
        auto status = CStatus();
        status.error_code = UnexpectedError;
        status.error_msg = strdup(e.what());
        return status;
    }
}
CStatus
AppendBuildTypeParam(CBuildIndexInfo c_build_index_info,
                     const uint8_t* serialized_type_params,
                     const uint64_t len) {
    // Deserialize the protobuf-encoded TypeParams and merge every key/value
    // pair into the build config on the BuildIndexInfo handle.
    try {
        auto build_index_info = (BuildIndexInfo*)c_build_index_info;
        milvus::proto::indexcgo::TypeParams type_params;
        auto parsed = type_params.ParseFromArray(serialized_type_params, len);
        AssertInfo(parsed, "Unmarshall index build type params failed");
        for (const auto& param : type_params.params()) {
            build_index_info->config[param.key()] = param.value();
        }

        auto status = CStatus();
        status.error_code = Success;
        status.error_msg = "";
        return status;
    } catch (std::exception& e) {
        auto status = CStatus();
        status.error_code = UnexpectedError;
        status.error_msg = strdup(e.what());
        return status;
    }
}
// Records the target field's identity (collection/partition/segment/field)
// and data type on the build-index descriptor.
//
// Returns Success on completion; UnexpectedError with a strdup'ed message
// if anything throws.
CStatus
AppendFieldMetaInfo(CBuildIndexInfo c_build_index_info,
                    int64_t collection_id,
                    int64_t partition_id,
                    int64_t segment_id,
                    int64_t field_id,
                    enum CDataType field_type) {
    CStatus status;
    status.error_code = Success;
    status.error_msg = "";
    try {
        auto info = static_cast<BuildIndexInfo*>(c_build_index_info);
        info->collection_id = collection_id;
        info->partition_id = partition_id;
        info->segment_id = segment_id;
        info->field_id = field_id;
        // translate the C enum into the C++ DataType
        info->field_type = milvus::DataType(field_type);
    } catch (std::exception& e) {
        status.error_code = UnexpectedError;
        status.error_msg = strdup(e.what());
    }
    return status;
}
// Records the index identity (index id, build id, version) on the
// build-index descriptor.
//
// Returns Success on completion; UnexpectedError with a strdup'ed message
// if anything throws.
CStatus
AppendIndexMetaInfo(CBuildIndexInfo c_build_index_info,
                    int64_t index_id,
                    int64_t build_id,
                    int64_t version) {
    CStatus status;
    status.error_code = Success;
    status.error_msg = "";
    try {
        auto info = static_cast<BuildIndexInfo*>(c_build_index_info);
        info->index_id = index_id;
        info->index_build_id = build_id;
        info->index_version = version;
    } catch (std::exception& e) {
        status.error_code = UnexpectedError;
        status.error_msg = strdup(e.what());
    }
    return status;
}
// Appends one binlog (insert) file path to the descriptor's list of
// files to load when building the index.
//
// Returns Success on completion; UnexpectedError with a strdup'ed message
// if anything throws.
CStatus
AppendInsertFilePath(CBuildIndexInfo c_build_index_info,
                     const char* c_file_path) {
    CStatus status;
    status.error_code = Success;
    status.error_msg = "";
    try {
        auto info = static_cast<BuildIndexInfo*>(c_build_index_info);
        // construct the std::string in place from the C string
        info->insert_files.emplace_back(c_file_path);
    } catch (std::exception& e) {
        status.error_code = UnexpectedError;
        status.error_msg = strdup(e.what());
    }
    return status;
}
// Serializes the index held by `index`, uploads it via the creator's
// Upload(), and hands ownership of the resulting BinarySet to the caller
// through `c_binary_set`.
//
// Returns Success on completion; UnexpectedError with a strdup'ed message
// if the index handle is null or upload fails.
CStatus
SerializeIndexAndUpLoad(CIndex index, CBinarySet* c_binary_set) {
    CStatus status;
    status.error_code = Success;
    status.error_msg = "";
    try {
        AssertInfo(
            index,
            "failed to serialize index to binary set, passed index was null");
        auto creator =
            reinterpret_cast<milvus::indexbuilder::IndexCreatorBase*>(index);
        // keep the BinarySet in a unique_ptr until the hand-off so it is
        // released if anything below throws
        auto binary_set =
            std::make_unique<knowhere::BinarySet>(creator->Upload());
        *c_binary_set = binary_set.release();
    } catch (std::exception& e) {
        status.error_code = UnexpectedError;
        status.error_msg = strdup(e.what());
    }
    return status;
}

View File

@ -24,8 +24,7 @@ CStatus
CreateIndex(enum CDataType dtype,
const char* serialized_type_params,
const char* serialized_index_params,
CIndex* res_index,
CStorageConfig storage_config);
CIndex* res_index);
CStatus
DeleteIndex(CIndex index);
@ -53,6 +52,46 @@ LoadIndexFromBinarySet(CIndex index, CBinarySet c_binary_set);
CStatus
CleanLocalData(CIndex index);
CStatus
NewBuildIndexInfo(CBuildIndexInfo* c_build_index_info,
CStorageConfig c_storage_config);
void
DeleteBuildIndexInfo(CBuildIndexInfo c_build_index_info);
CStatus
AppendBuildIndexParam(CBuildIndexInfo c_build_index_info,
const uint8_t* serialized_type_params,
const uint64_t len);
CStatus
AppendBuildTypeParam(CBuildIndexInfo c_build_index_info,
const uint8_t* serialized_type_params,
const uint64_t len);
CStatus
AppendFieldMetaInfo(CBuildIndexInfo c_build_index_info,
int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id,
enum CDataType field_type);
CStatus
AppendIndexMetaInfo(CBuildIndexInfo c_build_index_info,
int64_t index_id,
int64_t build_id,
int64_t version);
CStatus
AppendInsertFilePath(CBuildIndexInfo c_build_index_info, const char* file_path);
CStatus
CreateIndexV2(CIndex* res_index, CBuildIndexInfo c_build_index_info);
CStatus
SerializeIndexAndUpLoad(CIndex index, CBinarySet* c_binary_set);
#ifdef __cplusplus
};
#endif

View File

@ -15,3 +15,4 @@
typedef void* CIndex;
typedef void* CIndexQueryResult;
typedef void* CBuildIndexInfo;

View File

@ -14,35 +14,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <string>
#include <vector>
#include "common/Types.h"
#include "index/Index.h"
#include "storage/Types.h"
#include "storage/FieldData.h"
namespace milvus::storage {
class FieldDataFactory {
private:
FieldDataFactory() = default;
FieldDataFactory(const FieldDataFactory&) = delete;
FieldDataFactory
operator=(const FieldDataFactory&) = delete;
public:
static FieldDataFactory&
GetInstance() {
static FieldDataFactory inst;
return inst;
}
std::string
GetName() const {
return "FieldDataFactory";
}
FieldDataPtr
CreateFieldData(const DataType& type, const int64_t dim = 1);
// Aggregates everything the C index-build API collects before kicking off an
// index build: which field of which segment to index, the index identity, the
// binlog files to read, and storage / extra-config parameters. Populated
// incrementally by the Append*() C functions.
struct BuildIndexInfo {
    int64_t collection_id;
    int64_t partition_id;
    int64_t segment_id;
    int64_t field_id;
    // data type of the field being indexed
    milvus::DataType field_type;

    int64_t index_id;
    int64_t index_build_id;
    int64_t index_version;

    // remote paths of the insert binlog files that hold the raw field data
    std::vector<std::string> insert_files;
    // object-storage (e.g. s3/minio) connection settings
    milvus::storage::StorageConfig storage_config;
    // free-form key/value parameters merged from index and type params
    milvus::Config config;
};
} // namespace milvus::storage

View File

@ -1,14 +1,18 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <sys/mman.h>
@ -21,17 +25,9 @@
#include <string>
#include <utility>
#include "common/FieldMeta.h"
#include "common/LoadInfo.h"
#include "common/Span.h"
#include "common/Types.h"
#include "common/Utils.h"
#include "exceptions/EasyAssert.h"
#include "fmt/core.h"
#include "log/Log.h"
#include "nlohmann/json.hpp"
#include "mmap/Utils.h"
namespace milvus::segcore {
namespace milvus {
struct Entry {
char* data;
@ -79,7 +75,7 @@ class Column : public ColumnBase {
public:
Column(int64_t segment_id,
const FieldMeta& field_meta,
const LoadFieldDataInfo& info) {
const FieldDataInfo& info) {
data_ = static_cast<char*>(CreateMap(segment_id, field_meta, info));
size_ = field_meta.get_sizeof() * info.row_count;
row_count_ = info.row_count;
@ -109,20 +105,13 @@ class VariableColumn : public ColumnBase {
VariableColumn(int64_t segment_id,
const FieldMeta& field_meta,
const LoadFieldDataInfo& info) {
auto begin = FIELD_DATA(info.field_data, string).begin();
auto end = FIELD_DATA(info.field_data, string).end();
if constexpr (std::is_same_v<T, Json>) {
begin = FIELD_DATA(info.field_data, json).begin();
end = FIELD_DATA(info.field_data, json).end();
}
size_ = 0;
const FieldDataInfo& info) {
indices_.reserve(info.row_count);
while (begin != end) {
indices_.push_back(size_);
size_ += begin->length();
begin++;
for (auto data : info.datas) {
for (ssize_t idx = 0; idx < data->get_num_rows(); ++idx) {
indices_.emplace_back(size_);
size_ += data->Size(idx);
}
}
data_ = static_cast<char*>(CreateMap(segment_id, field_meta, info));
@ -177,4 +166,4 @@ class VariableColumn : public ColumnBase {
// Compatible with current Span type
std::vector<ViewType> views_{};
};
} // namespace milvus::segcore
} // namespace milvus

View File

@ -13,17 +13,19 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unistd.h>
#include <string>
#include <vector>
#include "storage/FieldData.h"
namespace milvus::ChunkMangerConfig {
namespace milvus {
void
SetLocalRootPath(const std::string_view path_prefix);
std::string
GetLocalRootPath();
} // namespace milvus::ChunkMangerConfig
// In-memory description of one field's loaded data: the raw column blocks
// plus the metadata needed to materialize them (row count, optional mmap
// directory). When mmap_dir_path is empty the data stays in memory;
// otherwise loaders may spill it to a file under that directory and map it.
struct FieldDataInfo {
    int64_t field_id;
    int64_t row_count;
    // column data split into one or more blocks (e.g. one per binlog file)
    std::vector<storage::FieldDataPtr> datas;
    // empty => keep in memory; non-empty => directory for mmap backing files
    std::string mmap_dir_path;
};
} // namespace milvus

View File

@ -0,0 +1,232 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstring>
#include <filesystem>
#include <memory>
#include <string>
#include <vector>
#include "common/FieldMeta.h"
#include "mmap/Types.h"
#include "storage/Util.h"
namespace milvus {
// Returns the total byte size of a batch of field-data blocks.
inline size_t
GetDataSize(const std::vector<storage::FieldDataPtr>& datas) {
    size_t total_size{0};
    // iterate by const reference: FieldDataPtr is a shared pointer, and the
    // original's by-value loop paid an atomic ref-count inc/dec per element
    for (const auto& data : datas) {
        total_size += data->Size();
    }
    return total_size;
}
// Copies one field-data block into the buffer at `dst` and returns the
// first byte past what was written. Fixed-width types are copied in one
// memcpy; variable-length types (string/varchar/json) are appended row by
// row with no length prefixes. Panics on unsupported variable types.
inline void*
FillField(DataType data_type, const storage::FieldDataPtr data, void* dst) {
    auto* cursor = reinterpret_cast<char*>(dst);

    if (!datatype_is_variable(data_type)) {
        // fixed-width rows are contiguous: a single bulk copy suffices
        memcpy(cursor, data->Data(), data->Size());
        return cursor + data->Size();
    }

    switch (data_type) {
        case DataType::STRING:
        case DataType::VARCHAR: {
            for (ssize_t row = 0; row < data->get_num_rows(); ++row) {
                auto str = static_cast<const std::string*>(data->RawValue(row));
                memcpy(cursor, str->data(), str->size());
                cursor += str->size();
            }
            break;
        }
        case DataType::JSON: {
            // JSON rows are stored as simdjson padded strings
            for (ssize_t row = 0; row < data->get_num_rows(); ++row) {
                auto padded_string =
                    static_cast<const Json*>(data->RawValue(row))->data();
                memcpy(cursor, padded_string.data(), padded_string.size());
                cursor += padded_string.size();
            }
            break;
        }
        default:
            PanicInfo(fmt::format("not supported data type {}",
                                  datatype_name(data_type)));
    }
    return cursor;
}
// Writes the rows of `data` to file descriptor `fd` and returns the number
// of bytes successfully written. Variable-length types go out one write()
// per row; on a short or failed write the remaining rows of this block are
// skipped (the caller detects the mismatch against data->Size()).
inline ssize_t
WriteFieldData(int fd, DataType data_type, const storage::FieldDataPtr data) {
    ssize_t total_written{0};
    if (datatype_is_variable(data_type)) {
        switch (data_type) {
            case DataType::VARCHAR:
            case DataType::STRING: {
                for (ssize_t i = 0; i < data->get_num_rows(); ++i) {
                    auto str =
                        static_cast<const std::string*>(data->RawValue(i));
                    ssize_t written = write(fd, str->data(), str->size());
                    // Compare in signed space. The original compared the
                    // ssize_t result against size_t, so a write() error (-1)
                    // converted to a huge unsigned value, slipped past the
                    // check, and -1 was added to the running total.
                    if (written < static_cast<ssize_t>(str->size())) {
                        break;
                    }
                    total_written += written;
                }
                break;
            }
            case DataType::JSON: {
                // JSON rows are stored as simdjson padded strings
                for (ssize_t i = 0; i < data->get_num_rows(); ++i) {
                    auto padded_string =
                        static_cast<const Json*>(data->RawValue(i))->data();
                    ssize_t written =
                        write(fd, padded_string.data(), padded_string.size());
                    // same signed-space comparison as above
                    if (written <
                        static_cast<ssize_t>(padded_string.size())) {
                        break;
                    }
                    total_written += written;
                }
                break;
            }
            default:
                PanicInfo(fmt::format("not supported data type {}",
                                      datatype_name(data_type)));
        }
    } else {
        // fixed-width rows are contiguous: one write for the whole block
        total_written += write(fd, data->Data(), data->Size());
    }
    return total_written;
}
// CreateMap creates a memory mapping,
// if mmap enabled, this writes field data to disk and create a map to the file,
// otherwise this just alloc memory
inline void*
CreateMap(int64_t segment_id,
          const FieldMeta& field_meta,
          const FieldDataInfo& info) {
    // Flags are fixed at compile time. The original kept a mutable function-
    // local `static int` and OR-ed MAP_POPULATE into it on every call — an
    // unsynchronized write (data race under concurrent loads) and redundant
    // after the first call.
#ifdef MAP_POPULATE
    constexpr int mmap_flags = MAP_PRIVATE | MAP_POPULATE;
#else
    // macOS doesn't support MAP_POPULATE
    constexpr int mmap_flags = MAP_PRIVATE;
#endif

    // simdjson requires a padding following the json data
    const size_t padding = field_meta.get_data_type() == DataType::JSON
                               ? simdjson::SIMDJSON_PADDING
                               : 0;
    auto data_size = GetDataSize(info.datas);

    // No mmap directory configured: back the column with anonymous memory.
    if (info.mmap_dir_path.empty()) {
        auto data_type = field_meta.get_data_type();
        if (data_size == 0)
            return nullptr;

        // Use anon mapping so we are able to free these memory with munmap only
        void* map = mmap(nullptr,
                         data_size + padding,
                         PROT_READ | PROT_WRITE,
                         mmap_flags | MAP_ANON,
                         -1,
                         0);
        AssertInfo(
            map != MAP_FAILED,
            fmt::format("failed to create anon map, err: {}", strerror(errno)));
        auto dst = map;
        for (const auto& data : info.datas) {
            dst = FillField(data_type, data, dst);
        }
        return map;
    }

    // mmap enabled: spill the column to mmap_dir_path/<segment_id>/<field_id>
    // and map that file read-only.
    auto filepath = std::filesystem::path(info.mmap_dir_path) /
                    std::to_string(segment_id) / std::to_string(info.field_id);
    auto dir = filepath.parent_path();
    std::filesystem::create_directories(dir);

    int fd =
        open(filepath.c_str(), O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR);
    AssertInfo(fd != -1,
               fmt::format("failed to create mmap file {}", filepath.c_str()));

    auto data_type = field_meta.get_data_type();
    ssize_t total_written{0};
    for (const auto& data : info.datas) {
        auto written = WriteFieldData(fd, data_type, data);
        if (written != data->Size()) {
            // a block was written short; the assertion below reports it
            break;
        }
        total_written += written;
    }
    AssertInfo(
        total_written == data_size ||
            (total_written != -1 &&
             datatype_is_variable(field_meta.get_data_type())),
        fmt::format(
            "failed to write data file {}, written {} but total {}, err: {}",
            filepath.c_str(),
            total_written,
            data_size,
            strerror(errno)));
    int ok = fsync(fd);
    AssertInfo(ok == 0,
               fmt::format("failed to fsync mmap data file {}, err: {}",
                           filepath.c_str(),
                           strerror(errno)));

    // Empty field: nothing to map. Still unlink the backing file and close
    // the descriptor — the original returned here without doing either,
    // leaking the fd and leaving a stale zero-byte file on disk.
    if (total_written == 0) {
        unlink(filepath.c_str());
        close(fd);
        return nullptr;
    }

    auto map =
        mmap(nullptr, total_written + padding, PROT_READ, mmap_flags, fd, 0);
    AssertInfo(map != MAP_FAILED,
               fmt::format("failed to create map for data file {}, err: {}",
                           filepath.c_str(),
                           strerror(errno)));

#ifndef MAP_POPULATE
    // Manually access the mapping to populate it
    const size_t page_size = getpagesize();
    char* begin = (char*)map;
    char* end = begin + total_written;
    for (char* page = begin; page < end; page += page_size) {
        char value = page[0];
        (void)value;  // only the access matters
    }
#endif

    // unlink this data file so
    // then it will be auto removed after we don't need it again
    ok = unlink(filepath.c_str());
    AssertInfo(ok == 0,
               fmt::format("failed to unlink mmap data file {}, err: {}",
                           filepath.c_str(),
                           strerror(errno)));
    ok = close(fd);
    AssertInfo(ok == 0,
               fmt::format("failed to close data file {}, err: {}",
                           filepath.c_str(),
                           strerror(errno)));
    return map;
}
} // namespace milvus

View File

@ -30,6 +30,7 @@ set(SEGCORE_FILES
plan_c.cpp
reduce_c.cpp
load_index_c.cpp
load_field_data_c.cpp
SegmentInterface.cpp
SegcoreConfig.cpp
IndexConfigGenerator.cpp

View File

@ -91,80 +91,4 @@ VectorBase::set_data_raw(ssize_t element_offset,
}
}
// Fills this vector's single chunk from a protobuf DataArray, dispatching on
// the field's data type. Vector fields are copied straight from the repeated
// float/binary payload; narrow integer types (INT8/INT16) are widened from
// the proto's int storage into a temporary buffer first; VARCHAR and JSON
// grow the chunk on demand and assign row by row.
// NOTE(review): used on the load path where all rows land in chunk 0 —
// presumably sealed-segment style loading; confirm against callers.
void
VectorBase::fill_chunk_data(ssize_t element_count,
                            const DataArray* data,
                            const FieldMeta& field_meta) {
    if (field_meta.is_vector()) {
        if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
            return fill_chunk_data(VEC_FIELD_DATA(data, float).data(),
                                   element_count);
        } else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) {
            return fill_chunk_data(VEC_FIELD_DATA(data, binary), element_count);
        } else {
            PanicInfo("unsupported");
        }
    }

    switch (field_meta.get_data_type()) {
        case DataType::BOOL: {
            return fill_chunk_data(FIELD_DATA(data, bool).data(),
                                   element_count);
        }
        case DataType::INT8: {
            // proto stores small ints as int32; narrow into an int8 buffer
            auto& src_data = FIELD_DATA(data, int);
            std::vector<int8_t> data_raw(src_data.size());
            std::copy_n(src_data.data(), src_data.size(), data_raw.data());
            return fill_chunk_data(data_raw.data(), element_count);
        }
        case DataType::INT16: {
            // proto stores small ints as int32; narrow into an int16 buffer
            auto& src_data = FIELD_DATA(data, int);
            std::vector<int16_t> data_raw(src_data.size());
            std::copy_n(src_data.data(), src_data.size(), data_raw.data());
            return fill_chunk_data(data_raw.data(), element_count);
        }
        case DataType::INT32: {
            return fill_chunk_data(FIELD_DATA(data, int).data(), element_count);
        }
        case DataType::INT64: {
            return fill_chunk_data(FIELD_DATA(data, long).data(),
                                   element_count);
        }
        case DataType::FLOAT: {
            return fill_chunk_data(FIELD_DATA(data, float).data(),
                                   element_count);
        }
        case DataType::DOUBLE: {
            return fill_chunk_data(FIELD_DATA(data, double).data(),
                                   element_count);
        }
        case DataType::VARCHAR: {
            // strings can't be memcpy'ed: grow chunk 0 and assign per row
            auto vec = static_cast<ConcurrentVector<std::string>*>(this);
            auto count = FIELD_DATA(data, string).size();
            vec->grow_on_demand(count);
            auto& chunk = vec->get_chunk(0);
            size_t index = 0;
            for (auto& str : FIELD_DATA(data, string)) {
                chunk[index++] = str;
            }
            return;
        }
        case DataType::JSON: {
            // wrap each row in a simdjson padded_string before storing
            auto vec = static_cast<ConcurrentVector<Json>*>(this);
            auto count = FIELD_DATA(data, json).size();
            vec->grow_on_demand(count);
            auto& chunk = vec->get_chunk(0);
            size_t index = 0;
            for (auto& str : FIELD_DATA(data, json)) {
                chunk[index++] = Json(simdjson::padded_string(str));
            }
            return;
        }
        default: {
            PanicInfo("unsupported");
        }
    }
}
} // namespace milvus::segcore

View File

@ -31,6 +31,7 @@
#include "common/Types.h"
#include "common/Utils.h"
#include "exceptions/EasyAssert.h"
#include "storage/FieldData.h"
namespace milvus::segcore {
@ -100,6 +101,10 @@ class VectorBase {
const void* source,
ssize_t element_count) = 0;
virtual void
set_data_raw(ssize_t element_offset,
const std::vector<storage::FieldDataPtr>& data) = 0;
void
set_data_raw(ssize_t element_offset,
ssize_t element_count,
@ -107,12 +112,7 @@ class VectorBase {
const FieldMeta& field_meta);
virtual void
fill_chunk_data(const void* source, ssize_t element_count) = 0;
void
fill_chunk_data(ssize_t element_count,
const DataArray* data,
const FieldMeta& field_meta);
fill_chunk_data(const std::vector<storage::FieldDataPtr>& data) = 0;
virtual SpanBase
get_span_base(int64_t chunk_id) const = 0;
@ -196,13 +196,32 @@ class ConcurrentVectorImpl : public VectorBase {
}
void
fill_chunk_data(const void* source, ssize_t element_count) override {
if (element_count == 0) {
return;
}
fill_chunk_data(const std::vector<storage::FieldDataPtr>& datas)
override { // used only for sealed segment
AssertInfo(chunks_.size() == 0, "no empty concurrent vector");
int64_t element_count = 0;
for (auto& field_data : datas) {
element_count += field_data->get_num_rows();
}
chunks_.emplace_to_at_least(1, Dim * element_count);
set_data(0, static_cast<const Type*>(source), element_count);
int64_t offset = 0;
for (auto& field_data : datas) {
auto num_rows = field_data->get_num_rows();
set_data(
offset, static_cast<const Type*>(field_data->Data()), num_rows);
offset += num_rows;
}
}
void
set_data_raw(ssize_t element_offset,
const std::vector<storage::FieldDataPtr>& datas) override {
for (auto& field_data : datas) {
auto num_rows = field_data->get_num_rows();
set_data_raw(element_offset, field_data->Data(), num_rows);
element_offset += num_rows;
}
}
void

View File

@ -276,6 +276,27 @@ class IndexingRecord {
}
}
    // concurrent, reentrant
    // Feeds a freshly inserted batch into this field's growing index, if one
    // is registered. Only float-vector fields are appended, and only once the
    // segment has accumulated enough rows (reserved_offset + size reaches the
    // index's build threshold). `data` supplies the raw bytes of the new
    // batch; `record` supplies the field's full in-memory column.
    template <bool is_sealed>
    void
    AppendingIndex(int64_t reserved_offset,
                   int64_t size,
                   FieldId fieldId,
                   const storage::FieldDataPtr data,
                   const InsertRecord<is_sealed>& record) {
        if (is_in(fieldId)) {
            auto& indexing = field_indexings_.at(fieldId);
            if (indexing->get_field_meta().is_vector() &&
                indexing->get_field_meta().get_data_type() ==
                    DataType::VECTOR_FLOAT &&
                reserved_offset + size >= indexing->get_build_threshold()) {
                auto vec_base = record.get_field_data_base(fieldId);
                indexing->AppendSegmentIndex(
                    reserved_offset, size, vec_base, data->Data());
            }
        }
    }
void
GetDataFromIndex(FieldId fieldId,
const int64_t* seg_offsets,

View File

@ -247,6 +247,37 @@ struct InsertRecord {
return res_offsets;
}
void
insert_pks(const std::vector<storage::FieldDataPtr>& field_datas) {
std::lock_guard lck(shared_mutex_);
int64_t offset = 0;
for (auto& data : field_datas) {
int64_t row_count = data->get_num_rows();
auto data_type = data->get_data_type();
switch (data_type) {
case DataType::INT64: {
for (int i = 0; i < row_count; ++i) {
pk2offset_->insert(
*static_cast<const int64_t*>(data->RawValue(i)),
offset++);
}
break;
}
case DataType::VARCHAR: {
for (int i = 0; i < row_count; ++i) {
pk2offset_->insert(
*static_cast<const std::string*>(data->RawValue(i)),
offset++);
}
break;
}
default: {
PanicInfo("unsupported primary key data type");
}
}
}
}
std::vector<SegOffset>
search_pk(const PkType& pk, int64_t insert_barrier) const {
std::shared_lock lck(shared_mutex_);

View File

@ -23,6 +23,8 @@
#include "query/SearchOnSealed.h"
#include "segcore/SegmentGrowingImpl.h"
#include "segcore/Utils.h"
#include "storage/RemoteChunkManagerSingleton.h"
#include "storage/Util.h"
namespace milvus::segcore {
@ -112,6 +114,77 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
reserved_offset + size);
}
// Loads all field data for this growing segment from remote storage.
// Expects `infos` to carry every schema field plus the two system columns
// (timestamps and row IDs) — hence schema size + 2 — and requires the
// primary-key field to be present. Reserves `num_rows` slots once, then
// streams each field's binlogs into the insert record; PKs are indexed and
// growing indexes are appended when enabled.
// NOTE(review): num_rows is taken from the first field's metadata and each
// field is checked against it, so all fields must report the same row count.
void
SegmentGrowingImpl::LoadFieldData(const LoadFieldDataInfo& infos) {
    // schema don't include system field
    AssertInfo(infos.field_infos.size() == schema_->size() + 2,
               "lost some field data when load for growing segment");
    AssertInfo(infos.field_infos.find(TimestampFieldID.get()) !=
                   infos.field_infos.end(),
               "timestamps field data should be included");
    AssertInfo(
        infos.field_infos.find(RowFieldID.get()) != infos.field_infos.end(),
        "rowID field data should be included");
    auto primary_field_id =
        schema_->get_primary_field_id().value_or(FieldId(-1));
    AssertInfo(primary_field_id.get() != INVALID_FIELD_ID, "Primary key is -1");
    AssertInfo(infos.field_infos.find(primary_field_id.get()) !=
                   infos.field_infos.end(),
               "primary field data should be included");

    // row count comes from the first field's metadata (all fields must match)
    int64_t num_rows = 0;
    for (auto& field : infos.field_infos) {
        num_rows = field.second.row_count;
        break;
    }

    // reserve a contiguous offset range for the whole batch
    auto reserved_offset = PreInsert(num_rows);
    for (auto& [id, info] : infos.field_infos) {
        auto field_id = FieldId(id);
        auto insert_files = info.insert_files;
        // fetch this field's binlog files from remote storage
        auto field_datas = LoadFieldDatasFromRemote(insert_files);
        AssertInfo(
            num_rows == storage::GetTotalNumRowsForFieldDatas(field_datas),
            "inconsistent num row between multi fields");

        if (field_id == TimestampFieldID) {
            // step 2: sort timestamp
            // query node already guarantees that the timestamp is ordered, avoid field data copy in c++

            // step 3: fill into Segment.ConcurrentVector
            insert_record_.timestamps_.set_data_raw(reserved_offset,
                                                    field_datas);
            continue;
        }

        if (field_id == RowFieldID) {
            insert_record_.row_ids_.set_data_raw(reserved_offset, field_datas);
            continue;
        }

        // skip raw storage when an index is already kept in sync for this field
        if (!indexing_record_.SyncDataWithIndex(field_id)) {
            insert_record_.get_field_data_base(field_id)->set_data_raw(
                reserved_offset, field_datas);
        }
        if (segcore_config_.get_enable_growing_segment_index()) {
            // append each block to the growing index at its own offset
            auto offset = reserved_offset;
            for (auto data : field_datas) {
                auto row_count = data->get_num_rows();
                indexing_record_.AppendingIndex(
                    offset, row_count, field_id, data, insert_record_);
                offset += row_count;
            }
        }
        if (field_id == primary_field_id) {
            insert_record_.insert_pks(field_datas);
        }
    }

    // step 5: update small indexes
    insert_record_.ack_responder_.AddSegment(reserved_offset,
                                             reserved_offset + num_rows);
}
Status
SegmentGrowingImpl::Delete(int64_t reserved_begin,
int64_t size,

View File

@ -62,6 +62,9 @@ class SegmentGrowingImpl : public SegmentGrowing {
void
LoadDeletedRecord(const LoadDeletedRecordInfo& info) override;
void
LoadFieldData(const LoadFieldDataInfo& info) override;
std::string
debug() const override;

View File

@ -83,6 +83,9 @@ class SegmentInterface {
virtual void
LoadDeletedRecord(const LoadDeletedRecordInfo& info) = 0;
virtual void
LoadFieldData(const LoadFieldDataInfo& info) = 0;
virtual int64_t
get_segment_id() const = 0;

View File

@ -18,6 +18,7 @@
#include "pb/segcore.pb.h"
#include "segcore/SegmentInterface.h"
#include "segcore/Types.h"
#include "mmap/Column.h"
namespace milvus::segcore {
@ -28,11 +29,11 @@ class SegmentSealed : public SegmentInternalInterface {
virtual void
LoadSegmentMeta(const milvus::proto::segcore::LoadSegmentMeta& meta) = 0;
virtual void
LoadFieldData(const LoadFieldDataInfo& info) = 0;
virtual void
DropIndex(const FieldId field_id) = 0;
virtual void
DropFieldData(const FieldId field_id) = 0;
virtual void
LoadFieldData(FieldId field_id, const FieldDataInfo& data_info) = 0;
SegmentType
type() const override {

View File

@ -21,7 +21,7 @@
#include "Utils.h"
#include "Types.h"
#include "common/Column.h"
#include "mmap/Column.h"
#include "common/Consts.h"
#include "common/FieldMeta.h"
#include "common/Types.h"
@ -29,7 +29,7 @@
#include "query/ScalarIndex.h"
#include "query/SearchBruteForce.h"
#include "query/SearchOnSealed.h"
#include "index/Utils.h"
#include "storage/Util.h"
namespace milvus::segcore {
@ -166,52 +166,73 @@ SegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) {
}
void
SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& load_info) {
// print(info);
// NOTE: lock only when data is ready to avoid starvation
// only one field for now, parallel load field data in golang
for (auto& [id, info] : load_info.field_infos) {
AssertInfo(info.row_count > 0, "The row count of field data is 0");
auto field_id = FieldId(info.field_id);
AssertInfo(info.field_data != nullptr, "Field info blob is null");
auto size = info.row_count;
auto field_id = FieldId(id);
auto insert_files = info.insert_files;
auto field_datas = LoadFieldDatasFromRemote(insert_files);
int64_t num_rows = storage::GetTotalNumRowsForFieldDatas(field_datas);
AssertInfo(num_rows == info.row_count,
"inconsistent field data row count with meta");
auto field_data_info = FieldDataInfo{
field_id.get(), num_rows, field_datas, load_info.mmap_dir_path};
LoadFieldData(field_id, field_data_info);
}
}
void
SegmentSealedImpl::LoadFieldData(FieldId field_id,
const FieldDataInfo& data_info) {
auto num_rows = data_info.row_count;
if (row_count_opt_.has_value()) {
AssertInfo(
row_count_opt_.value() == size,
fmt::format(
"field {} has different row count {} to other column's {}",
field_id.get(),
size,
row_count_opt_.value()));
AssertInfo(row_count_opt_.value() == num_rows,
"field (" + std::to_string(field_id.get()) +
") data has different row count (" +
std::to_string(num_rows) +
") than other column's row count (" +
std::to_string(row_count_opt_.value()) + ")");
}
if (SystemProperty::Instance().IsSystem(field_id)) {
auto system_field_type =
SystemProperty::Instance().GetSystemFieldType(field_id);
if (system_field_type == SystemFieldType::Timestamp) {
auto timestamps = reinterpret_cast<const Timestamp*>(
FIELD_DATA(info.field_data, long).data());
std::vector<Timestamp> timestamps(num_rows);
int64_t offset = 0;
for (auto& data : data_info.datas) {
int64_t row_count = data->get_num_rows();
std::copy_n(static_cast<const Timestamp*>(data->Data()),
row_count,
timestamps.data() + offset);
offset += row_count;
}
TimestampIndex index;
auto min_slice_length = size < 4096 ? 1 : 4096;
auto meta = GenerateFakeSlices(timestamps, size, min_slice_length);
auto min_slice_length = num_rows < 4096 ? 1 : 4096;
auto meta = GenerateFakeSlices(
timestamps.data(), num_rows, min_slice_length);
index.set_length_meta(std::move(meta));
index.build_with(timestamps, size);
// todo ::opt to avoid copy timestamps from field data
index.build_with(timestamps.data(), num_rows);
// use special index
std::unique_lock lck(mutex_);
AssertInfo(insert_record_.timestamps_.empty(), "already exists");
insert_record_.timestamps_.fill_chunk_data(timestamps, size);
insert_record_.timestamps_.fill_chunk_data(data_info.datas);
insert_record_.timestamp_index_ = std::move(index);
AssertInfo(insert_record_.timestamps_.num_chunk() == 1,
"num chunk not equal to 1 for sealed segment");
} else {
AssertInfo(system_field_type == SystemFieldType::RowId,
"System field type of id column is not RowId");
auto row_ids = reinterpret_cast<const idx_t*>(
FIELD_DATA(info.field_data, long).data());
// write data under lock
std::unique_lock lck(mutex_);
AssertInfo(insert_record_.row_ids_.empty(), "already exists");
insert_record_.row_ids_.fill_chunk_data(row_ids, size);
insert_record_.row_ids_.fill_chunk_data(data_info.datas);
AssertInfo(insert_record_.row_ids_.num_chunk() == 1,
"num chunk not equal to 1 for sealed segment");
}
@ -220,36 +241,33 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
// prepare data
auto& field_meta = (*schema_)[field_id];
auto data_type = field_meta.get_data_type();
AssertInfo(data_type == DataType(info.field_data->type()),
"field type of load data is inconsistent with the schema");
// Don't allow raw data and index exist at the same time
AssertInfo(!get_bit(index_ready_bitset_, field_id),
"field data can't be loaded when indexing exists");
size_t size = 0;
if (datatype_is_variable(data_type)) {
std::unique_ptr<ColumnBase> column{};
switch (data_type) {
case milvus::DataType::STRING:
case milvus::DataType::VARCHAR: {
column = std::make_unique<VariableColumn<std::string>>(
get_segment_id(), field_meta, info);
get_segment_id(), field_meta, data_info);
break;
}
case milvus::DataType::JSON: {
column = std::make_unique<VariableColumn<Json>>(
get_segment_id(), field_meta, info);
get_segment_id(), field_meta, data_info);
}
default: {
}
}
size = column->size();
std::unique_lock lck(mutex_);
variable_fields_.emplace(field_id, std::move(column));
} else {
auto column = Column(get_segment_id(), field_meta, info);
size = column.size();
auto column = Column(get_segment_id(), field_meta, data_info);
std::unique_lock lck(mutex_);
fixed_fields_.emplace(field_id, std::move(column));
}
@ -258,19 +276,15 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
if (schema_->get_primary_field_id() == field_id) {
AssertInfo(field_id.get() != -1, "Primary key is -1");
AssertInfo(insert_record_.empty_pks(), "already exists");
std::vector<PkType> pks(info.row_count);
ParsePksFromFieldData(pks, *info.field_data);
for (int i = 0; i < info.row_count; ++i) {
insert_record_.insert_pk(pks[i], i);
}
insert_record_.insert_pks(data_info.datas);
insert_record_.seal_pks();
}
std::unique_lock lck(mutex_);
set_bit(field_data_ready_bitset_, field_id, true);
}
std::unique_lock lck(mutex_);
update_row_count(info.row_count);
update_row_count(num_rows);
}
void

View File

@ -28,7 +28,7 @@
#include "SealedIndexingRecord.h"
#include "SegmentSealed.h"
#include "TimestampIndex.h"
#include "common/Column.h"
#include "mmap/Column.h"
#include "index/ScalarIndex.h"
#include "sys/mman.h"
@ -55,6 +55,8 @@ class SegmentSealedImpl : public SegmentSealed {
HasIndex(FieldId field_id) const override;
bool
HasFieldData(FieldId field_id) const override;
void
LoadFieldData(FieldId field_id, const FieldDataInfo& data_info) override;
int64_t
get_segment_id() const override {

View File

@ -41,7 +41,6 @@ struct LoadIndexInfo {
std::map<std::string, std::string> index_params;
std::vector<std::string> index_files;
index::IndexBasePtr index;
storage::StorageConfig storage_config;
};
} // namespace milvus::segcore

View File

@ -12,8 +12,11 @@
#include "segcore/Utils.h"
#include <string>
#include "common/Utils.h"
#include "index/ScalarIndex.h"
#include "storage/RemoteChunkManagerSingleton.h"
#include "common/Common.h"
#include "storage/Util.h"
#include "mmap/Utils.h"
namespace milvus::segcore {
@ -37,6 +40,37 @@ ParsePksFromFieldData(std::vector<PkType>& pks, const DataArray& data) {
}
}
// Copies primary keys out of a batch of field-data blocks into `pks`,
// writing sequentially from index 0. Supports INT64 and VARCHAR keys;
// every block must match `data_type`.
// NOTE(review): writes through pks.data() + offset, so `pks` must already be
// sized to hold the total row count of all blocks — confirm callers resize
// before invoking.
void
ParsePksFromFieldData(DataType data_type,
                      std::vector<PkType>& pks,
                      const std::vector<storage::FieldDataPtr>& datas) {
    int64_t offset = 0;
    for (auto& field_data : datas) {
        AssertInfo(data_type == field_data->get_data_type(),
                   "inconsistent data type when parse pk from field data");
        int64_t row_count = field_data->get_num_rows();
        switch (data_type) {
            case DataType::INT64: {
                std::copy_n(static_cast<const int64_t*>(field_data->Data()),
                            row_count,
                            pks.data() + offset);
                break;
            }
            case DataType::VARCHAR: {
                std::copy_n(static_cast<const std::string*>(field_data->Data()),
                            row_count,
                            pks.data() + offset);
                break;
            }
            default: {
                PanicInfo("unsupported");
            }
        }
        offset += row_count;
    }
}
void
ParsePksFromIDs(std::vector<PkType>& pks,
DataType data_type,
@ -509,5 +543,47 @@ ReverseDataFromIndex(const index::IndexBase* index,
return data_array;
}
// init segcore storage config first, and create default remote chunk manager
// segcore use default remote chunk manager to load data from minio/s3
//
// Downloads all binlog files in `remote_files` and returns one FieldDataPtr
// per file, in ascending order of the numeric suffix of each file path.
// NOTE: `remote_files` is sorted in place (hence the non-const reference).
std::vector<storage::FieldDataPtr>
LoadFieldDatasFromRemote(std::vector<std::string>& remote_files) {
    auto rcm = storage::RemoteChunkManagerSingleton::GetInstance()
                   .GetRemoteChunkManager();
    // Sort by the path component after the last '/', parsed as an integer
    // (presumably the binlog log id — std::stol throws if it is not numeric).
    std::sort(remote_files.begin(),
              remote_files.end(),
              [](const std::string& a, const std::string& b) {
                  return std::stol(a.substr(a.find_last_of("/") + 1)) <
                         std::stol(b.substr(b.find_last_of("/") + 1));
              });
    // Cap how many files are fetched per batch so one batch stays within
    // the per-field memory budget (assuming each file is ~one slice).
    auto parallel_degree =
        uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE);
    std::vector<std::string> batch_files;
    std::vector<storage::FieldDataPtr> field_datas;
    // Download the current batch and append the decoded field data,
    // preserving the sorted file order.
    auto FetchRawData = [&]() {
        auto raw_datas = GetObjectData(rcm.get(), batch_files);
        for (auto& data : raw_datas) {
            field_datas.emplace_back(data);
        }
    };
    for (auto& file : remote_files) {
        if (batch_files.size() >= parallel_degree) {
            FetchRawData();
            batch_files.clear();
        }
        batch_files.emplace_back(file);
    }
    // Flush the final, possibly partial batch.
    if (batch_files.size() > 0) {
        FetchRawData();
    }
    AssertInfo(field_datas.size() == remote_files.size(),
               "inconsistent file num and raw data num!");
    return field_datas;
}
} // namespace milvus::segcore

View File

@ -28,6 +28,11 @@ namespace milvus::segcore {
void
ParsePksFromFieldData(std::vector<PkType>& pks, const DataArray& data);
void
ParsePksFromFieldData(DataType data_type,
std::vector<PkType>& pks,
const std::vector<storage::FieldDataPtr>& datas);
void
ParsePksFromIDs(std::vector<PkType>& pks,
DataType data_type,
@ -141,4 +146,7 @@ ReverseDataFromIndex(const index::IndexBase* index,
int64_t count,
const FieldMeta& field_meta);
std::vector<storage::FieldDataPtr>
LoadFieldDatasFromRemote(std::vector<std::string>& remote_files);
} // namespace milvus::segcore

View File

@ -0,0 +1,83 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "common/CGoHelper.h"
#include "common/LoadInfo.h"
#include "segcore/load_field_data_c.h"
// Allocate an empty LoadFieldDataInfo and hand ownership to the C caller
// through the out-parameter. The caller must release it with
// DeleteLoadFieldDataInfo.
CStatus
NewLoadFieldDataInfo(CLoadFieldDataInfo* c_load_field_data_info) {
    try {
        auto info = std::make_unique<LoadFieldDataInfo>();
        // Transfer ownership across the C boundary as an opaque handle.
        *c_load_field_data_info = info.release();
        return milvus::SuccessCStatus();
    } catch (std::exception& e) {
        return milvus::FailureCStatus(UnexpectedError, e.what());
    }
}
// Destroy a LoadFieldDataInfo previously created by NewLoadFieldDataInfo.
// Accepts a null handle (delete on nullptr is a no-op).
void
DeleteLoadFieldDataInfo(CLoadFieldDataInfo c_load_field_data_info) {
    delete static_cast<LoadFieldDataInfo*>(c_load_field_data_info);
}
// Register a field (id + expected row count) on the load-info handle.
// Fails if the same field id was already appended.
// Returns SuccessCStatus on success, FailureCStatus(UnexpectedError)
// with the exception message otherwise.
CStatus
AppendLoadFieldInfo(CLoadFieldDataInfo c_load_field_data_info,
                    int64_t field_id,
                    int64_t row_count) {
    try {
        auto load_field_data_info =
            static_cast<LoadFieldDataInfo*>(c_load_field_data_info);
        FieldBinlogInfo binlog_info;
        binlog_info.field_id = field_id;
        binlog_info.row_count = row_count;
        // try_emplace performs a single map lookup: it inserts only when the
        // key is absent, replacing the original find + operator[] pair (two
        // lookups plus a default-construct-then-assign of FieldBinlogInfo).
        auto [iter, inserted] = load_field_data_info->field_infos.try_emplace(
            field_id, std::move(binlog_info));
        if (!inserted) {
            throw std::runtime_error("append same field info multi times");
        }
        return milvus::SuccessCStatus();
    } catch (std::exception& e) {
        return milvus::FailureCStatus(UnexpectedError, e.what());
    }
}
// Append one binlog file path to a field that was previously registered
// via AppendLoadFieldInfo. Fails if the field id is unknown.
// Returns SuccessCStatus on success, FailureCStatus(UnexpectedError)
// with the exception message otherwise.
CStatus
AppendLoadFieldDataPath(CLoadFieldDataInfo c_load_field_data_info,
                        int64_t field_id,
                        const char* c_file_path) {
    try {
        auto load_field_data_info =
            static_cast<LoadFieldDataInfo*>(c_load_field_data_info);
        auto iter = load_field_data_info->field_infos.find(field_id);
        if (iter == load_field_data_info->field_infos.end()) {
            throw std::runtime_error("please append field info first");
        }
        // Reuse the iterator from the lookup above instead of re-searching
        // the map with operator[]; emplace_back constructs the std::string
        // in place from the C string.
        iter->second.insert_files.emplace_back(c_file_path);
        return milvus::SuccessCStatus();
    } catch (std::exception& e) {
        return milvus::FailureCStatus(UnexpectedError, e.what());
    }
}
// Record the directory used for mmap-backed loading on the load-info handle.
// Overwrites any previously set path.
void
AppendMMapDirPath(CLoadFieldDataInfo c_load_field_data_info,
                  const char* c_dir_path) {
    auto info = static_cast<LoadFieldDataInfo*>(c_load_field_data_info);
    info->mmap_dir_path = c_dir_path;
}

View File

@ -0,0 +1,50 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef __cplusplus
extern "C" {
#endif
#include <stdlib.h>
#include "common/type_c.h"
// Opaque handle to a C++ LoadFieldDataInfo, passed across the C boundary.
typedef void* CLoadFieldDataInfo;

// Allocate a new, empty load-info object and store the handle in
// *c_load_field_data_info. Release it with DeleteLoadFieldDataInfo.
CStatus
NewLoadFieldDataInfo(CLoadFieldDataInfo* c_load_field_data_info);

// Destroy a handle created by NewLoadFieldDataInfo.
void
DeleteLoadFieldDataInfo(CLoadFieldDataInfo c_load_field_data_info);

// Register a field (id + row count). Each field id may be appended once;
// a duplicate id yields a failure status.
CStatus
AppendLoadFieldInfo(CLoadFieldDataInfo c_load_field_data_info,
                    int64_t field_id,
                    int64_t row_count);

// Add one binlog file path for a field previously registered with
// AppendLoadFieldInfo; fails if the field id is unknown.
CStatus
AppendLoadFieldDataPath(CLoadFieldDataInfo c_load_field_data_info,
                        int64_t field_id,
                        const char* file_path);

// Set the directory used for mmap-backed field loading.
void
AppendMMapDirPath(CLoadFieldDataInfo c_load_field_data_info,
                  const char* dir_path);
#ifdef __cplusplus
}
#endif

View File

@ -11,36 +11,20 @@
#include "segcore/load_index_c.h"
#include "common/CDataType.h"
#include "common/FieldMeta.h"
#include "common/Utils.h"
#include "index/IndexFactory.h"
#include "index/Meta.h"
#include "index/Utils.h"
#include "segcore/Types.h"
#include "storage/Util.h"
#include "storage/RemoteChunkManagerSingleton.h"
#include "storage/LocalChunkManagerSingleton.h"
CStatus
NewLoadIndexInfo(CLoadIndexInfo* c_load_index_info,
CStorageConfig c_storage_config) {
NewLoadIndexInfo(CLoadIndexInfo* c_load_index_info) {
try {
auto load_index_info =
std::make_unique<milvus::segcore::LoadIndexInfo>();
auto& storage_config = load_index_info->storage_config;
storage_config.address = std::string(c_storage_config.address);
storage_config.bucket_name = std::string(c_storage_config.bucket_name);
storage_config.access_key_id =
std::string(c_storage_config.access_key_id);
storage_config.access_key_value =
std::string(c_storage_config.access_key_value);
storage_config.remote_root_path =
std::string(c_storage_config.remote_root_path);
storage_config.storage_type =
std::string(c_storage_config.storage_type);
storage_config.iam_endpoint =
std::string(c_storage_config.iam_endpoint);
storage_config.useSSL = c_storage_config.useSSL;
storage_config.useIAM = c_storage_config.useIAM;
*c_load_index_info = load_index_info.release();
auto status = CStatus();
@ -143,11 +127,15 @@ appendVecIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) {
load_index_info->field_id,
load_index_info->index_build_id,
load_index_info->index_version};
auto remote_chunk_manager =
milvus::storage::RemoteChunkManagerSingleton::GetInstance()
.GetRemoteChunkManager();
auto file_manager =
milvus::storage::CreateFileManager(index_info.index_type,
field_meta,
index_meta,
load_index_info->storage_config);
remote_chunk_manager);
AssertInfo(file_manager != nullptr, "create file manager failed!");
auto config = milvus::index::ParseConfigFromIndexParams(
load_index_info->index_params);
@ -212,6 +200,69 @@ AppendIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) {
return appendScalarIndex(c_load_index_info, c_binary_set);
}
// Build and load an index directly from remote index files recorded on the
// handle (index_files), instead of from an in-memory binary set as
// AppendIndex does. Reads index_type/metric_type from index_params, wires a
// FileManager backed by the default remote chunk manager, then creates the
// index via IndexFactory and calls Load with the file list in the config.
// Returns a Success CStatus, or UnexpectedError with the exception message.
CStatus
AppendIndexV2(CLoadIndexInfo c_load_index_info) {
    try {
        auto load_index_info =
            (milvus::segcore::LoadIndexInfo*)c_load_index_info;
        auto& index_params = load_index_info->index_params;
        auto field_type = load_index_info->field_type;
        milvus::index::CreateIndexInfo index_info;
        index_info.field_type = load_index_info->field_type;
        // get index type
        AssertInfo(index_params.find("index_type") != index_params.end(),
                   "index type is empty");
        index_info.index_type = index_params.at("index_type");
        // get metric type (only required for vector indexes)
        if (milvus::datatype_is_vector(field_type)) {
            AssertInfo(index_params.find("metric_type") != index_params.end(),
                       "metric type is empty for vector index");
            index_info.metric_type = index_params.at("metric_type");
        }
        // init file manager: metas identify which segment/field/build this
        // index belongs to when fetching files from remote storage
        milvus::storage::FieldDataMeta field_meta{
            load_index_info->collection_id,
            load_index_info->partition_id,
            load_index_info->segment_id,
            load_index_info->field_id};
        milvus::storage::IndexMeta index_meta{load_index_info->segment_id,
                                              load_index_info->field_id,
                                              load_index_info->index_build_id,
                                              load_index_info->index_version};
        auto remote_chunk_manager =
            milvus::storage::RemoteChunkManagerSingleton::GetInstance()
                .GetRemoteChunkManager();
        auto file_manager =
            milvus::storage::CreateFileManager(index_info.index_type,
                                               field_meta,
                                               index_meta,
                                               remote_chunk_manager);
        AssertInfo(file_manager != nullptr, "create file manager failed!");
        auto config = milvus::index::ParseConfigFromIndexParams(
            load_index_info->index_params);
        // tell Load which remote files make up the index
        config["index_files"] = load_index_info->index_files;
        load_index_info->index =
            milvus::index::IndexFactory::GetInstance().CreateIndex(
                index_info, file_manager);
        load_index_info->index->Load(config);
        auto status = CStatus();
        status.error_code = Success;
        status.error_msg = "";
        return status;
    } catch (std::exception& e) {
        auto status = CStatus();
        status.error_code = UnexpectedError;
        // strdup so the message outlives this frame; presumably freed by the
        // Go/C caller that consumes the CStatus — TODO confirm ownership
        status.error_msg = strdup(e.what());
        return status;
    }
}
CStatus
AppendIndexFilePath(CLoadIndexInfo c_load_index_info, const char* c_file_path) {
try {
@ -261,12 +312,14 @@ CleanLoadedIndex(CLoadIndexInfo c_load_index_info) {
try {
auto load_index_info =
(milvus::segcore::LoadIndexInfo*)c_load_index_info;
auto index_file_path_prefix = milvus::storage::GenLocalIndexPathPrefix(
load_index_info->index_build_id, load_index_info->index_version);
#ifdef BUILD_DISK_ANN
milvus::storage::LocalChunkManager::GetInstance().RemoveDir(
index_file_path_prefix);
#endif
auto local_chunk_manager =
milvus::storage::LocalChunkManagerSingleton::GetInstance()
.GetChunkManager();
auto index_file_path_prefix =
milvus::storage::GenIndexPathPrefix(local_chunk_manager,
load_index_info->index_build_id,
load_index_info->index_version);
local_chunk_manager->RemoveDir(index_file_path_prefix);
auto status = CStatus();
status.error_code = Success;
status.error_msg = "";

View File

@ -24,8 +24,7 @@ extern "C" {
typedef void* CLoadIndexInfo;
CStatus
NewLoadIndexInfo(CLoadIndexInfo* c_load_index_info,
CStorageConfig c_storage_config);
NewLoadIndexInfo(CLoadIndexInfo* c_load_index_info);
void
DeleteLoadIndexInfo(CLoadIndexInfo c_load_index_info);
@ -55,6 +54,9 @@ AppendIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set);
CStatus
AppendIndexFilePath(CLoadIndexInfo c_load_index_info, const char* file_path);
CStatus
AppendIndexV2(CLoadIndexInfo c_load_index_info);
CStatus
CleanLoadedIndex(CLoadIndexInfo c_load_index_info);

View File

@ -17,12 +17,12 @@
#include "common/Tracer.h"
#include "common/type_c.h"
#include "google/protobuf/text_format.h"
#include "index/IndexInfo.h"
#include "log/Log.h"
#include "segcore/Collection.h"
#include "segcore/SegmentGrowingImpl.h"
#include "segcore/SegmentSealedImpl.h"
#include "segcore/SegcoreConfig.h"
#include "storage/Util.h"
#include "mmap/Types.h"
////////////////////////////// common interfaces //////////////////////////////
CSegmentInterface
@ -228,22 +228,51 @@ Delete(CSegmentInterface c_segment,
////////////////////////////// interfaces for sealed segment //////////////////////////////
CStatus
LoadFieldData(CSegmentInterface c_segment,
CLoadFieldDataInfo load_field_data_info) {
CLoadFieldDataInfo c_load_field_data_info) {
try {
auto segment =
reinterpret_cast<milvus::segcore::SegmentInterface*>(c_segment);
AssertInfo(segment != nullptr, "segment conversion failed");
auto load_info = (LoadFieldDataInfo*)c_load_field_data_info;
segment->LoadFieldData(*load_info);
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(UnexpectedError, e.what());
}
}
// just for test
CStatus
LoadFieldRawData(CSegmentInterface c_segment,
int64_t field_id,
const void* data,
int64_t row_count) {
try {
auto segment_interface =
reinterpret_cast<milvus::segcore::SegmentInterface*>(c_segment);
auto segment =
dynamic_cast<milvus::segcore::SegmentSealed*>(segment_interface);
AssertInfo(segment != nullptr, "segment conversion failed");
auto field_data = std::make_unique<milvus::DataArray>();
auto suc = field_data->ParseFromArray(load_field_data_info.blob,
load_field_data_info.blob_size);
AssertInfo(suc, "unmarshal field data string failed");
auto load_info = LoadFieldDataInfo{load_field_data_info.field_id,
field_data.get(),
load_field_data_info.row_count,
load_field_data_info.mmap_dir_path};
segment->LoadFieldData(load_info);
milvus::DataType data_type;
int64_t dim = 1;
if (milvus::SystemProperty::Instance().IsSystem(
milvus::FieldId(field_id))) {
data_type = milvus::DataType::INT64;
} else {
auto field_meta = segment->get_schema()[milvus::FieldId(field_id)];
data_type = field_meta.get_data_type();
if (milvus::datatype_is_vector(data_type)) {
dim = field_meta.get_dim();
}
}
auto field_data = milvus::storage::CreateFieldData(data_type, dim);
field_data->FillFieldData(data, row_count);
auto field_data_info = milvus::FieldDataInfo{
field_id,
row_count,
std::vector<milvus::storage::FieldDataPtr>{field_data}};
segment->LoadFieldData(milvus::FieldId(field_id), field_data_info);
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(UnexpectedError, e.what());

View File

@ -22,6 +22,7 @@ extern "C" {
#include "common/type_c.h"
#include "segcore/plan_c.h"
#include "segcore/load_index_c.h"
#include "segcore/load_field_data_c.h"
typedef void* CSegmentInterface;
typedef void* CSearchResult;
@ -88,6 +89,12 @@ CStatus
LoadFieldData(CSegmentInterface c_segment,
CLoadFieldDataInfo load_field_data_info);
CStatus
LoadFieldRawData(CSegmentInterface c_segment,
int64_t field_id,
const void* data,
int64_t row_count);
CStatus
LoadDeletedRecord(CSegmentInterface c_segment,
CLoadDeletedRecordInfo deleted_record_info);

View File

@ -1,3 +1,14 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.

View File

@ -1,3 +1,14 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.

View File

@ -1,3 +1,14 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.

View File

@ -1,3 +1,14 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.

View File

@ -37,8 +37,8 @@ BinlogReader::Read(int64_t nbytes) {
Status(SERVER_UNEXPECTED_ERROR, "out range of binlog data"),
nullptr);
}
auto res = std::shared_ptr<uint8_t[]>(new uint8_t[nbytes]);
std::memcpy(res.get(), data_.get() + tell_, nbytes);
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto res = std::shared_ptr<uint8_t[]>(data_.get() + tell_, deleter);
tell_ += nbytes;
return std::make_pair(Status(SERVER_SUCCESS, ""), res);
}

View File

@ -31,12 +31,6 @@ class BinlogReader {
: data_(binlog_data), size_(length), tell_(0) {
}
explicit BinlogReader(const uint8_t* binlog_data, int64_t length)
: size_(length), tell_(0) {
data_ = std::shared_ptr<uint8_t[]>(new uint8_t[length]);
std::memcpy(data_.get(), binlog_data, length);
}
Status
Read(int64_t nbytes, void* out);

View File

@ -31,22 +31,17 @@ set(STORAGE_FILES
PayloadReader.cpp
PayloadWriter.cpp
BinlogReader.cpp
FieldDataFactory.cpp
IndexData.cpp
InsertData.cpp
Event.cpp
ThreadPool.cpp
storage_c.cpp)
if(BUILD_DISK_ANN STREQUAL "ON")
set(STORAGE_FILES
${STORAGE_FILES}
LocalChunkManager.cpp
storage_c.cpp
MinioChunkManager.cpp
AliyunSTSClient.cpp
AliyunCredentialsProvider.cpp
MemFileManagerImpl.cpp
LocalChunkManager.cpp
DiskFileManagerImpl.cpp)
endif()
add_library(milvus_storage SHARED ${STORAGE_FILES})

View File

@ -20,6 +20,7 @@
#include <memory>
#include <string>
#include <vector>
#include <map>
namespace milvus::storage {
@ -112,23 +113,23 @@ class ChunkManager {
*/
virtual std::string
GetName() const = 0;
};
/**
* @brief RemoteChunkManager is responsible for read and write Remote file
* that inherited from ChunkManager.
/**
* @brief Get the Root Path
* @return std::string
*/
class RemoteChunkManager : public ChunkManager {
public:
virtual ~RemoteChunkManager() {
}
virtual std::string
GetName() const {
return "RemoteChunkManager";
}
GetRootPath() const = 0;
};
using RemoteChunkManagerPtr = std::unique_ptr<RemoteChunkManager>;
using ChunkManagerPtr = std::shared_ptr<ChunkManager>;
enum ChunkManagerType : int8_t {
None_CM = 0,
Local = 1,
Minio = 2,
};
extern std::map<std::string, ChunkManagerType> ChunkManagerType_Map;
} // namespace milvus::storage

View File

@ -40,7 +40,7 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) {
switch (header.event_type_) {
case EventType::InsertEvent: {
auto event_data_length =
header.event_length_ - header.next_position_;
header.event_length_ - GetEventHeaderSize(header);
auto insert_event_data =
InsertEventData(reader, event_data_length, data_type);
auto insert_data =
@ -52,11 +52,26 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) {
}
case EventType::IndexFileEvent: {
auto event_data_length =
header.event_length_ - header.next_position_;
header.event_length_ - GetEventHeaderSize(header);
auto index_event_data =
IndexEventData(reader, event_data_length, data_type);
auto index_data =
std::make_unique<IndexData>(index_event_data.field_data);
auto field_data = index_event_data.field_data;
// for compatible with golang indexcode.Serialize, which set dataType to String
if (data_type == DataType::STRING) {
AssertInfo(field_data->get_data_type() == DataType::STRING,
"wrong index type in index binlog file");
AssertInfo(
field_data->get_num_rows() == 1,
"wrong length of string num in old index binlog file");
auto new_field_data = CreateFieldData(DataType::INT8);
new_field_data->FillFieldData(
(*static_cast<const std::string*>(field_data->RawValue(0)))
.c_str(),
field_data->Size());
field_data = new_field_data;
}
auto index_data = std::make_unique<IndexData>(field_data);
index_data->SetFieldDataMeta(data_meta);
IndexMeta index_meta;
index_meta.segment_id = data_meta.segment_id;

View File

@ -22,59 +22,28 @@
#include "common/Common.h"
#include "common/Slice.h"
#include "log/Log.h"
#include "config/ConfigKnowhere.h"
#include "storage/DiskFileManagerImpl.h"
#include "storage/LocalChunkManager.h"
#include "storage/MinioChunkManager.h"
#include "storage/LocalChunkManagerSingleton.h"
#include "storage/Exception.h"
#include "storage/FieldData.h"
#include "storage/IndexData.h"
#include "storage/ThreadPool.h"
#include "storage/Util.h"
#include "storage/FieldDataFactory.h"
#define FILEMANAGER_TRY try {
#define FILEMANAGER_CATCH \
} \
catch (LocalChunkManagerException & e) { \
LOG_SEGCORE_ERROR_ << "LocalChunkManagerException:" << e.what(); \
return false; \
} \
catch (MinioException & e) { \
LOG_SEGCORE_ERROR_ << "milvus::storage::MinioException:" << e.what(); \
return false; \
} \
catch (DiskANNFileManagerException & e) { \
LOG_SEGCORE_ERROR_ << "milvus::storage::DiskANNFileManagerException:" \
<< e.what(); \
return false; \
} \
catch (ArrowException & e) { \
LOG_SEGCORE_ERROR_ << "milvus::storage::ArrowException:" << e.what(); \
return false; \
} \
catch (std::exception & e) { \
LOG_SEGCORE_ERROR_ << "Exception:" << e.what(); \
return false;
#define FILEMANAGER_END }
using ReadLock = std::shared_lock<std::shared_mutex>;
using WriteLock = std::lock_guard<std::shared_mutex>;
#include "storage/ThreadPool.h"
namespace milvus::storage {
DiskFileManagerImpl::DiskFileManagerImpl(const FieldDataMeta& field_meta,
DiskFileManagerImpl::DiskFileManagerImpl(const FieldDataMeta& field_mata,
IndexMeta index_meta,
const StorageConfig& storage_config)
: field_meta_(field_meta), index_meta_(std::move(index_meta)) {
remote_root_path_ = storage_config.remote_root_path;
rcm_ = std::make_unique<MinioChunkManager>(storage_config);
ChunkManagerPtr remote_chunk_manager)
: FileManagerImpl(field_mata, index_meta) {
rcm_ = remote_chunk_manager;
}
DiskFileManagerImpl::~DiskFileManagerImpl() {
auto& local_chunk_manager = LocalChunkManager::GetInstance();
local_chunk_manager.RemoveDir(
GetLocalIndexPathPrefixWithBuildID(index_meta_.build_id));
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
local_chunk_manager->RemoveDir(GetIndexPathPrefixWithBuildID(
local_chunk_manager, index_meta_.build_id));
}
bool
@ -82,38 +51,19 @@ DiskFileManagerImpl::LoadFile(const std::string& file) noexcept {
return true;
}
std::pair<std::string, size_t>
EncodeAndUploadIndexSlice(RemoteChunkManager* remote_chunk_manager,
const std::string& file,
int64_t offset,
int64_t batch_size,
const IndexMeta& index_meta,
FieldDataMeta field_meta,
std::string object_key) {
auto& local_chunk_manager = LocalChunkManager::GetInstance();
auto buf = std::unique_ptr<uint8_t[]>(new uint8_t[batch_size]);
local_chunk_manager.Read(file, offset, buf.get(), batch_size);
auto field_data =
milvus::storage::FieldDataFactory::GetInstance().CreateFieldData(
DataType::INT8);
field_data->FillFieldData(buf.get(), batch_size);
auto indexData = std::make_shared<IndexData>(field_data);
indexData->set_index_meta(index_meta);
indexData->SetFieldDataMeta(field_meta);
auto serialized_index_data = indexData->serialize_to_remote_file();
auto serialized_index_size = serialized_index_data.size();
remote_chunk_manager->Write(
object_key, serialized_index_data.data(), serialized_index_size);
return std::make_pair(std::move(object_key), serialized_index_size);
// Build the object-storage key for one slice of an index file:
// "<remote index prefix>/<file_name>_<slice_num>".
std::string
DiskFileManagerImpl::GetRemoteIndexPath(const std::string& file_name,
                                        int64_t slice_num) const {
    std::string path = GetRemoteIndexObjectPrefix();
    path += "/";
    path += file_name;
    path += "_";
    path += std::to_string(slice_num);
    return path;
}
bool
DiskFileManagerImpl::AddFile(const std::string& file) noexcept {
auto& local_chunk_manager = LocalChunkManager::GetInstance();
auto& pool = ThreadPool::GetInstance();
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
FILEMANAGER_TRY
if (!local_chunk_manager.Exist(file)) {
if (!local_chunk_manager->Exist(file)) {
LOG_SEGCORE_ERROR_ << "local file: " << file << " does not exist ";
return false;
}
@ -122,15 +72,15 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept {
local_paths_.emplace_back(file);
auto fileName = GetFileName(file);
auto fileSize = local_chunk_manager.Size(file);
auto fileSize = local_chunk_manager->Size(file);
std::vector<std::string> batch_remote_files;
std::vector<int64_t> remote_file_sizes;
std::vector<int64_t> local_file_offsets;
int slice_num = 0;
auto parallel_degree = uint64_t(DEFAULT_DISK_INDEX_MAX_MEMORY_LIMIT /
(index_file_slice_size << 20));
auto parallel_degree =
uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE);
for (int64_t offset = 0; offset < fileSize; slice_num++) {
if (batch_remote_files.size() >= parallel_degree) {
AddBatchIndexFiles(file,
@ -142,10 +92,9 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept {
local_file_offsets.clear();
}
auto batch_size =
std::min(index_file_slice_size << 20, int64_t(fileSize) - offset);
auto batch_size = std::min(FILE_SLICE_SIZE, int64_t(fileSize) - offset);
batch_remote_files.emplace_back(
GenerateRemoteIndexFile(fileName, slice_num));
GetRemoteIndexPath(fileName, slice_num));
remote_file_sizes.emplace_back(batch_size);
local_file_offsets.emplace_back(offset);
offset += batch_size;
@ -166,35 +115,57 @@ DiskFileManagerImpl::AddBatchIndexFiles(
const std::vector<int64_t>& local_file_offsets,
const std::vector<std::string>& remote_files,
const std::vector<int64_t>& remote_file_sizes) {
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
auto& pool = ThreadPool::GetInstance();
std::vector<std::future<std::pair<std::string, size_t>>> futures;
auto LoadIndexFromDisk = [&](
const std::string& file,
const int64_t offset,
const int64_t data_size) -> std::shared_ptr<uint8_t[]> {
auto buf = std::shared_ptr<uint8_t[]>(new uint8_t[data_size]);
local_chunk_manager->Read(file, offset, buf.get(), data_size);
return buf;
};
std::vector<std::future<std::shared_ptr<uint8_t[]>>> futures;
AssertInfo(local_file_offsets.size() == remote_files.size(),
"inconsistent size of offset slices with file slices");
AssertInfo(remote_files.size() == remote_file_sizes.size(),
"inconsistent size of file slices with size slices");
for (int64_t i = 0; i < remote_files.size(); ++i) {
futures.push_back(pool.Submit(EncodeAndUploadIndexSlice,
rcm_.get(),
futures.push_back(pool.Submit(LoadIndexFromDisk,
local_file_name,
local_file_offsets[i],
remote_file_sizes[i],
index_meta_,
field_meta_,
remote_files[i]));
remote_file_sizes[i]));
}
// hold index data util upload index file done
std::vector<std::shared_ptr<uint8_t[]>> index_datas;
std::vector<const uint8_t*> data_slices;
for (auto& future : futures) {
auto res = future.get();
remote_paths_to_size_[res.first] = res.second;
index_datas.emplace_back(res);
data_slices.emplace_back(res.get());
}
auto res = PutIndexData(rcm_.get(),
data_slices,
remote_file_sizes,
remote_files,
field_meta_,
index_meta_);
for (auto iter = res.begin(); iter != res.end(); ++iter) {
remote_paths_to_size_[iter->first] = iter->second;
}
}
void
DiskFileManagerImpl::CacheIndexToDisk(
const std::vector<std::string>& remote_files) {
auto& local_chunk_manager = LocalChunkManager::GetInstance();
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
std::map<std::string, std::vector<int>> index_slices;
for (auto& file_path : remote_files) {
@ -209,7 +180,7 @@ DiskFileManagerImpl::CacheIndexToDisk(
auto EstimateParallelDegree = [&](const std::string& file) -> uint64_t {
auto fileSize = rcm_->Size(file);
return uint64_t(DEFAULT_DISK_INDEX_MAX_MEMORY_LIMIT / fileSize);
return uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / fileSize);
};
for (auto& slices : index_slices) {
@ -217,7 +188,7 @@ DiskFileManagerImpl::CacheIndexToDisk(
auto local_index_file_name =
GetLocalIndexObjectPrefix() +
prefix.substr(prefix.find_last_of('/') + 1);
local_chunk_manager.CreateFile(local_index_file_name);
local_chunk_manager->CreateFile(local_index_file_name);
int64_t offset = 0;
std::vector<std::string> batch_remote_files;
uint64_t max_parallel_degree = INT_MAX;
@ -245,72 +216,125 @@ DiskFileManagerImpl::CacheIndexToDisk(
}
}
std::unique_ptr<DataCodec>
DownloadAndDecodeRemoteIndexfile(RemoteChunkManager* remote_chunk_manager,
const std::string& file) {
auto fileSize = remote_chunk_manager->Size(file);
auto buf = std::shared_ptr<uint8_t[]>(new uint8_t[fileSize]);
remote_chunk_manager->Read(file, buf.get(), fileSize);
return DeserializeFileData(buf, fileSize);
}
// Download a batch of remote index slices and append them sequentially to
// `local_file_name`, starting at `local_file_init_offfset`.
// Returns the file offset just past the last byte written, so the caller
// can chain batches.
//
// NOTE(review): this body contains interleaved old/new statements
// (duplicate declarations of `local_chunk_manager` and `index_data`,
// both a futures-based and a GetObjectData-based download path) — it
// looks like an unresolved merge/diff artifact; confirm which variant is
// intended before building.
uint64_t
DiskFileManagerImpl::CacheBatchIndexFilesToDisk(
    const std::vector<std::string>& remote_files,
    const std::string& local_file_name,
    uint64_t local_file_init_offfset) {
    auto& local_chunk_manager = LocalChunkManager::GetInstance();
    auto& pool = ThreadPool::GetInstance();
    auto local_chunk_manager =
        LocalChunkManagerSingleton::GetInstance().GetChunkManager();
    // synchronous bulk download of the whole batch
    auto index_datas = GetObjectData(rcm_.get(), remote_files);
    int batch_size = remote_files.size();
    std::vector<std::future<std::unique_ptr<DataCodec>>> futures;
    for (int i = 0; i < batch_size; ++i) {
        futures.push_back(pool.Submit(
            DownloadAndDecodeRemoteIndexfile, rcm_.get(), remote_files[i]));
    }
    AssertInfo(index_datas.size() == batch_size,
               "inconsistent file num and index data num!");
    uint64_t offset = local_file_init_offfset;
    for (int i = 0; i < batch_size; ++i) {
        auto res = futures[i].get();
        auto index_data = res->GetFieldData();
        auto index_data = index_datas[i];
        auto index_size = index_data->Size();
        local_chunk_manager.Write(
            local_file_name,
            offset,
            reinterpret_cast<uint8_t*>(const_cast<void*>(index_data->Data())),
            index_size);
        auto uint8_data =
            reinterpret_cast<uint8_t*>(const_cast<void*>(index_data->Data()));
        local_chunk_manager->Write(
            local_file_name, offset, uint8_data, index_size);
        // advance past the slice just appended
        offset += index_size;
    }
    return offset;
}
// Download a field's raw binlog files and materialize them into one local
// file laid out as: num_rows(uint32) | dim(uint32) | packed vector data.
// Returns the path of the local file.
std::string
DiskFileManagerImpl::CacheRawDataToDisk(std::vector<std::string> remote_files) {
    // Binlog objects are named by log id; replay them in ascending order so
    // rows land on disk in their original order.
    auto by_log_id = [](const std::string& a, const std::string& b) {
        return std::stol(a.substr(a.find_last_of('/') + 1)) <
               std::stol(b.substr(b.find_last_of('/') + 1));
    };
    std::sort(remote_files.begin(), remote_files.end(), by_log_id);

    auto meta = GetFieldDataMeta();
    auto local_chunk_manager =
        LocalChunkManagerSingleton::GetInstance().GetChunkManager();
    auto local_data_path =
        storage::GenFieldRawDataPathPrefix(
            local_chunk_manager, meta.segment_id, meta.field_id) +
        "raw_data";
    local_chunk_manager->CreateFile(local_data_path);

    // get batch raw data from s3 and write batch data to disk file
    // TODO: load and write of different batches at the same time
    // file format
    // num_rows(uint32) | dim(uint32) | index_data ([]uint8_t)
    uint32_t num_rows = 0;
    uint32_t dim = 0;
    // leave room for the header; it is back-filled after the data pass
    int64_t write_offset = sizeof(num_rows) + sizeof(dim);

    std::vector<std::string> pending_files;
    auto flush_pending = [&]() {
        auto field_datas = GetObjectData(rcm_.get(), pending_files);
        int fetched = pending_files.size();
        for (int i = 0; i < fetched; ++i) {
            auto field_data = field_datas[i];
            num_rows += uint32_t(field_data->get_num_rows());
            AssertInfo(dim == 0 || dim == field_data->get_dim(),
                       "inconsistent dim value in multi binlogs!");
            dim = field_data->get_dim();

            // NOTE(review): element size is hard-coded to sizeof(float) —
            // assumes float-vector binlogs; confirm for other vector types.
            auto data_size = field_data->get_num_rows() * dim * sizeof(float);
            local_chunk_manager->Write(local_data_path,
                                       write_offset,
                                       const_cast<void*>(field_data->Data()),
                                       data_size);
            write_offset += data_size;
        }
    };

    auto parallel_degree =
        uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE);
    for (auto& file : remote_files) {
        // flush before exceeding the memory budget, then keep batching
        if (pending_files.size() >= parallel_degree) {
            flush_pending();
            pending_files.clear();
        }
        pending_files.emplace_back(file);
    }
    if (!pending_files.empty()) {
        flush_pending();
    }

    // write num_rows and dim value to file header
    write_offset = 0;
    local_chunk_manager->Write(
        local_data_path, write_offset, &num_rows, sizeof(num_rows));
    write_offset += sizeof(num_rows);
    local_chunk_manager->Write(
        local_data_path, write_offset, &dim, sizeof(dim));

    return local_data_path;
}
// Strip the directory part of a local path, returning just the file name.
std::string
DiskFileManagerImpl::GetFileName(const std::string& localfile) {
    return boost::filesystem::path(localfile).filename().string();
}
// Remote object-store prefix for this segment's index slices:
// <root>/<INDEX_ROOT_PATH>/<build_id>/<index_version>/<partition>/<segment>.
std::string
DiskFileManagerImpl::GetRemoteIndexObjectPrefix() const {
    std::string prefix = remote_root_path_;
    prefix += "/" + std::string(INDEX_ROOT_PATH);
    prefix += "/" + std::to_string(index_meta_.build_id);
    prefix += "/" + std::to_string(index_meta_.index_version);
    prefix += "/" + std::to_string(field_meta_.partition_id);
    prefix += "/" + std::to_string(field_meta_.segment_id);
    return prefix;
}
// Local filesystem prefix under which this build/version's index files are
// cached.
// NOTE(review): everything after the first return is unreachable — the
// two return statements look like old/new diff lines left interleaved;
// confirm which path-generation helper is intended.
std::string
DiskFileManagerImpl::GetLocalIndexObjectPrefix() {
    return GenLocalIndexPathPrefix(index_meta_.build_id,
                                   index_meta_.index_version);
    auto local_chunk_manager =
        LocalChunkManagerSingleton::GetInstance().GetChunkManager();
    return GenIndexPathPrefix(
        local_chunk_manager, index_meta_.build_id, index_meta_.index_version);
}
// Local filesystem prefix under which this segment/field's raw data is
// cached.
// NOTE(review): code after the first return is unreachable — old/new diff
// lines appear interleaved; confirm which overload of
// GenFieldRawDataPathPrefix is intended.
std::string
DiskFileManagerImpl::GetLocalRawDataObjectPrefix() {
    return GenFieldRawDataPathPrefix(field_meta_.segment_id,
                                     field_meta_.field_id);
    auto local_chunk_manager =
        LocalChunkManagerSingleton::GetInstance().GetChunkManager();
    return GenFieldRawDataPathPrefix(
        local_chunk_manager, field_meta_.segment_id, field_meta_.field_id);
}
bool
@ -322,9 +346,10 @@ DiskFileManagerImpl::RemoveFile(const std::string& file) noexcept {
std::optional<bool>
DiskFileManagerImpl::IsExisted(const std::string& file) noexcept {
bool isExist = false;
auto& local_chunk_manager = LocalChunkManager::GetInstance();
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
try {
isExist = local_chunk_manager.Exist(file);
isExist = local_chunk_manager->Exist(file);
} catch (LocalChunkManagerException& e) {
// LOG_SEGCORE_DEBUG_ << "LocalChunkManagerException:"
// << e.what();

View File

@ -24,8 +24,7 @@
#include "storage/IndexData.h"
#include "storage/FileManager.h"
#include "storage/LocalChunkManager.h"
#include "storage/MinioChunkManager.h"
#include "storage/ChunkManager.h"
#include "common/Consts.h"
@ -33,9 +32,9 @@ namespace milvus::storage {
class DiskFileManagerImpl : public FileManagerImpl {
public:
explicit DiskFileManagerImpl(const FieldDataMeta& field_meta,
explicit DiskFileManagerImpl(const FieldDataMeta& field_mata,
IndexMeta index_meta,
const StorageConfig& storage_config);
ChunkManagerPtr remote_chunk_manager);
virtual ~DiskFileManagerImpl();
@ -57,9 +56,6 @@ class DiskFileManagerImpl : public FileManagerImpl {
return "DiskFileManagerImpl";
}
std::string
GetRemoteIndexObjectPrefix() const;
std::string
GetLocalIndexObjectPrefix();
@ -76,13 +72,6 @@ class DiskFileManagerImpl : public FileManagerImpl {
return local_paths_;
}
std::string
GenerateRemoteIndexFile(const std::string& file_name,
int64_t slice_num) const {
return GetRemoteIndexObjectPrefix() + "/" + file_name + "_" +
std::to_string(slice_num);
}
void
CacheIndexToDisk(const std::vector<std::string>& remote_files);
@ -97,15 +86,8 @@ class DiskFileManagerImpl : public FileManagerImpl {
const std::vector<std::string>& remote_files,
const std::vector<int64_t>& remote_file_sizes);
FieldDataMeta
GetFileDataMeta() const {
return field_meta_;
}
IndexMeta
GetIndexMeta() const {
return index_meta_;
}
std::string
CacheRawDataToDisk(std::vector<std::string> remote_files);
private:
int64_t
@ -116,21 +98,15 @@ class DiskFileManagerImpl : public FileManagerImpl {
std::string
GetFileName(const std::string& localfile);
std::string
GetRemoteIndexPath(const std::string& file_name, int64_t slice_num) const;
private:
// collection meta
FieldDataMeta field_meta_;
// index meta
IndexMeta index_meta_;
// local file path (abs path)
std::vector<std::string> local_paths_;
// remote file path
std::map<std::string, int64_t> remote_paths_to_size_;
RemoteChunkManagerPtr rcm_;
std::string remote_root_path_;
};
using DiskANNFileManagerImplPtr = std::shared_ptr<DiskFileManagerImpl>;

View File

@ -15,10 +15,8 @@
// limitations under the License.
#include "storage/Event.h"
#include "storage/Util.h"
#include "storage/PayloadReader.h"
#include "storage/PayloadWriter.h"
#include "storage/FieldDataFactory.h"
#include "exceptions/EasyAssert.h"
#include "utils/Json.h"
#include "common/Consts.h"
@ -219,19 +217,42 @@ BaseEventData::Serialize() {
} else {
payload_writer = std::make_unique<PayloadWriter>(data_type);
}
if (datatype_is_string(data_type)) {
for (size_t offset = 0; offset < field_data->get_num_rows(); ++offset) {
payload_writer->add_one_string_payload(
reinterpret_cast<const char*>(field_data->RawValue(offset)),
field_data->get_element_size(offset));
switch (data_type) {
case DataType::VARCHAR:
case DataType::STRING: {
for (size_t offset = 0; offset < field_data->get_num_rows();
++offset) {
auto str = static_cast<const std::string*>(
field_data->RawValue(offset));
payload_writer->add_one_string_payload(str->c_str(),
str->size());
}
} else {
auto payload = Payload{data_type,
break;
}
case DataType::ARRAY:
case DataType::JSON: {
for (size_t offset = 0; offset < field_data->get_num_rows();
++offset) {
auto string_view =
static_cast<const Json*>(field_data->RawValue(offset))
->data();
payload_writer->add_one_binary_payload(
reinterpret_cast<const uint8_t*>(
std::string(string_view).c_str()),
string_view.size());
}
break;
}
default: {
auto payload =
Payload{data_type,
static_cast<const uint8_t*>(field_data->Data()),
field_data->get_num_rows(),
field_data->get_dim()};
payload_writer->add_payload(payload);
}
}
payload_writer->finish();
auto payload_buffer = payload_writer->get_payload_buffer();
auto len =
@ -250,7 +271,7 @@ BaseEventData::Serialize() {
// Parse a BaseEvent (header + typed payload) from a binlog reader.
// The payload length is derived from the header fields.
BaseEvent::BaseEvent(BinlogReaderPtr reader, DataType data_type) {
    event_header = EventHeader(reader);
    auto event_data_length =
        event_header.event_length_ - event_header.next_position_;
        // NOTE(review): the next line is a stray no-effect expression
        // statement — looks like an interleaved diff line; confirm whether
        // the length should use next_position_ or GetEventHeaderSize().
        event_header.event_length_ - GetEventHeaderSize(event_header);
    event_data = BaseEventData(reader, event_data_length, data_type);
}
@ -259,8 +280,8 @@ BaseEvent::Serialize() {
auto data = event_data.Serialize();
int data_size = data.size();
event_header.next_position_ = GetEventHeaderSize(event_header);
event_header.event_length_ = event_header.next_position_ + data_size;
event_header.event_length_ = GetEventHeaderSize(event_header) + data_size;
event_header.next_position_ = event_header.event_length_ + event_offset;
auto header = event_header.Serialize();
int header_size = header.size();
@ -281,12 +302,11 @@ DescriptorEvent::DescriptorEvent(BinlogReaderPtr reader) {
std::vector<uint8_t>
DescriptorEvent::Serialize() {
event_header.event_type_ = EventType::DescriptorEvent;
auto data = event_data.Serialize();
int data_size = data.size();
event_header.event_type_ = EventType::DescriptorEvent;
event_header.next_position_ = GetEventHeaderSize(event_header);
event_header.event_length_ = event_header.next_position_ + data_size;
event_header.event_length_ = GetEventHeaderSize(event_header) + data_size;
auto header = event_header.Serialize();
int header_size = header.size();
@ -298,6 +318,8 @@ DescriptorEvent::Serialize() {
memcpy(res.data() + offset, header.data(), header_size);
offset += header_size;
memcpy(res.data() + offset, data.data(), data_size);
offset += data_size;
event_header.next_position_ = offset;
return res;
}

View File

@ -99,6 +99,7 @@ struct DescriptorEvent {
struct BaseEvent {
EventHeader event_header;
BaseEventData event_data;
int64_t event_offset;
BaseEvent() = default;
explicit BaseEvent(BinlogReaderPtr reader, DataType data_type);

View File

@ -15,6 +15,7 @@
// limitations under the License.
#include "storage/FieldData.h"
#include "common/Json.h"
namespace milvus::storage {
@ -22,14 +23,28 @@ template <typename Type, bool is_scalar>
void
FieldDataImpl<Type, is_scalar>::FillFieldData(const void* source,
ssize_t element_count) {
AssertInfo(element_count % dim_ == 0, "invalid element count");
if (element_count == 0) {
return;
}
AssertInfo(field_data_.size() == 0, "no empty field vector");
field_data_.resize(element_count);
std::copy_n(
static_cast<const Type*>(source), element_count, field_data_.data());
std::lock_guard lck(tell_mutex_);
if (tell_ + element_count > get_num_rows()) {
resize_field_data(tell_ + element_count);
}
std::copy_n(static_cast<const Type*>(source),
element_count * dim_,
field_data_.data() + tell_ * dim_);
tell_ += element_count;
}
// Extract a raw-pointer/length view from a typed arrow array.
//
// @tparam ArrayType     concrete arrow array class (e.g. arrow::Int8Array)
// @tparam ArrayDataType expected arrow type id, validated against the input
// @param array          source array; must carry type ArrayDataType
// @return pair of (pointer to the contiguous values buffer, element count);
//         the pointer borrows from `array` and is valid only while it lives
template <typename ArrayType, arrow::Type::type ArrayDataType>
std::pair<const void*, int64_t>
GetDataInfoFromArray(const std::shared_ptr<arrow::Array>& array) {
    AssertInfo(array->type()->id() == ArrayDataType, "inconsistent data type");
    auto typed_array = std::dynamic_pointer_cast<ArrayType>(array);
    // guard the downcast: raw_values() on a null pointer would crash
    AssertInfo(typed_array != nullptr, "unexpected arrow array class");
    auto element_count = array->length();

    return std::make_pair(typed_array->raw_values(), element_count);
}
template <typename Type, bool is_scalar>
@ -37,7 +52,7 @@ void
FieldDataImpl<Type, is_scalar>::FillFieldData(
const std::shared_ptr<arrow::Array> array) {
AssertInfo(array != nullptr, "null arrow array");
auto element_count = array->length() * dim_;
auto element_count = array->length();
if (element_count == 0) {
return;
}
@ -54,46 +69,40 @@ FieldDataImpl<Type, is_scalar>::FillFieldData(
return FillFieldData(values.data(), element_count);
}
case DataType::INT8: {
AssertInfo(array->type()->id() == arrow::Type::type::INT8,
"inconsistent data type");
auto int8_array =
std::dynamic_pointer_cast<arrow::Int8Array>(array);
return FillFieldData(int8_array->raw_values(), element_count);
auto array_info =
GetDataInfoFromArray<arrow::Int8Array, arrow::Type::type::INT8>(
array);
return FillFieldData(array_info.first, array_info.second);
}
case DataType::INT16: {
AssertInfo(array->type()->id() == arrow::Type::type::INT16,
"inconsistent data type");
auto int16_array =
std::dynamic_pointer_cast<arrow::Int16Array>(array);
return FillFieldData(int16_array->raw_values(), element_count);
auto array_info =
GetDataInfoFromArray<arrow::Int16Array,
arrow::Type::type::INT16>(array);
return FillFieldData(array_info.first, array_info.second);
}
case DataType::INT32: {
AssertInfo(array->type()->id() == arrow::Type::type::INT32,
"inconsistent data type");
auto int32_array =
std::dynamic_pointer_cast<arrow::Int32Array>(array);
return FillFieldData(int32_array->raw_values(), element_count);
auto array_info =
GetDataInfoFromArray<arrow::Int32Array,
arrow::Type::type::INT32>(array);
return FillFieldData(array_info.first, array_info.second);
}
case DataType::INT64: {
AssertInfo(array->type()->id() == arrow::Type::type::INT64,
"inconsistent data type");
auto int64_array =
std::dynamic_pointer_cast<arrow::Int64Array>(array);
return FillFieldData(int64_array->raw_values(), element_count);
auto array_info =
GetDataInfoFromArray<arrow::Int64Array,
arrow::Type::type::INT64>(array);
return FillFieldData(array_info.first, array_info.second);
}
case DataType::FLOAT: {
AssertInfo(array->type()->id() == arrow::Type::type::FLOAT,
"inconsistent data type");
auto float_array =
std::dynamic_pointer_cast<arrow::FloatArray>(array);
return FillFieldData(float_array->raw_values(), element_count);
auto array_info =
GetDataInfoFromArray<arrow::FloatArray,
arrow::Type::type::FLOAT>(array);
return FillFieldData(array_info.first, array_info.second);
}
case DataType::DOUBLE: {
AssertInfo(array->type()->id() == arrow::Type::type::DOUBLE,
"inconsistent data type");
auto double_array =
std::dynamic_pointer_cast<arrow::DoubleArray>(array);
return FillFieldData(double_array->raw_values(), element_count);
auto array_info =
GetDataInfoFromArray<arrow::DoubleArray,
arrow::Type::type::DOUBLE>(array);
return FillFieldData(array_info.first, array_info.second);
}
case DataType::STRING:
case DataType::VARCHAR: {
@ -107,21 +116,25 @@ FieldDataImpl<Type, is_scalar>::FillFieldData(
}
return FillFieldData(values.data(), element_count);
}
case DataType::VECTOR_FLOAT: {
AssertInfo(
array->type()->id() == arrow::Type::type::FIXED_SIZE_BINARY,
case DataType::JSON: {
AssertInfo(array->type()->id() == arrow::Type::type::BINARY,
"inconsistent data type");
auto vector_array =
std::dynamic_pointer_cast<arrow::FixedSizeBinaryArray>(array);
return FillFieldData(vector_array->raw_values(), element_count);
auto json_array =
std::dynamic_pointer_cast<arrow::BinaryArray>(array);
std::vector<Json> values(element_count);
for (size_t index = 0; index < element_count; ++index) {
values[index] =
Json(simdjson::padded_string(json_array->GetString(index)));
}
return FillFieldData(values.data(), element_count);
}
case DataType::VECTOR_FLOAT:
case DataType::VECTOR_BINARY: {
AssertInfo(
array->type()->id() == arrow::Type::type::FIXED_SIZE_BINARY,
"inconsistent data type");
auto vector_array =
std::dynamic_pointer_cast<arrow::FixedSizeBinaryArray>(array);
return FillFieldData(vector_array->raw_values(), element_count);
auto array_info =
GetDataInfoFromArray<arrow::FixedSizeBinaryArray,
arrow::Type::type::FIXED_SIZE_BINARY>(
array);
return FillFieldData(array_info.first, array_info.second);
}
default: {
throw NotSupportedDataTypeException(GetName() + "::FillFieldData" +
@ -141,6 +154,7 @@ template class FieldDataImpl<int64_t, true>;
template class FieldDataImpl<float, true>;
template class FieldDataImpl<double, true>;
template class FieldDataImpl<std::string, true>;
template class FieldDataImpl<Json, true>;
// vector data
template class FieldDataImpl<int8_t, false>;

View File

@ -27,8 +27,9 @@ template <typename Type>
class FieldData : public FieldDataImpl<Type, true> {
public:
static_assert(IsScalar<Type> || std::is_same_v<Type, PkType>);
explicit FieldData(DataType data_type)
: FieldDataImpl<Type, true>::FieldDataImpl(1, data_type) {
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
: FieldDataImpl<Type, true>::FieldDataImpl(
1, data_type, buffered_num_rows) {
}
};
@ -36,23 +37,39 @@ template <>
class FieldData<std::string> : public FieldDataStringImpl {
public:
static_assert(IsScalar<std::string> || std::is_same_v<std::string, PkType>);
explicit FieldData(DataType data_type) : FieldDataStringImpl(data_type) {
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
: FieldDataStringImpl(data_type, buffered_num_rows) {
}
};
// FieldData specialization for JSON scalars, backed by FieldDataJsonImpl.
template <>
class FieldData<Json> : public FieldDataJsonImpl {
 public:
    // NOTE(review): this static_assert mentions std::string, not Json —
    // it appears copy-pasted from the std::string specialization and is
    // vacuously true here; confirm the intended constraint.
    static_assert(IsScalar<std::string> || std::is_same_v<std::string, PkType>);
    // buffered_num_rows pre-sizes the row buffer (0 = grow on demand)
    explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
        : FieldDataJsonImpl(data_type, buffered_num_rows) {
    }
};
// FieldData specialization for float vectors of a fixed dimension.
// NOTE(review): two constructor definitions are interleaved here (the
// two-argument and the three-argument form share one body) — this is not
// valid C++ and looks like old/new diff lines merged together; keep only
// the buffered_num_rows variant.
template <>
class FieldData<FloatVector> : public FieldDataImpl<float, false> {
 public:
    explicit FieldData(int64_t dim, DataType data_type)
        : FieldDataImpl<float, false>::FieldDataImpl(dim, data_type) {
    explicit FieldData(int64_t dim,
                       DataType data_type,
                       int64_t buffered_num_rows = 0)
        : FieldDataImpl<float, false>::FieldDataImpl(
              dim, data_type, buffered_num_rows) {
    }
};
template <>
class FieldData<BinaryVector> : public FieldDataImpl<uint8_t, false> {
public:
explicit FieldData(int64_t dim, DataType data_type)
: binary_dim_(dim), FieldDataImpl(dim / 8, data_type) {
explicit FieldData(int64_t dim,
DataType data_type,
int64_t buffered_num_rows = 0)
: binary_dim_(dim),
FieldDataImpl(dim / 8, data_type, buffered_num_rows) {
Assert(dim % 8 == 0);
}
@ -66,4 +83,5 @@ class FieldData<BinaryVector> : public FieldDataImpl<uint8_t, false> {
};
using FieldDataPtr = std::shared_ptr<FieldDataBase>;
} // namespace milvus::storage

View File

@ -1,53 +0,0 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "storage/FieldDataFactory.h"
#include "storage/Exception.h"
namespace milvus::storage {
// Instantiate the concrete FieldData container for `type`.
// `dim` is only meaningful for vector types; scalar containers ignore it.
// Throws NotSupportedDataTypeException when no container exists for `type`.
FieldDataPtr
FieldDataFactory::CreateFieldData(const DataType& type, const int64_t dim) {
    if (type == DataType::BOOL) {
        return std::make_shared<FieldData<bool>>(type);
    }
    if (type == DataType::INT8) {
        return std::make_shared<FieldData<int8_t>>(type);
    }
    if (type == DataType::INT16) {
        return std::make_shared<FieldData<int16_t>>(type);
    }
    if (type == DataType::INT32) {
        return std::make_shared<FieldData<int32_t>>(type);
    }
    if (type == DataType::INT64) {
        return std::make_shared<FieldData<int64_t>>(type);
    }
    if (type == DataType::FLOAT) {
        return std::make_shared<FieldData<float>>(type);
    }
    if (type == DataType::DOUBLE) {
        return std::make_shared<FieldData<double>>(type);
    }
    if (type == DataType::STRING || type == DataType::VARCHAR) {
        return std::make_shared<FieldData<std::string>>(type);
    }
    if (type == DataType::VECTOR_FLOAT) {
        return std::make_shared<FieldData<FloatVector>>(dim, type);
    }
    if (type == DataType::VECTOR_BINARY) {
        return std::make_shared<FieldData<BinaryVector>>(dim, type);
    }
    throw NotSupportedDataTypeException(GetName() + "::CreateFieldData" +
                                        " not support data type " +
                                        datatype_name(type));
}
} // namespace milvus::storage

View File

@ -20,6 +20,8 @@
#include <memory>
#include <vector>
#include <string>
#include <mutex>
#include <shared_mutex>
#include "arrow/api.h"
#include "common/FieldMeta.h"
@ -53,16 +55,19 @@ class FieldDataBase {
virtual int64_t
Size() const = 0;
virtual int64_t
Size(ssize_t index) const = 0;
virtual bool
IsFull() const = 0;
public:
virtual int
virtual int64_t
get_num_rows() const = 0;
virtual int64_t
get_dim() const = 0;
virtual int64_t
get_element_size(ssize_t offset) const = 0;
DataType
get_data_type() const {
return data_type_;
@ -86,8 +91,14 @@ class FieldDataImpl : public FieldDataBase {
operator=(const FieldDataImpl&) = delete;
public:
explicit FieldDataImpl(ssize_t dim, DataType data_type)
: FieldDataBase(data_type), dim_(is_scalar ? 1 : dim) {
explicit FieldDataImpl(ssize_t dim,
DataType data_type,
int64_t buffered_num_rows = 0)
: FieldDataBase(data_type),
dim_(is_scalar ? 1 : dim),
num_rows_(buffered_num_rows),
tell_(0) {
field_data_.resize(num_rows_ * dim_);
}
void
@ -108,20 +119,54 @@ class FieldDataImpl : public FieldDataBase {
    // Borrowed pointer to the element at `offset`. Valid only while this
    // object is alive and not resized; `offset` must address a row that
    // has already been filled (offset < get_tell()).
    const void*
    RawValue(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < get_tell(),
                   "subscript position don't has valid value");
        return &field_data_[offset];
    }
    // Total byte size of the stored data.
    // NOTE(review): the second return is unreachable — two diff lines
    // appear interleaved; confirm whether size should be based on
    // field_data_.size() (capacity) or get_tell() * dim_ (filled rows).
    int64_t
    Size() const override {
        return sizeof(Type) * field_data_.size();
        return sizeof(Type) * get_tell() * dim_;
    }
    // Byte size of the single row at `offset` (dim_ elements of Type).
    // The row must be in range and already filled.
    int64_t
    Size(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < get_tell(),
                   "subscript position don't has valid value");
        return sizeof(Type) * dim_;
    }
bool
IsFull() const override {
auto buffered_num_rows = get_num_rows();
auto filled_num_rows = get_tell();
return buffered_num_rows == filled_num_rows;
}
public:
    // Number of buffered rows.
    // NOTE(review): the duplicated return types (`int` / `int64_t`) and the
    // two bodies (derived from field_data_.size() vs. the locked num_rows_
    // counter) are interleaved old/new diff lines — not valid C++ as
    // written; confirm the locked num_rows_ variant is the intended one.
    int
    int64_t
    get_num_rows() const override {
        auto len = field_data_.size();
        AssertInfo(len % dim_ == 0, "field data size not aligned");
        return len / dim_;
        std::shared_lock lck(num_rows_mutex_);
        return num_rows_;
    }
void
resize_field_data(int64_t num_rows) {
std::lock_guard lck(num_rows_mutex_);
if (num_rows > num_rows_) {
num_rows_ = num_rows;
field_data_.resize(num_rows_ * dim_);
}
}
    // Current write cursor: number of rows filled so far.
    // Takes a shared (reader) lock on tell_mutex_.
    int64_t
    get_tell() const {
        std::shared_lock lck(tell_mutex_);
        return tell_;
    }
int64_t
@ -129,13 +174,12 @@ class FieldDataImpl : public FieldDataBase {
return dim_;
}
    // Byte size of one logical row (dim_ elements of Type); the offset is
    // irrelevant for fixed-width element types.
    int64_t
    get_element_size(ssize_t offset) const override {
        return sizeof(Type) * dim_;
    }
protected:
Chunk field_data_;
int64_t num_rows_;
mutable std::shared_mutex num_rows_mutex_;
int64_t tell_;
mutable std::shared_mutex tell_mutex_;
private:
const ssize_t dim_;
@ -143,30 +187,54 @@ class FieldDataImpl : public FieldDataBase {
class FieldDataStringImpl : public FieldDataImpl<std::string, true> {
public:
explicit FieldDataStringImpl(DataType data_type)
: FieldDataImpl<std::string, true>(1, data_type) {
}
const void*
RawValue(ssize_t offset) const {
return field_data_[offset].c_str();
explicit FieldDataStringImpl(DataType data_type, int64_t total_num_rows = 0)
: FieldDataImpl<std::string, true>(1, data_type, total_num_rows) {
}
int64_t
Size() const {
int64_t data_size = 0;
for (size_t offset = 0; offset < field_data_.size(); ++offset) {
data_size += get_element_size(offset);
for (size_t offset = 0; offset < get_tell(); ++offset) {
data_size += field_data_[offset].size();
}
return data_size;
}
public:
int64_t
get_element_size(ssize_t offset) const {
Size(ssize_t offset) const {
AssertInfo(offset < get_num_rows(),
"field data subscript out of range");
AssertInfo(offset < get_tell(),
"subscript position don't has valid value");
return field_data_[offset].size();
}
};
// FieldDataImpl specialization for JSON payloads: each row stores a Json
// object, so byte sizes come from the serialized json text rather than
// sizeof(Type).
class FieldDataJsonImpl : public FieldDataImpl<Json, true> {
 public:
    // total_num_rows pre-sizes the row buffer (0 = grow on demand)
    explicit FieldDataJsonImpl(DataType data_type, int64_t total_num_rows = 0)
        : FieldDataImpl<Json, true>(1, data_type, total_num_rows) {
    }

    // Total byte size of all rows filled so far.
    int64_t
    Size() const override {
        int64_t data_size = 0;
        // int64_t counter: matches get_tell() and avoids a signed/unsigned
        // comparison in the loop condition
        for (int64_t offset = 0; offset < get_tell(); ++offset) {
            data_size += field_data_[offset].data().size();
        }
        return data_size;
    }

    // Byte size of the row at `offset`; the row must be in range and
    // already filled.
    int64_t
    Size(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < get_tell(),
                   "subscript position don't has valid value");
        return field_data_[offset].data().size();
    }
};
} // namespace milvus::storage

View File

@ -21,10 +21,45 @@
#include <memory>
#include "knowhere/file_manager.h"
#include "common/Consts.h"
#include "storage/ChunkManager.h"
#include "storage/Types.h"
#include "log/Log.h"
namespace milvus::storage {
#define FILEMANAGER_TRY try {
#define FILEMANAGER_CATCH \
} \
catch (LocalChunkManagerException & e) { \
LOG_SEGCORE_ERROR_ << "LocalChunkManagerException:" << e.what(); \
return false; \
} \
catch (MinioException & e) { \
LOG_SEGCORE_ERROR_ << "milvus::storage::MinioException:" << e.what(); \
return false; \
} \
catch (DiskANNFileManagerException & e) { \
LOG_SEGCORE_ERROR_ << "milvus::storage::DiskANNFileManagerException:" \
<< e.what(); \
return false; \
} \
catch (ArrowException & e) { \
LOG_SEGCORE_ERROR_ << "milvus::storage::ArrowException:" << e.what(); \
return false; \
} \
catch (std::exception & e) { \
LOG_SEGCORE_ERROR_ << "Exception:" << e.what(); \
return false;
#define FILEMANAGER_END }
class FileManagerImpl : public knowhere::FileManager {
public:
explicit FileManagerImpl(const FieldDataMeta& field_mata,
IndexMeta index_meta)
: field_meta_(field_mata), index_meta_(std::move(index_meta)) {
}
public:
/**
* @brief Load a file to the local disk, so we can use stl lib to operate it.
@ -61,6 +96,37 @@ class FileManagerImpl : public knowhere::FileManager {
*/
virtual bool
RemoveFile(const std::string& filename) noexcept = 0;
public:
virtual std::string
GetName() const = 0;
    // Collection/partition/segment/field identity this manager serves
    // (returned by value).
    virtual FieldDataMeta
    GetFieldDataMeta() const {
        return field_meta_;
    }
    // Index build/version identity this manager serves (returned by value).
    virtual IndexMeta
    GetIndexMeta() const {
        return index_meta_;
    }
virtual std::string
GetRemoteIndexObjectPrefix() const {
return rcm_->GetRootPath() + "/" + std::string(INDEX_ROOT_PATH) + "/" +
std::to_string(index_meta_.build_id) + "/" +
std::to_string(index_meta_.index_version) + "/" +
std::to_string(field_meta_.partition_id) + "/" +
std::to_string(field_meta_.segment_id);
}
protected:
// collection meta
FieldDataMeta field_meta_;
// index meta
IndexMeta index_meta_;
ChunkManagerPtr rcm_;
};
using FileManagerImplPtr = std::shared_ptr<FileManagerImpl>;

View File

@ -51,20 +51,6 @@ IndexData::serialize_to_remote_file() {
AssertInfo(index_meta_.has_value(), "index meta not exist");
AssertInfo(field_data_ != nullptr, "empty field data");
// create index event
IndexEvent index_event;
auto& index_event_data = index_event.event_data;
index_event_data.start_timestamp = time_range_.first;
index_event_data.end_timestamp = time_range_.second;
index_event_data.field_data = field_data_;
auto& index_event_header = index_event.event_header;
index_event_header.event_type_ = EventType::IndexFileEvent;
// TODO :: set timestamps
index_event_header.timestamp_ = 0;
// serialize insert event
auto index_event_bytes = index_event.Serialize();
DataType data_type = field_data_->get_data_type();
// create descriptor event
@ -96,6 +82,22 @@ IndexData::serialize_to_remote_file() {
// serialize descriptor event data
auto des_event_bytes = descriptor_event.Serialize();
// create index event
IndexEvent index_event;
index_event.event_offset = des_event_bytes.size();
auto& index_event_data = index_event.event_data;
index_event_data.start_timestamp = time_range_.first;
index_event_data.end_timestamp = time_range_.second;
index_event_data.field_data = field_data_;
auto& index_event_header = index_event.event_header;
index_event_header.event_type_ = EventType::IndexFileEvent;
// TODO :: set timestamps
index_event_header.timestamp_ = 0;
// serialize insert event
auto index_event_bytes = index_event.Serialize();
des_event_bytes.insert(des_event_bytes.end(),
index_event_bytes.begin(),
index_event_bytes.end());

View File

@ -47,20 +47,6 @@ InsertData::serialize_to_remote_file() {
AssertInfo(field_data_meta_.has_value(), "field data not exist");
AssertInfo(field_data_ != nullptr, "empty field data");
// create insert event
InsertEvent insert_event;
auto& insert_event_data = insert_event.event_data;
insert_event_data.start_timestamp = time_range_.first;
insert_event_data.end_timestamp = time_range_.second;
insert_event_data.field_data = field_data_;
auto& insert_event_header = insert_event.event_header;
// TODO :: set timestamps
insert_event_header.timestamp_ = 0;
insert_event_header.event_type_ = EventType::InsertEvent;
// serialize insert event
auto insert_event_bytes = insert_event.Serialize();
DataType data_type = field_data_->get_data_type();
// create descriptor event
@ -90,6 +76,22 @@ InsertData::serialize_to_remote_file() {
// serialize descriptor event data
auto des_event_bytes = descriptor_event.Serialize();
// create insert event
InsertEvent insert_event;
insert_event.event_offset = des_event_bytes.size();
auto& insert_event_data = insert_event.event_data;
insert_event_data.start_timestamp = time_range_.first;
insert_event_data.end_timestamp = time_range_.second;
insert_event_data.field_data = field_data_;
auto& insert_event_header = insert_event.event_header;
// TODO :: set timestamps
insert_event_header.timestamp_ = 0;
insert_event_header.event_type_ = EventType::InsertEvent;
// serialize insert event
auto insert_event_bytes = insert_event.Serialize();
des_event_bytes.insert(des_event_bytes.end(),
insert_event_bytes.begin(),
insert_event_bytes.end());

View File

@ -103,6 +103,11 @@ void
LocalChunkManager::Write(const std::string& absPathStr,
void* buf,
uint64_t size) {
boost::filesystem::path absPath(absPathStr);
// if filepath not exists, will create this file automatically
// ensure upper directory exist firstly
boost::filesystem::create_directories(absPath.parent_path());
std::ofstream outfile;
outfile.open(absPathStr.data(), std::ios_base::binary);
if (outfile.fail()) {
@ -124,6 +129,11 @@ LocalChunkManager::Write(const std::string& absPathStr,
uint64_t offset,
void* buf,
uint64_t size) {
boost::filesystem::path absPath(absPathStr);
// if filepath not exists, will create this file automatically
// ensure upper directory exist firstly
boost::filesystem::create_directories(absPath.parent_path());
std::ofstream outfile;
outfile.open(
absPathStr.data(),

View File

@ -21,7 +21,6 @@
#include <vector>
#include "storage/ChunkManager.h"
#include "config/ConfigChunkManager.h"
namespace milvus::storage {
@ -30,7 +29,7 @@ namespace milvus::storage {
* that inherited from ChunkManager
*/
class LocalChunkManager : public ChunkManager {
private:
public:
explicit LocalChunkManager(const std::string& path) : path_prefix_(path) {
}
@ -39,14 +38,6 @@ class LocalChunkManager : public ChunkManager {
operator=(const LocalChunkManager&);
public:
static LocalChunkManager&
GetInstance() {
// thread-safe enough after c++ 11
static LocalChunkManager instance(
ChunkMangerConfig::GetLocalRootPath());
return instance;
}
virtual ~LocalChunkManager() {
}
@ -110,16 +101,11 @@ class LocalChunkManager : public ChunkManager {
return "LocalChunkManager";
}
inline std::string
GetPathPrefix() {
virtual std::string
GetRootPath() const {
return path_prefix_;
}
inline void
SetPathPrefix(const std::string& path) {
path_prefix_ = path;
}
bool
CreateFile(const std::string& filepath);

View File

@ -0,0 +1,67 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <shared_mutex>
#include "storage/ChunkManager.h"
#include "storage/LocalChunkManager.h"
namespace milvus::storage {
// Process-wide holder for the LocalChunkManager instance.
// Init()/Release() write the pointer under a unique lock; GetChunkManager()
// now takes a shared lock so concurrent readers cannot race with them
// (the original read the pointer without any synchronization).
class LocalChunkManagerSingleton {
 private:
    LocalChunkManagerSingleton() {
    }

 public:
    LocalChunkManagerSingleton(const LocalChunkManagerSingleton&) = delete;
    LocalChunkManagerSingleton&
    operator=(const LocalChunkManagerSingleton&) = delete;

    static LocalChunkManagerSingleton&
    GetInstance() {
        // Meyers singleton: initialization is thread-safe since C++11.
        static LocalChunkManagerSingleton instance;
        return instance;
    }

    // Creates the local chunk manager rooted at `root_path`.
    // Only the first call takes effect; later calls are no-ops.
    void
    Init(std::string root_path) {
        std::unique_lock lck(mutex_);
        if (lcm_ == nullptr) {
            lcm_ = std::make_shared<LocalChunkManager>(root_path);
        }
    }

    // Drops the instance; a later Init() may create a new one.
    void
    Release() {
        std::unique_lock lck(mutex_);
        lcm_ = nullptr;
    }

    // Returns the shared instance (may be nullptr before Init()).
    LocalChunkManagerSPtr
    GetChunkManager() {
        // Shared lock: Init()/Release() mutate lcm_ under a unique lock,
        // so an unguarded read here would be a data race.
        std::shared_lock lck(mutex_);
        return lcm_;
    }

 private:
    mutable std::shared_mutex mutex_;
    LocalChunkManagerSPtr lcm_ = nullptr;
};
} // namespace milvus::storage

View File

@ -0,0 +1,169 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "storage/MemFileManagerImpl.h"
#include "storage/Util.h"
#include "common/Common.h"
namespace milvus::storage {
// Constructs a file manager that keeps index binaries in memory; all
// uploads/downloads go through `remote_chunk_manager` (stored in the
// base-class member rcm_, hence assignment rather than an init-list entry).
MemFileManagerImpl::MemFileManagerImpl(const FieldDataMeta& field_mata,
                                       IndexMeta index_meta,
                                       ChunkManagerPtr remote_chunk_manager)
    : FileManagerImpl(field_mata, index_meta) {
    rcm_ = remote_chunk_manager;
}
// Single-file uploads are not supported by the in-memory manager; callers
// should use AddFile(const BinarySet&) instead. Always returns false.
bool
MemFileManagerImpl::AddFile(const std::string& filename /* unused */) noexcept {
    return false;
}
// Uploads every binary in `binary_set` to remote storage under the index
// object prefix, batching slices so that roughly
// DEFAULT_FIELD_MAX_MEMORY_LIMIT bytes are staged per upload round.
// Each uploaded path and its serialized size are recorded in
// remote_paths_to_size_. Always returns true.
bool
MemFileManagerImpl::AddFile(const BinarySet& binary_set) noexcept {
    std::vector<const uint8_t*> data_slices;
    std::vector<int64_t> slice_sizes;
    std::vector<std::string> slice_names;

    // Uploads the currently staged batch via PutIndexData and records the
    // returned remote-path -> size pairs.
    auto AddBatchIndexFiles = [&]() {
        auto res = PutIndexData(rcm_.get(),
                                data_slices,
                                slice_sizes,
                                slice_names,
                                field_meta_,
                                index_meta_);
        for (auto& [file, size] : res) {
            remote_paths_to_size_[file] = size;
        }
    };

    auto remotePrefix = GetRemoteIndexObjectPrefix();
    int64_t batch_size = 0;
    for (auto iter = binary_set.binary_map_.begin();
         iter != binary_set.binary_map_.end();
         iter++) {
        // Flush before appending the current slice, so a batch can exceed
        // the limit by at most one slice.
        if (batch_size >= DEFAULT_FIELD_MAX_MEMORY_LIMIT) {
            AddBatchIndexFiles();
            data_slices.clear();
            slice_sizes.clear();
            slice_names.clear();
            batch_size = 0;
        }

        data_slices.emplace_back(iter->second->data.get());
        slice_sizes.emplace_back(iter->second->size);
        slice_names.emplace_back(remotePrefix + "/" + iter->first);
        batch_size += iter->second->size;
    }

    // Upload the trailing partial batch, if any.
    if (data_slices.size() > 0) {
        AddBatchIndexFiles();
    }

    return true;
}
// No-op for the in-memory manager: loading happens explicitly through
// LoadIndexToMemory()/CacheRawDataToMemory(). Always returns true.
bool
MemFileManagerImpl::LoadFile(const std::string& filename) noexcept {
    return true;
}
// Downloads the given remote index files in parallel batches and returns a
// map from each file's base name (the path segment after the last '/') to
// its decoded field data. The batch size is bounded by
// DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE files in flight.
std::map<std::string, storage::FieldDataPtr>
MemFileManagerImpl::LoadIndexToMemory(
    const std::vector<std::string>& remote_files) {
    std::map<std::string, storage::FieldDataPtr> file_to_index_data;
    auto parallel_degree =
        uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE);
    std::vector<std::string> batch_files;

    // Fetches the staged batch and indexes each result by base file name.
    auto LoadBatchIndexFiles = [&]() {
        auto index_datas = GetObjectData(rcm_.get(), batch_files);
        for (size_t idx = 0; idx < batch_files.size(); ++idx) {
            auto file_name =
                batch_files[idx].substr(batch_files[idx].find_last_of("/") + 1);
            file_to_index_data[file_name] = index_datas[idx];
        }
    };

    for (auto& file : remote_files) {
        if (batch_files.size() >= parallel_degree) {
            LoadBatchIndexFiles();
            batch_files.clear();
        }
        batch_files.emplace_back(file);
    }

    if (batch_files.size() > 0) {
        LoadBatchIndexFiles();
    }

    // Duplicate base names would collapse map entries and trip this check.
    AssertInfo(file_to_index_data.size() == remote_files.size(),
               "inconsistent file num and index data num!");
    return file_to_index_data;
}
// Downloads binlog files and returns their decoded field data ordered by
// the numeric suffix of each path (ascending). Downloads happen in parallel
// batches bounded by DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE.
// NOTE(review): assumes every path ends in "/<integer>" — std::stol throws
// otherwise; confirm callers only pass binlog-style paths.
std::vector<FieldDataPtr>
MemFileManagerImpl::CacheRawDataToMemory(
    std::vector<std::string> remote_files) {
    // Sort by the trailing number so data is returned in log order.
    std::sort(remote_files.begin(),
              remote_files.end(),
              [](const std::string& a, const std::string& b) {
                  return std::stol(a.substr(a.find_last_of("/") + 1)) <
                         std::stol(b.substr(b.find_last_of("/") + 1));
              });

    auto parallel_degree =
        uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE);
    std::vector<std::string> batch_files;
    std::vector<FieldDataPtr> field_datas;

    // Fetches the staged batch, appending results in batch order.
    auto FetchRawData = [&]() {
        auto raw_datas = GetObjectData(rcm_.get(), batch_files);
        for (auto& data : raw_datas) {
            field_datas.emplace_back(data);
        }
    };

    for (auto& file : remote_files) {
        if (batch_files.size() >= parallel_degree) {
            FetchRawData();
            batch_files.clear();
        }
        batch_files.emplace_back(file);
    }

    if (batch_files.size() > 0) {
        FetchRawData();
    }

    AssertInfo(field_datas.size() == remote_files.size(),
               "inconsistent file num and raw data num!");
    return field_datas;
}
// TODO stub: existence checks are not implemented yet; always reports false.
std::optional<bool>
MemFileManagerImpl::IsExisted(const std::string& filename) noexcept {
    // TODO: implement this interface
    return false;
}
// TODO stub: removal is not implemented yet; always returns false.
bool
MemFileManagerImpl::RemoveFile(const std::string& filename) noexcept {
    // TODO: implement this interface
    return false;
}
} // namespace milvus::storage

View File

@ -0,0 +1,75 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include <memory>
#include "storage/IndexData.h"
#include "storage/FileManager.h"
#include "storage/ChunkManager.h"
namespace milvus::storage {
// File manager that keeps index/raw binaries in memory. Uploads and
// downloads go through the remote ChunkManager passed to the constructor;
// nothing is staged on local disk.
class MemFileManagerImpl : public FileManagerImpl {
 public:
    explicit MemFileManagerImpl(const FieldDataMeta& field_mata,
                                IndexMeta index_meta,
                                ChunkManagerPtr remote_chunk_manager);

    // No-op; always returns true (see the .cpp implementation).
    virtual bool
    LoadFile(const std::string& filename) noexcept;

    // Not supported for single files; always returns false.
    virtual bool
    AddFile(const std::string& filename /* unused */) noexcept;

    // TODO: not implemented yet; currently returns false.
    virtual std::optional<bool>
    IsExisted(const std::string& filename) noexcept;

    // TODO: not implemented yet; currently returns false.
    virtual bool
    RemoveFile(const std::string& filename) noexcept;

 public:
    virtual std::string
    GetName() const {
        return "MemIndexFileManagerImpl";
    }

    // Downloads index files in parallel batches, keyed by base file name.
    std::map<std::string, storage::FieldDataPtr>
    LoadIndexToMemory(const std::vector<std::string>& remote_files);

    // Downloads binlog files ordered by their numeric path suffix.
    std::vector<FieldDataPtr>
    CacheRawDataToMemory(std::vector<std::string> remote_files);

    // Uploads every binary in `binary_set` in size-bounded batches.
    bool
    AddFile(const BinarySet& binary_set) noexcept;

    // Remote object path -> serialized size, filled by AddFile(BinarySet).
    std::map<std::string, int64_t>
    GetRemotePathsToFileSize() const {
        return remote_paths_to_size_;
    }

 private:
    // remote file path
    std::map<std::string, int64_t> remote_paths_to_size_;
};
using MemFileManagerImplPtr = std::shared_ptr<MemFileManagerImpl>;
} // namespace milvus::storage

View File

@ -206,6 +206,7 @@ MinioChunkManager::BuildGoogleCloudClient(
MinioChunkManager::MinioChunkManager(const StorageConfig& storage_config)
: default_bucket_name_(storage_config.bucket_name) {
remote_root_path_ = storage_config.root_path;
RemoteStorageType storageType;
if (storage_config.address.find("google") != std::string::npos) {
storageType = RemoteStorageType::GOOGLE_CLOUD;

View File

@ -30,12 +30,12 @@
#include <google/cloud/storage/oauth2/compute_engine_credentials.h>
#include <google/cloud/storage/oauth2/google_credentials.h>
#include <google/cloud/status_or.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "config/ConfigChunkManager.h"
#include "storage/ChunkManager.h"
#include "storage/Exception.h"
#include "storage/Types.h"
@ -47,7 +47,7 @@ enum class RemoteStorageType { S3 = 0, GOOGLE_CLOUD = 1, ALIYUN_CLOUD = 2 };
/**
* @brief This MinioChunkManager is responsible for read and write file in S3.
*/
class MinioChunkManager : public RemoteChunkManager {
class MinioChunkManager : public ChunkManager {
public:
explicit MinioChunkManager(const StorageConfig& storage_config);
@ -99,6 +99,11 @@ class MinioChunkManager : public RemoteChunkManager {
return "MinioChunkManager";
}
virtual std::string
GetRootPath() const {
return remote_root_path_;
}
inline std::string
GetBucketName() {
return default_bucket_name_;
@ -163,6 +168,7 @@ class MinioChunkManager : public RemoteChunkManager {
static std::mutex client_mutex_;
std::shared_ptr<Aws::S3::S3Client> client_;
std::string default_bucket_name_;
std::string remote_root_path_;
};
using MinioChunkManagerPtr = std::unique_ptr<MinioChunkManager>;

View File

@ -16,46 +16,72 @@
#include "storage/PayloadReader.h"
#include "exceptions/EasyAssert.h"
#include "storage/FieldDataFactory.h"
#include "storage/Util.h"
#include "parquet/column_reader.h"
#include "arrow/io/api.h"
#include "arrow/status.h"
#include "parquet/arrow/reader.h"
namespace milvus::storage {
PayloadReader::PayloadReader(std::shared_ptr<PayloadInputStream> input,
DataType data_type)
: column_type_(data_type) {
init(std::move(input));
}
PayloadReader::PayloadReader(const uint8_t* data,
int length,
DataType data_type)
: column_type_(data_type) {
auto input = std::make_shared<storage::PayloadInputStream>(data, length);
auto input = std::make_shared<arrow::io::BufferReader>(data, length);
init(input);
}
void
PayloadReader::init(std::shared_ptr<PayloadInputStream> input) {
auto mem_pool = arrow::default_memory_pool();
// TODO :: Stream read file data, avoid copying
std::unique_ptr<parquet::arrow::FileReader> reader;
auto st = parquet::arrow::OpenFile(input, mem_pool, &reader);
AssertInfo(st.ok(), "failed to get arrow file reader");
std::shared_ptr<arrow::Table> table;
st = reader->ReadTable(&table);
AssertInfo(st.ok(), "failed to get reader data to arrow table");
auto column = table->column(0);
AssertInfo(column != nullptr, "returned arrow column is null");
AssertInfo(column->chunks().size() == 1,
"arrow chunk size in arrow column should be 1");
auto array = column->chunk(0);
AssertInfo(array != nullptr, "empty arrow array of PayloadReader");
PayloadReader::init(std::shared_ptr<arrow::io::BufferReader> input) {
arrow::MemoryPool* pool = arrow::default_memory_pool();
// Configure general Parquet reader settings
auto reader_properties = parquet::ReaderProperties(pool);
reader_properties.set_buffer_size(4096 * 4);
reader_properties.enable_buffered_stream();
// Configure Arrow-specific Parquet reader settings
auto arrow_reader_props = parquet::ArrowReaderProperties();
arrow_reader_props.set_batch_size(128 * 1024); // default 64 * 1024
arrow_reader_props.set_pre_buffer(false);
parquet::arrow::FileReaderBuilder reader_builder;
auto st = reader_builder.Open(input, reader_properties);
AssertInfo(st.ok(), "file to read file");
reader_builder.memory_pool(pool);
reader_builder.properties(arrow_reader_props);
std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
st = reader_builder.Build(&arrow_reader);
AssertInfo(st.ok(), "build file reader");
int64_t column_index = 0;
auto file_meta = arrow_reader->parquet_reader()->metadata();
// LOG_SEGCORE_INFO_ << "serialized parquet metadata, num row group " <<
// std::to_string(file_meta->num_row_groups())
// << ", num column " << std::to_string(file_meta->num_columns()) << ", num rows "
// << std::to_string(file_meta->num_rows()) << ", type width "
// << std::to_string(file_meta->schema()->Column(column_index)->type_length());
dim_ = datatype_is_vector(column_type_)
? GetDimensionFromArrowArray(array, column_type_)
? GetDimensionFromFileMetaData(
file_meta->schema()->Column(column_index), column_type_)
: 1;
field_data_ =
FieldDataFactory::GetInstance().CreateFieldData(column_type_, dim_);
auto total_num_rows = file_meta->num_rows();
std::shared_ptr<::arrow::RecordBatchReader> rb_reader;
st = arrow_reader->GetRecordBatchReader(&rb_reader);
AssertInfo(st.ok(), "get record batch reader");
field_data_ = CreateFieldData(column_type_, dim_, total_num_rows);
for (arrow::Result<std::shared_ptr<arrow::RecordBatch>> maybe_batch :
*rb_reader) {
AssertInfo(maybe_batch.ok(), "get batch record success");
auto array = maybe_batch.ValueOrDie()->column(column_index);
field_data_->FillFieldData(array);
}
AssertInfo(field_data_->IsFull(), "field data hasn't been filled done");
// LOG_SEGCORE_INFO_ << "Peak arrow memory pool size " << pool->max_memory();
}
} // namespace milvus::storage

View File

@ -26,15 +26,12 @@ namespace milvus::storage {
class PayloadReader {
public:
explicit PayloadReader(std::shared_ptr<PayloadInputStream> input,
DataType data_type);
explicit PayloadReader(const uint8_t* data, int length, DataType data_type);
~PayloadReader() = default;
void
init(std::shared_ptr<PayloadInputStream> input);
init(std::shared_ptr<arrow::io::BufferReader> buffer);
const FieldDataPtr
get_field_data() const {

View File

@ -32,7 +32,7 @@ class PayloadInputStream;
struct Payload {
DataType data_type;
const uint8_t* raw_data;
int rows;
int64_t rows;
std::optional<int> dimension;
};

View File

@ -0,0 +1,66 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <shared_mutex>
#include "storage/Util.h"
namespace milvus::storage {
// Process-wide holder for the remote (object-storage) ChunkManager.
// Init()/Release() write the pointer under a unique lock;
// GetRemoteChunkManager() now takes a shared lock so concurrent readers
// cannot race with them (the original read the pointer unsynchronized).
class RemoteChunkManagerSingleton {
 private:
    RemoteChunkManagerSingleton() {
    }

 public:
    RemoteChunkManagerSingleton(const RemoteChunkManagerSingleton&) = delete;
    RemoteChunkManagerSingleton&
    operator=(const RemoteChunkManagerSingleton&) = delete;

    static RemoteChunkManagerSingleton&
    GetInstance() {
        // Meyers singleton: initialization is thread-safe since C++11.
        static RemoteChunkManagerSingleton instance;
        return instance;
    }

    // Builds the remote chunk manager from `storage_config`.
    // Only the first call takes effect; later calls are no-ops.
    void
    Init(const StorageConfig& storage_config) {
        std::unique_lock lck(mutex_);
        if (rcm_ == nullptr) {
            rcm_ = CreateChunkManager(storage_config);
        }
    }

    // Drops the instance; a later Init() may create a new one.
    void
    Release() {
        std::unique_lock lck(mutex_);
        rcm_ = nullptr;
    }

    // Returns the shared instance (may be nullptr before Init()).
    ChunkManagerPtr
    GetRemoteChunkManager() {
        // Shared lock: Init()/Release() mutate rcm_ under a unique lock,
        // so an unguarded read here would be a data race.
        std::shared_lock lck(mutex_);
        return rcm_;
    }

 private:
    mutable std::shared_mutex mutex_;
    ChunkManagerPtr rcm_ = nullptr;
};
} // namespace milvus::storage

View File

@ -36,7 +36,7 @@ class SafeQueue {
return queue_.empty();
}
void
size_t
size() {
std::shared_lock<std::shared_mutex> lock(mutex_);
return queue_.size();

View File

@ -34,7 +34,7 @@ namespace milvus {
class ThreadPool {
public:
explicit ThreadPool(const int thread_core_coefficient) : shutdown_(false) {
auto thread_num = cpu_num * thread_core_coefficient;
auto thread_num = CPU_NUM * thread_core_coefficient;
LOG_SEGCORE_INFO_ << "Thread pool's worker num:" << thread_num;
threads_ = std::vector<std::thread>(thread_num);
Init();
@ -46,7 +46,7 @@ class ThreadPool {
    // Returns the process-wide pool, constructed once with the globally
    // configured THREAD_CORE_COEFFICIENT (thread-safe init since C++11).
    static ThreadPool&
    GetInstance() {
        static ThreadPool pool(THREAD_CORE_COEFFICIENT);
        return pool;
    }

View File

@ -86,7 +86,7 @@ struct StorageConfig {
std::string bucket_name = "a-bucket";
std::string access_key_id = "minioadmin";
std::string access_key_value = "minioadmin";
std::string remote_root_path = "files";
std::string root_path = "files";
std::string storage_type = "minio";
std::string iam_endpoint = "";
bool useSSL = false;

View File

@ -19,15 +19,18 @@
#include "arrow/type_fwd.h"
#include "exceptions/EasyAssert.h"
#include "common/Consts.h"
#include "config/ConfigChunkManager.h"
#include "storage/parquet_c.h"
#ifdef BUILD_DISK_ANN
#include "storage/FieldData.h"
#include "storage/ThreadPool.h"
#include "storage/LocalChunkManager.h"
#include "storage/MinioChunkManager.h"
#include "storage/MemFileManagerImpl.h"
#include "storage/DiskFileManagerImpl.h"
#endif
namespace milvus::storage {
std::map<std::string, ChunkManagerType> ChunkManagerType_Map = {
{"local", ChunkManagerType::Local}, {"minio", ChunkManagerType::Minio}};
StorageType
ReadMediumType(BinlogReaderPtr reader) {
AssertInfo(reader->Tell() == 0,
@ -273,6 +276,21 @@ CreateArrowSchema(DataType data_type, int dim) {
}
}
// Derives the vector dimension from a parquet column's fixed type length:
// float vectors occupy dim * sizeof(float) bytes per value, binary vectors
// occupy dim / 8 bytes (one bit per dimension). Panics for other types.
int
GetDimensionFromFileMetaData(const parquet::ColumnDescriptor* schema,
                             DataType data_type) {
    switch (data_type) {
        case DataType::VECTOR_FLOAT: {
            return schema->type_length() / sizeof(float);
        }
        case DataType::VECTOR_BINARY: {
            return schema->type_length() * 8;
        }
        default:
            PanicInfo("unsupported data type");
    }
}
int
GetDimensionFromArrowArray(std::shared_ptr<arrow::Array> data,
DataType data_type) {
@ -299,58 +317,242 @@ GetDimensionFromArrowArray(std::shared_ptr<arrow::Array> data,
}
std::string
GenLocalIndexPathPrefix(int64_t build_id, int64_t index_version) {
return milvus::ChunkMangerConfig::GetLocalRootPath() + "/" +
std::string(INDEX_ROOT_PATH) + "/" + std::to_string(build_id) + "/" +
std::to_string(index_version) + "/";
GenIndexPathPrefix(ChunkManagerPtr cm,
int64_t build_id,
int64_t index_version) {
return cm->GetRootPath() + "/" + std::string(INDEX_ROOT_PATH) + "/" +
std::to_string(build_id) + "/" + std::to_string(index_version) + "/";
}
std::string
GetLocalIndexPathPrefixWithBuildID(int64_t build_id) {
return milvus::ChunkMangerConfig::GetLocalRootPath() + "/" +
std::string(INDEX_ROOT_PATH) + "/" + std::to_string(build_id);
GetIndexPathPrefixWithBuildID(ChunkManagerPtr cm, int64_t build_id) {
return cm->GetRootPath() + "/" + std::string(INDEX_ROOT_PATH) + "/" +
std::to_string(build_id);
}
std::string
GenFieldRawDataPathPrefix(int64_t segment_id, int64_t field_id) {
return milvus::ChunkMangerConfig::GetLocalRootPath() + "/" +
std::string(RAWDATA_ROOT_PATH) + "/" + std::to_string(segment_id) +
"/" + std::to_string(field_id) + "/";
GenFieldRawDataPathPrefix(ChunkManagerPtr cm,
int64_t segment_id,
int64_t field_id) {
return cm->GetRootPath() + "/" + std::string(RAWDATA_ROOT_PATH) + "/" +
std::to_string(segment_id) + "/" + std::to_string(field_id) + "/";
}
std::string
GetSegmentRawDataPathPrefix(int64_t segment_id) {
return milvus::ChunkMangerConfig::GetLocalRootPath() + "/" +
std::string(RAWDATA_ROOT_PATH) + "/" + std::to_string(segment_id);
GetSegmentRawDataPathPrefix(ChunkManagerPtr cm, int64_t segment_id) {
return cm->GetRootPath() + "/" + std::string(RAWDATA_ROOT_PATH) + "/" +
std::to_string(segment_id);
}
std::vector<IndexType>
DISK_LIST() {
static std::vector<IndexType> ret{
knowhere::IndexEnum::INDEX_DISKANN,
};
return ret;
// Reads the whole remote object `file` into a heap buffer and deserializes
// it into a DataCodec. The buffer is sized by ChunkManager::Size(), so the
// entire object must fit in memory.
std::unique_ptr<DataCodec>
DownloadAndDecodeRemoteFile(ChunkManager* chunk_manager,
                            const std::string& file) {
    auto fileSize = chunk_manager->Size(file);
    // shared_ptr<uint8_t[]> because DeserializeFileData shares ownership.
    auto buf = std::shared_ptr<uint8_t[]>(new uint8_t[fileSize]);
    chunk_manager->Read(file, buf.get(), fileSize);
    return DeserializeFileData(buf, fileSize);
}
bool
is_in_disk_list(const IndexType& index_type) {
return is_in_list<IndexType>(index_type, DISK_LIST);
// Wraps one index slice (`buf`, `batch_size` bytes) in an IndexData payload
// (INT8 field data plus index/field metadata), serializes it, and writes it
// to remote storage under `object_key`. Returns the key together with the
// serialized (not raw) size.
std::pair<std::string, size_t>
EncodeAndUploadIndexSlice(ChunkManager* chunk_manager,
                          uint8_t* buf,
                          int64_t batch_size,
                          IndexMeta index_meta,
                          FieldDataMeta field_meta,
                          std::string object_key) {
    auto field_data = CreateFieldData(DataType::INT8);
    field_data->FillFieldData(buf, batch_size);
    auto indexData = std::make_shared<IndexData>(field_data);
    indexData->set_index_meta(index_meta);
    indexData->SetFieldDataMeta(field_meta);
    auto serialized_index_data = indexData->serialize_to_remote_file();
    auto serialized_index_size = serialized_index_data.size();
    chunk_manager->Write(
        object_key, serialized_index_data.data(), serialized_index_size);
    return std::make_pair(std::move(object_key), serialized_index_size);
}
// /**
// * Returns the current resident set size (physical memory use) measured
// * in bytes, or zero if the value cannot be determined on this OS.
// */
// size_t
// getCurrentRSS() {
// #if defined(_WIN32)
// /* Windows -------------------------------------------------- */
// PROCESS_MEMORY_COUNTERS info;
// GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
// return (size_t)info.WorkingSetSize;
// #elif defined(__APPLE__) && defined(__MACH__)
// /* OSX ------------------------------------------------------ */
// struct mach_task_basic_info info;
// mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
// if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) != KERN_SUCCESS)
// return (size_t)0L; /* Can't access? */
// return (size_t)info.resident_size;
// #elif defined(__linux__) || defined(__linux) || defined(linux) || defined(__gnu_linux__)
// /* Linux ---------------------------------------------------- */
// long rss = 0L;
// FILE* fp = NULL;
// if ((fp = fopen("/proc/self/statm", "r")) == NULL)
// return (size_t)0L; /* Can't open? */
// if (fscanf(fp, "%*s%ld", &rss) != 1) {
// fclose(fp);
// return (size_t)0L; /* Can't read? */
// }
// fclose(fp);
// return (size_t)rss * (size_t)sysconf(_SC_PAGESIZE);
// #else
// /* AIX, BSD, Solaris, and Unknown OS ------------------------ */
// return (size_t)0L; /* Unsupported. */
// #endif
// }
// Downloads all `remote_files` concurrently on the shared thread pool and
// returns their decoded field data in the same order as the input.
// Any exception thrown by a download task is re-thrown from future::get().
std::vector<FieldDataPtr>
GetObjectData(ChunkManager* remote_chunk_manager,
              const std::vector<std::string>& remote_files) {
    auto& pool = ThreadPool::GetInstance();
    std::vector<std::future<std::unique_ptr<DataCodec>>> futures;
    futures.reserve(remote_files.size());
    for (auto& file : remote_files) {
        futures.emplace_back(pool.Submit(
            DownloadAndDecodeRemoteFile, remote_chunk_manager, file));
    }

    std::vector<FieldDataPtr> datas;
    datas.reserve(futures.size());
    // range-for replaces the original `int` index, which mixed signed and
    // unsigned in the comparison with futures.size().
    for (auto& future : futures) {
        auto res = future.get();
        datas.emplace_back(res->GetFieldData());
    }
    ReleaseArrowUnused();
    return datas;
}
// Uploads each (data, size, name) slice triple concurrently on the shared
// thread pool via EncodeAndUploadIndexSlice and returns a map from remote
// object path to serialized size. The three input vectors must be parallel
// (same length); this is asserted up front.
std::map<std::string, int64_t>
PutIndexData(ChunkManager* remote_chunk_manager,
             const std::vector<const uint8_t*>& data_slices,
             const std::vector<int64_t>& slice_sizes,
             const std::vector<std::string>& slice_names,
             FieldDataMeta& field_meta,
             IndexMeta& index_meta) {
    auto& pool = ThreadPool::GetInstance();
    std::vector<std::future<std::pair<std::string, size_t>>> futures;
    AssertInfo(data_slices.size() == slice_sizes.size(),
               "inconsistent size of data slices with slice sizes!");
    AssertInfo(data_slices.size() == slice_names.size(),
               "inconsistent size of data slices with slice names!");

    futures.reserve(data_slices.size());
    // size_t index avoids the original signed/unsigned comparison with
    // data_slices.size().
    for (size_t i = 0; i < data_slices.size(); ++i) {
        futures.push_back(pool.Submit(EncodeAndUploadIndexSlice,
                                      remote_chunk_manager,
                                      const_cast<uint8_t*>(data_slices[i]),
                                      slice_sizes[i],
                                      index_meta,
                                      field_meta,
                                      slice_names[i]));
    }

    std::map<std::string, int64_t> remote_paths_to_size;
    // future::get() re-throws any exception raised by an upload task.
    for (auto& future : futures) {
        auto res = future.get();
        remote_paths_to_size[res.first] = res.second;
    }
    ReleaseArrowUnused();
    return remote_paths_to_size;
}
// Sums the row counts of all field-data chunks in `field_datas`.
int64_t
GetTotalNumRowsForFieldDatas(const std::vector<FieldDataPtr>& field_datas) {
    int64_t total_rows = 0;
    for (const auto& chunk : field_datas) {
        total_rows += chunk->get_num_rows();
    }
    return total_rows;
}
// Asks arrow's default memory pool to return unused memory to the OS.
// try_lock de-duplicates concurrent callers: only one thread needs to do
// the release, so the others simply skip instead of blocking.
void
ReleaseArrowUnused() {
    static std::mutex release_mutex;

    // While multiple threads are releasing memory,
    // we don't need everyone do releasing,
    // just let some of them do this also works well
    if (release_mutex.try_lock()) {
        arrow::default_memory_pool()->ReleaseUnused();
        release_mutex.unlock();
    }
}
// Builds a ChunkManager for the storage backend named in
// `storage_config.storage_type` ("local" or "minio").
// Panics for unknown storage types.
ChunkManagerPtr
CreateChunkManager(const StorageConfig& storage_config) {
    // find() rather than operator[]: operator[] would insert a
    // default-constructed entry into the shared ChunkManagerType_Map for an
    // unknown type — an unsynchronized mutation of a global and a silent
    // acceptance of bad config.
    auto iter = ChunkManagerType_Map.find(storage_config.storage_type);
    if (iter == ChunkManagerType_Map.end()) {
        PanicInfo("unsupported");
    }
    switch (iter->second) {
        case ChunkManagerType::Local: {
            return std::make_shared<LocalChunkManager>(
                storage_config.root_path);
        }
        case ChunkManagerType::Minio: {
            return std::make_shared<MinioChunkManager>(storage_config);
        }
        default: {
            PanicInfo("unsupported");
        }
    }
}
FileManagerImplPtr
CreateFileManager(IndexType index_type,
const FieldDataMeta& field_meta,
const IndexMeta& index_meta,
const StorageConfig& storage_config) {
// TODO :: switch case index type to create file manager
#ifdef BUILD_DISK_ANN
ChunkManagerPtr cm) {
if (is_in_disk_list(index_type)) {
return std::make_shared<DiskFileManagerImpl>(
field_meta, index_meta, storage_config);
field_meta, index_meta, cm);
}
#endif
return nullptr;
return std::make_shared<MemFileManagerImpl>(field_meta, index_meta, cm);
}
// Factory for FieldData of the given scalar/vector `type`.
// `dim` is only meaningful for vector types (defaults to 1 for scalars);
// `total_num_rows` pre-sizes the buffer. Throws
// NotSupportedDataTypeException for types without a FieldData specialization.
FieldDataPtr
CreateFieldData(const DataType& type, int64_t dim, int64_t total_num_rows) {
    switch (type) {
        case DataType::BOOL:
            return std::make_shared<FieldData<bool>>(type, total_num_rows);
        case DataType::INT8:
            return std::make_shared<FieldData<int8_t>>(type, total_num_rows);
        case DataType::INT16:
            return std::make_shared<FieldData<int16_t>>(type, total_num_rows);
        case DataType::INT32:
            return std::make_shared<FieldData<int32_t>>(type, total_num_rows);
        case DataType::INT64:
            return std::make_shared<FieldData<int64_t>>(type, total_num_rows);
        case DataType::FLOAT:
            return std::make_shared<FieldData<float>>(type, total_num_rows);
        case DataType::DOUBLE:
            return std::make_shared<FieldData<double>>(type, total_num_rows);
        case DataType::STRING:
        case DataType::VARCHAR:
            // STRING and VARCHAR share the std::string specialization.
            return std::make_shared<FieldData<std::string>>(type,
                                                            total_num_rows);
        case DataType::JSON:
            return std::make_shared<FieldData<Json>>(type, total_num_rows);
        case DataType::VECTOR_FLOAT:
            return std::make_shared<FieldData<FloatVector>>(
                dim, type, total_num_rows);
        case DataType::VECTOR_BINARY:
            return std::make_shared<FieldData<BinaryVector>>(
                dim, type, total_num_rows);
        default:
            throw NotSupportedDataTypeException(
                "CreateFieldData not support data type " + datatype_name(type));
    }
}
} // namespace milvus::storage

View File

@ -23,7 +23,10 @@
#include "storage/PayloadStream.h"
#include "storage/FileManager.h"
#include "storage/BinlogReader.h"
#include "storage/ChunkManager.h"
#include "storage/DataCodec.h"
#include "knowhere/comp/index_param.h"
#include "parquet/schema.h"
namespace milvus::storage {
@ -55,36 +58,73 @@ CreateArrowSchema(DataType data_type);
std::shared_ptr<arrow::Schema>
CreateArrowSchema(DataType data_type, int dim);
int
GetDimensionFromFileMetaData(const parquet::ColumnDescriptor* schema,
DataType data_type);
int
GetDimensionFromArrowArray(std::shared_ptr<arrow::Array> array,
DataType data_type);
std::string
GetLocalIndexPathPrefixWithBuildID(int64_t build_id);
GetIndexPathPrefixWithBuildID(ChunkManagerPtr cm, int64_t build_id);
std::string
GenLocalIndexPathPrefix(int64_t build_id, int64_t index_version);
GenIndexPathPrefix(ChunkManagerPtr cm, int64_t build_id, int64_t index_version);
std::string
GenFieldRawDataPathPrefix(int64_t segment_id, int64_t field_id);
GenFieldRawDataPathPrefix(ChunkManagerPtr cm,
int64_t segment_id,
int64_t field_id);
std::string
GetSegmentRawDataPathPrefix(int64_t segment_id);
GetSegmentRawDataPathPrefix(ChunkManagerPtr cm, int64_t segment_id);
// Returns true when `t` appears in the vector produced by `list_func`.
template <typename T>
inline bool
is_in_list(const T& t, std::function<std::vector<T>()> list_func) {
    const std::vector<T> candidates = list_func();
    for (const auto& candidate : candidates) {
        if (candidate == t) {
            return true;
        }
    }
    return false;
}
std::unique_ptr<DataCodec>
DownloadAndDecodeRemoteFile(ChunkManager* chunk_manager,
const std::string& file);
bool
is_in_disk_list(const IndexType& index_type);
std::pair<std::string, size_t>
EncodeAndUploadIndexSlice(ChunkManager* chunk_manager,
uint8_t* buf,
int64_t batch_size,
IndexMeta index_meta,
FieldDataMeta field_meta,
std::string object_key);
std::vector<FieldDataPtr>
GetObjectData(ChunkManager* remote_chunk_manager,
const std::vector<std::string>& remote_files);
std::map<std::string, int64_t>
PutIndexData(ChunkManager* remote_chunk_manager,
const std::vector<const uint8_t*>& data_slices,
const std::vector<int64_t>& slice_sizes,
const std::vector<std::string>& slice_names,
FieldDataMeta& field_meta,
IndexMeta& index_meta);
int64_t
GetTotalNumRowsForFieldDatas(const std::vector<FieldDataPtr>& field_datas);
void
ReleaseArrowUnused();
// size_t
// getCurrentRSS();
ChunkManagerPtr
CreateChunkManager(const StorageConfig& storage_config);
FileManagerImplPtr
CreateFileManager(IndexType index_type,
const FieldDataMeta& field_meta,
const IndexMeta& index_meta,
const StorageConfig& storage_config);
ChunkManagerPtr cm);
FieldDataPtr
CreateFieldData(const DataType& type,
int64_t dim = 1,
int64_t total_num_rows = 0);
} // namespace milvus::storage

View File

@ -21,24 +21,12 @@
#include "storage/PayloadWriter.h"
#include "storage/FieldData.h"
#include "common/CGoHelper.h"
#include "storage/Util.h"
using Payload = milvus::storage::Payload;
using PayloadWriter = milvus::storage::PayloadWriter;
using PayloadReader = milvus::storage::PayloadReader;
void
ReleaseArrowUnused() {
static std::mutex release_mutex;
// While multiple threads are releasing memory,
// we don't need everyone do releasing,
// just let some of them do this also works well
if (release_mutex.try_lock()) {
arrow::default_memory_pool()->ReleaseUnused();
release_mutex.unlock();
}
}
extern "C" CPayloadWriter
NewPayloadWriter(int columnType) {
auto data_type = static_cast<milvus::DataType>(columnType);
@ -227,7 +215,7 @@ ReleasePayloadWriter(CPayloadWriter handler) {
auto p = reinterpret_cast<PayloadWriter*>(handler);
if (p != nullptr) {
delete p;
ReleaseArrowUnused();
milvus::storage::ReleaseArrowUnused();
}
}
@ -378,8 +366,9 @@ GetOneStringFromPayload(CPayloadReader payloadReader,
try {
auto p = reinterpret_cast<PayloadReader*>(payloadReader);
auto field_data = p->get_field_data();
*cstr = (char*)(const_cast<void*>(field_data->RawValue(idx)));
*str_size = field_data->get_element_size(idx);
auto str = const_cast<void*>(field_data->RawValue(idx));
*cstr = (char*)(*static_cast<std::string*>(str)).c_str();
*str_size = field_data->Size(idx);
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(UnexpectedError, e.what());
@ -434,7 +423,8 @@ ReleasePayloadReader(CPayloadReader payloadReader) {
"released payloadReader should not be null pointer");
auto p = reinterpret_cast<PayloadReader*>(payloadReader);
delete (p);
ReleaseArrowUnused();
milvus::storage::ReleaseArrowUnused();
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(UnexpectedError, e.what());

View File

@ -15,28 +15,67 @@
// limitations under the License.
#include "storage/storage_c.h"
#include "config/ConfigChunkManager.h"
#include "common/CGoHelper.h"
#ifdef BUILD_DISK_ANN
#include "storage/LocalChunkManager.h"
#endif
#include "storage/RemoteChunkManagerSingleton.h"
#include "storage/LocalChunkManagerSingleton.h"
CStatus
GetLocalUsedSize(int64_t* size) {
GetLocalUsedSize(const char* c_dir, int64_t* size) {
try {
#ifdef BUILD_DISK_ANN
auto& local_chunk_manager =
milvus::storage::LocalChunkManager::GetInstance();
auto dir = milvus::ChunkMangerConfig::GetLocalRootPath();
if (local_chunk_manager.DirExist(dir)) {
*size = local_chunk_manager.GetSizeOfDir(dir);
auto local_chunk_manager =
milvus::storage::LocalChunkManagerSingleton::GetInstance()
.GetChunkManager();
std::string dir(c_dir);
if (local_chunk_manager->DirExist(dir)) {
*size = local_chunk_manager->GetSizeOfDir(dir);
} else {
*size = 0;
}
#endif
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(UnexpectedError, e.what());
}
}
// C wrapper: initializes the process-wide local chunk manager rooted at
// `c_path`. Only the first successful Init takes effect (the singleton
// ignores later calls). Exceptions are mapped to a failure CStatus.
CStatus
InitLocalChunkManagerSingleton(const char* c_path) {
    try {
        std::string path(c_path);
        milvus::storage::LocalChunkManagerSingleton::GetInstance().Init(path);
        return milvus::SuccessCStatus();
    } catch (std::exception& e) {
        return milvus::FailureCStatus(UnexpectedError, e.what());
    }
}
// C wrapper: copies the C storage config field-by-field into the C++
// StorageConfig and initializes the process-wide remote chunk manager.
// Exceptions are mapped to a failure CStatus.
CStatus
InitRemoteChunkManagerSingleton(CStorageConfig c_storage_config) {
    try {
        milvus::storage::StorageConfig storage_config;
        storage_config.address = std::string(c_storage_config.address);
        storage_config.bucket_name = std::string(c_storage_config.bucket_name);
        storage_config.access_key_id =
            std::string(c_storage_config.access_key_id);
        storage_config.access_key_value =
            std::string(c_storage_config.access_key_value);
        storage_config.root_path = std::string(c_storage_config.root_path);
        storage_config.storage_type =
            std::string(c_storage_config.storage_type);
        storage_config.iam_endpoint =
            std::string(c_storage_config.iam_endpoint);
        storage_config.useSSL = c_storage_config.useSSL;
        storage_config.useIAM = c_storage_config.useIAM;
        milvus::storage::RemoteChunkManagerSingleton::GetInstance().Init(
            storage_config);
        return milvus::SuccessCStatus();
    } catch (std::exception& e) {
        return milvus::FailureCStatus(UnexpectedError, e.what());
    }
}
// C wrapper: drops the process-wide remote chunk manager instance.
void
CleanRemoteChunkManagerSingleton() {
    milvus::storage::RemoteChunkManagerSingleton::GetInstance().Release();
}

View File

@ -22,7 +22,16 @@ extern "C" {
#include "common/type_c.h"
CStatus
GetLocalUsedSize(int64_t* size);
GetLocalUsedSize(const char* c_path, int64_t* size);
CStatus
InitLocalChunkManagerSingleton(const char* path);
CStatus
InitRemoteChunkManagerSingleton(CStorageConfig c_storage_config);
void
CleanRemoteChunkManagerSingleton();
#ifdef __cplusplus
};

Some files were not shown because too many files have changed in this diff Show More