feat: support inverted index for array (#33452)

issue: https://github.com/milvus-io/milvus/issues/27704

---------

Signed-off-by: longjiquan <jiquan.long@zilliz.com>
This commit is contained in:
Jiquan Long 2024-05-31 09:47:47 +08:00 committed by GitHub
parent 23dedc2cbf
commit 0c5d8660aa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
33 changed files with 875 additions and 365 deletions

View File

@ -23,7 +23,14 @@ namespace exec {
void
PhyJsonContainsFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
switch (expr_->column_.data_type_) {
case DataType::ARRAY:
case DataType::ARRAY: {
if (is_index_mode_) {
result = EvalArrayContainsForIndexSegment();
} else {
result = EvalJsonContainsForDataSegment();
}
break;
}
case DataType::JSON: {
if (is_index_mode_) {
PanicInfo(
@ -94,7 +101,6 @@ PhyJsonContainsFilterExpr::EvalJsonContainsForDataSegment() {
return ExecJsonContainsWithDiffType();
}
}
break;
}
case proto::plan::JSONContainsExpr_JSONOp_ContainsAll: {
if (IsArrayDataType(data_type)) {
@ -145,7 +151,6 @@ PhyJsonContainsFilterExpr::EvalJsonContainsForDataSegment() {
return ExecJsonContainsAllWithDiffType();
}
}
break;
}
default:
PanicInfo(ExprInvalid,
@ -748,5 +753,92 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType() {
return res_vec;
}
// Evaluates an array-contains predicate on an indexed segment by dispatching
// on the column's element type to the matching typed implementation.
// Panics with DataTypeInvalid when the element type has no typed handler.
VectorPtr
PhyJsonContainsFilterExpr::EvalArrayContainsForIndexSegment() {
    const auto element_type = expr_->column_.element_type_;
    switch (element_type) {
        case DataType::BOOL:
            return ExecArrayContainsForIndexSegmentImpl<bool>();
        case DataType::INT8:
            return ExecArrayContainsForIndexSegmentImpl<int8_t>();
        case DataType::INT16:
            return ExecArrayContainsForIndexSegmentImpl<int16_t>();
        case DataType::INT32:
            return ExecArrayContainsForIndexSegmentImpl<int32_t>();
        case DataType::INT64:
            return ExecArrayContainsForIndexSegmentImpl<int64_t>();
        case DataType::FLOAT:
            return ExecArrayContainsForIndexSegmentImpl<float>();
        case DataType::DOUBLE:
            return ExecArrayContainsForIndexSegmentImpl<double>();
        // Both string flavours are handled through std::string.
        case DataType::STRING:
        case DataType::VARCHAR:
            return ExecArrayContainsForIndexSegmentImpl<std::string>();
        default:
            PanicInfo(DataTypeInvalid,
                      fmt::format("unsupported data type for "
                                  "ExecArrayContainsForIndexSegmentImpl: {}",
                                  element_type));
    }
}
// Typed implementation of array-contains over a scalar index.
//
// Collects the distinct operand values from the expression, then asks the
// index which rows match:
//   - Contains / ContainsAny: a single In() query over all operands.
//   - ContainsAll: the intersection of one In() query per operand.
// Returns nullptr when there is nothing left to process in this batch;
// otherwise returns a ColumnVector holding the per-row result bitmap.
template <typename ExprValueType>
VectorPtr
PhyJsonContainsFilterExpr::ExecArrayContainsForIndexSegmentImpl() {
    // string_view operands are stored as owning std::string values.
    using GetType =
        std::conditional_t<std::is_same_v<ExprValueType, std::string_view>,
                           std::string,
                           ExprValueType>;
    using Index = index::ScalarIndex<GetType>;

    const auto real_batch_size = GetNextBatchSize();
    if (real_batch_size == 0) {
        return nullptr;
    }

    // Deduplicate the operands before handing them to the index.
    std::unordered_set<GetType> unique_vals;
    for (const auto& proto_val : expr_->vals_) {
        unique_vals.insert(GetValueFromProto<GetType>(proto_val));
    }
    boost::container::vector<GetType> elems(unique_vals.begin(),
                                            unique_vals.end());

    auto execute_sub_batch =
        [this](Index* index_ptr,
               const boost::container::vector<GetType>& vals) {
            const auto op = expr_->op_;
            switch (op) {
                case proto::plan::JSONContainsExpr_JSONOp_Contains:
                case proto::plan::JSONContainsExpr_JSONOp_ContainsAny:
                    return index_ptr->In(vals.size(), vals.data());
                case proto::plan::JSONContainsExpr_JSONOp_ContainsAll: {
                    // Start from "every row matches" and narrow it down with
                    // one single-value lookup per operand.
                    TargetBitmap matched(index_ptr->Count());
                    matched.set();
                    for (const auto& val : vals) {
                        matched &= index_ptr->In(1, &val);
                    }
                    return matched;
                }
                default:
                    PanicInfo(ExprInvalid,
                              "unsupported array contains type {}",
                              proto::plan::JSONContainsExpr_JSONOp_Name(op));
            }
        };

    auto res = ProcessIndexChunks<GetType>(execute_sub_batch, elems);
    AssertInfo(res.size() == real_batch_size,
               "internal error: expr processed rows {} not equal "
               "expect batch size {}",
               res.size(),
               real_batch_size);
    return std::make_shared<ColumnVector>(std::move(res));
}
} //namespace exec
} // namespace milvus

View File

@ -80,6 +80,13 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
VectorPtr
ExecJsonContainsWithDiffType();
// Evaluates an array-contains expression in index mode; selects the typed
// implementation below according to the column's element type.
VectorPtr
EvalArrayContainsForIndexSegment();
// Typed worker for EvalArrayContainsForIndexSegment(). ExprValueType is the
// C++ type of the array elements / expression operands.
template <typename ExprValueType>
VectorPtr
ExecArrayContainsForIndexSegmentImpl();
private:
std::shared_ptr<const milvus::expr::JsonContainsExpr> expr_;
};

View File

@ -113,11 +113,13 @@ IsMaterializedViewSupported(const DataType& data_type) {
struct ColumnInfo {
FieldId field_id_;
DataType data_type_;
DataType element_type_;
std::vector<std::string> nested_path_;
ColumnInfo(const proto::plan::ColumnInfo& column_info)
: field_id_(column_info.field_id()),
data_type_(static_cast<DataType>(column_info.data_type())),
element_type_(static_cast<DataType>(column_info.element_type())),
nested_path_(column_info.nested_path().begin(),
column_info.nested_path().end()) {
}
@ -127,6 +129,7 @@ struct ColumnInfo {
std::vector<std::string> nested_path = {})
: field_id_(field_id),
data_type_(data_type),
element_type_(DataType::NONE),
nested_path_(std::move(nested_path)) {
}
@ -140,6 +143,10 @@ struct ColumnInfo {
return false;
}
if (element_type_ != other.element_type_) {
return false;
}
for (int i = 0; i < nested_path_.size(); ++i) {
if (nested_path_[i] != other.nested_path_[i]) {
return false;
@ -151,10 +158,12 @@ struct ColumnInfo {
std::string
ToString() const {
return fmt::format("[FieldId:{}, data_type:{}, nested_path:{}]",
std::to_string(field_id_.get()),
data_type_,
milvus::Join<std::string>(nested_path_, ","));
return fmt::format(
"[FieldId:{}, data_type:{}, element_type:{}, nested_path:{}]",
std::to_string(field_id_.get()),
data_type_,
element_type_,
milvus::Join<std::string>(nested_path_, ","));
}
};

View File

@ -35,13 +35,9 @@ template <typename T>
ScalarIndexPtr<T>
IndexFactory::CreateScalarIndex(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
DataType d_type) {
const storage::FileManagerContext& file_manager_context) {
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<T>>(cfg,
file_manager_context);
return std::make_unique<InvertedIndexTantivy<T>>(file_manager_context);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<HybridScalarIndex<T>>(file_manager_context);
@ -60,14 +56,11 @@ template <>
ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex<std::string>(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
DataType d_type) {
const storage::FileManagerContext& file_manager_context) {
#if defined(__linux__) || defined(__APPLE__)
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<std::string>>(
cfg, file_manager_context);
file_manager_context);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<HybridScalarIndex<std::string>>(
@ -84,13 +77,10 @@ ScalarIndexPtr<T>
IndexFactory::CreateScalarIndex(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space,
DataType d_type) {
std::shared_ptr<milvus_storage::Space> space) {
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<T>>(
cfg, file_manager_context, space);
return std::make_unique<InvertedIndexTantivy<T>>(file_manager_context,
space);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<HybridScalarIndex<T>>(file_manager_context,
@ -104,14 +94,11 @@ ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex<std::string>(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space,
DataType d_type) {
std::shared_ptr<milvus_storage::Space> space) {
#if defined(__linux__) || defined(__APPLE__)
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<std::string>>(
cfg, file_manager_context, space);
file_manager_context, space);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<HybridScalarIndex<std::string>>(
@ -148,41 +135,32 @@ IndexFactory::CreateIndex(
}
IndexBasePtr
IndexFactory::CreateScalarIndex(
const CreateIndexInfo& create_index_info,
IndexFactory::CreatePrimitiveScalarIndex(
DataType data_type,
IndexType index_type,
const storage::FileManagerContext& file_manager_context) {
auto data_type = create_index_info.field_type;
auto index_type = create_index_info.index_type;
switch (data_type) {
// create scalar index
case DataType::BOOL:
return CreateScalarIndex<bool>(
index_type, file_manager_context, data_type);
return CreateScalarIndex<bool>(index_type, file_manager_context);
case DataType::INT8:
return CreateScalarIndex<int8_t>(
index_type, file_manager_context, data_type);
return CreateScalarIndex<int8_t>(index_type, file_manager_context);
case DataType::INT16:
return CreateScalarIndex<int16_t>(
index_type, file_manager_context, data_type);
return CreateScalarIndex<int16_t>(index_type, file_manager_context);
case DataType::INT32:
return CreateScalarIndex<int32_t>(
index_type, file_manager_context, data_type);
return CreateScalarIndex<int32_t>(index_type, file_manager_context);
case DataType::INT64:
return CreateScalarIndex<int64_t>(
index_type, file_manager_context, data_type);
return CreateScalarIndex<int64_t>(index_type, file_manager_context);
case DataType::FLOAT:
return CreateScalarIndex<float>(
index_type, file_manager_context, data_type);
return CreateScalarIndex<float>(index_type, file_manager_context);
case DataType::DOUBLE:
return CreateScalarIndex<double>(
index_type, file_manager_context, data_type);
return CreateScalarIndex<double>(index_type, file_manager_context);
// create string index
case DataType::STRING:
case DataType::VARCHAR:
return CreateScalarIndex<std::string>(
index_type, file_manager_context, data_type);
return CreateScalarIndex<std::string>(index_type,
file_manager_context);
default:
throw SegcoreError(
DataTypeInvalid,
@ -190,6 +168,24 @@ IndexFactory::CreateScalarIndex(
}
}
// Creates a scalar index for the field described by `create_index_info`.
// For ARRAY fields the index is built over the array's element type, which
// is read from the field schema carried in `file_manager_context`; every
// other field type is forwarded unchanged.
IndexBasePtr
IndexFactory::CreateScalarIndex(
    const CreateIndexInfo& create_index_info,
    const storage::FileManagerContext& file_manager_context) {
    const auto field_type = create_index_info.field_type;
    if (field_type == DataType::ARRAY) {
        const auto element_type = static_cast<DataType>(
            file_manager_context.fieldDataMeta.schema.element_type());
        return CreatePrimitiveScalarIndex(
            element_type, create_index_info.index_type, file_manager_context);
    }
    return CreatePrimitiveScalarIndex(
        field_type, create_index_info.index_type, file_manager_context);
}
IndexBasePtr
IndexFactory::CreateVectorIndex(
const CreateIndexInfo& create_index_info,
@ -257,32 +253,25 @@ IndexFactory::CreateScalarIndex(const CreateIndexInfo& create_index_info,
switch (data_type) {
// create scalar index
case DataType::BOOL:
return CreateScalarIndex<bool>(
index_type, file_manager, space, data_type);
return CreateScalarIndex<bool>(index_type, file_manager, space);
case DataType::INT8:
return CreateScalarIndex<int8_t>(
index_type, file_manager, space, data_type);
return CreateScalarIndex<int8_t>(index_type, file_manager, space);
case DataType::INT16:
return CreateScalarIndex<int16_t>(
index_type, file_manager, space, data_type);
return CreateScalarIndex<int16_t>(index_type, file_manager, space);
case DataType::INT32:
return CreateScalarIndex<int32_t>(
index_type, file_manager, space, data_type);
return CreateScalarIndex<int32_t>(index_type, file_manager, space);
case DataType::INT64:
return CreateScalarIndex<int64_t>(
index_type, file_manager, space, data_type);
return CreateScalarIndex<int64_t>(index_type, file_manager, space);
case DataType::FLOAT:
return CreateScalarIndex<float>(
index_type, file_manager, space, data_type);
return CreateScalarIndex<float>(index_type, file_manager, space);
case DataType::DOUBLE:
return CreateScalarIndex<double>(
index_type, file_manager, space, data_type);
return CreateScalarIndex<double>(index_type, file_manager, space);
// create string index
case DataType::STRING:
case DataType::VARCHAR:
return CreateScalarIndex<std::string>(
index_type, file_manager, space, data_type);
index_type, file_manager, space);
default:
throw SegcoreError(
DataTypeInvalid,

View File

@ -65,6 +65,13 @@ class IndexFactory {
CreateVectorIndex(const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context);
IndexBasePtr
CreatePrimitiveScalarIndex(
DataType data_type,
IndexType index_type,
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext());
IndexBasePtr
CreateScalarIndex(const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context =
@ -89,15 +96,13 @@ class IndexFactory {
ScalarIndexPtr<T>
CreateScalarIndex(const IndexType& index_type,
const storage::FileManagerContext& file_manager =
storage::FileManagerContext(),
DataType d_type = DataType::NONE);
storage::FileManagerContext());
template <typename T>
ScalarIndexPtr<T>
CreateScalarIndex(const IndexType& index_type,
const storage::FileManagerContext& file_manager,
std::shared_ptr<milvus_storage::Space> space,
DataType d_type = DataType::NONE);
std::shared_ptr<milvus_storage::Space> space);
};
// template <>
@ -112,6 +117,5 @@ ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex<std::string>(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space,
DataType d_type);
std::shared_ptr<milvus_storage::Space> space);
} // namespace milvus::index

View File

@ -23,12 +23,50 @@
#include "InvertedIndexTantivy.h"
namespace milvus::index {
// Maps a schema-level data type onto the Tantivy field type used to index it.
// All integer widths share I64 and both floating-point widths share F64.
// Panics with NotImplemented for any type without a mapping.
inline TantivyDataType
get_tantivy_data_type(proto::schema::DataType data_type) {
    switch (data_type) {
        case proto::schema::DataType::Bool: {
            return TantivyDataType::Bool;
        }

        case proto::schema::DataType::Int8:
        case proto::schema::DataType::Int16:
        case proto::schema::DataType::Int32:
        case proto::schema::DataType::Int64: {
            return TantivyDataType::I64;
        }

        case proto::schema::DataType::Float:
        case proto::schema::DataType::Double: {
            return TantivyDataType::F64;
        }

        // String is accepted next to VarChar by build_index() and by the
        // scalar index factory, so it must map to the same Tantivy type
        // instead of falling through to the panic below.
        case proto::schema::DataType::String:
        case proto::schema::DataType::VarChar: {
            return TantivyDataType::Keyword;
        }

        default:
            PanicInfo(ErrorCode::NotImplemented,
                      fmt::format("not implemented data type: {}", data_type));
    }
}
// Resolves the Tantivy field type for a whole field schema: array fields are
// indexed by their element type, every other field by its own data type.
inline TantivyDataType
get_tantivy_data_type(const proto::schema::FieldSchema& schema) {
    const bool is_array = schema.data_type() == proto::schema::Array;
    return get_tantivy_data_type(is_array ? schema.element_type()
                                          : schema.data_type());
}
template <typename T>
InvertedIndexTantivy<T>::InvertedIndexTantivy(
const TantivyConfig& cfg,
const storage::FileManagerContext& ctx,
std::shared_ptr<milvus_storage::Space> space)
: cfg_(cfg), space_(space) {
: space_(space), schema_(ctx.fieldDataMeta.schema) {
mem_file_manager_ = std::make_shared<MemFileManager>(ctx, ctx.space_);
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx, ctx.space_);
auto field =
@ -36,7 +74,7 @@ InvertedIndexTantivy<T>::InvertedIndexTantivy(
auto prefix = disk_file_manager_->GetLocalIndexObjectPrefix();
path_ = prefix;
boost::filesystem::create_directories(path_);
d_type_ = cfg_.to_tantivy_data_type();
d_type_ = get_tantivy_data_type(schema_);
if (tantivy_index_exist(path_.c_str())) {
LOG_INFO(
"index {} already exists, which should happen in loading progress",
@ -114,83 +152,7 @@ InvertedIndexTantivy<T>::Build(const Config& config) {
AssertInfo(insert_files.has_value(), "insert_files were empty");
auto field_datas =
mem_file_manager_->CacheRawDataToMemory(insert_files.value());
switch (cfg_.data_type_) {
case DataType::BOOL: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<bool>(static_cast<const bool*>(data->Data()),
n);
}
break;
}
case DataType::INT8: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<int8_t>(
static_cast<const int8_t*>(data->Data()), n);
}
break;
}
case DataType::INT16: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<int16_t>(
static_cast<const int16_t*>(data->Data()), n);
}
break;
}
case DataType::INT32: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<int32_t>(
static_cast<const int32_t*>(data->Data()), n);
}
break;
}
case DataType::INT64: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<int64_t>(
static_cast<const int64_t*>(data->Data()), n);
}
break;
}
case DataType::FLOAT: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<float>(
static_cast<const float*>(data->Data()), n);
}
break;
}
case DataType::DOUBLE: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<double>(
static_cast<const double*>(data->Data()), n);
}
break;
}
case DataType::VARCHAR: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<std::string>(
static_cast<const std::string*>(data->Data()), n);
}
break;
}
default:
PanicInfo(ErrorCode::NotImplemented,
fmt::format("todo: not supported, {}", cfg_.data_type_));
}
build_index(field_datas);
}
template <typename T>
@ -211,84 +173,7 @@ InvertedIndexTantivy<T>::BuildV2(const Config& config) {
field_data->FillFieldData(col_data);
field_datas.push_back(field_data);
}
switch (cfg_.data_type_) {
case DataType::BOOL: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<bool>(static_cast<const bool*>(data->Data()),
n);
}
break;
}
case DataType::INT8: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<int8_t>(
static_cast<const int8_t*>(data->Data()), n);
}
break;
}
case DataType::INT16: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<int16_t>(
static_cast<const int16_t*>(data->Data()), n);
}
break;
}
case DataType::INT32: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<int32_t>(
static_cast<const int32_t*>(data->Data()), n);
}
break;
}
case DataType::INT64: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<int64_t>(
static_cast<const int64_t*>(data->Data()), n);
}
break;
}
case DataType::FLOAT: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<float>(
static_cast<const float*>(data->Data()), n);
}
break;
}
case DataType::DOUBLE: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<double>(
static_cast<const double*>(data->Data()), n);
}
break;
}
case DataType::VARCHAR: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<std::string>(
static_cast<const std::string*>(data->Data()), n);
}
break;
}
default:
PanicInfo(ErrorCode::NotImplemented,
fmt::format("todo: not supported, {}", cfg_.data_type_));
}
build_index(field_datas);
}
template <typename T>
@ -333,7 +218,8 @@ InvertedIndexTantivy<T>::In(size_t n, const T* values) {
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::NotIn(size_t n, const T* values) {
TargetBitmap bitset(Count(), true);
TargetBitmap bitset(Count());
bitset.set();
for (size_t i = 0; i < n; ++i) {
auto array = wrapper_->term_query(values[i]);
apply_hits(bitset, array, false);
@ -425,51 +311,107 @@ void
InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
const void* values,
const Config& config) {
if constexpr (!std::is_same_v<T, std::string>) {
TantivyConfig cfg;
if constexpr (std::is_same_v<int8_t, T>) {
cfg.data_type_ = DataType::INT8;
if constexpr (std::is_same_v<int8_t, T>) {
schema_.set_data_type(proto::schema::DataType::Int8);
}
if constexpr (std::is_same_v<int16_t, T>) {
schema_.set_data_type(proto::schema::DataType::Int16);
}
if constexpr (std::is_same_v<int32_t, T>) {
schema_.set_data_type(proto::schema::DataType::Int32);
}
if constexpr (std::is_same_v<int64_t, T>) {
schema_.set_data_type(proto::schema::DataType::Int64);
}
if constexpr (std::is_same_v<float, T>) {
schema_.set_data_type(proto::schema::DataType::Float);
}
if constexpr (std::is_same_v<double, T>) {
schema_.set_data_type(proto::schema::DataType::Double);
}
if constexpr (std::is_same_v<std::string, T>) {
schema_.set_data_type(proto::schema::DataType::VarChar);
}
boost::uuids::random_generator generator;
auto uuid = generator();
auto prefix = boost::uuids::to_string(uuid);
path_ = fmt::format("/tmp/{}", prefix);
boost::filesystem::create_directories(path_);
d_type_ = get_tantivy_data_type(schema_);
std::string field = "test_inverted_index";
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field.c_str(), d_type_, path_.c_str());
wrapper_->add_data<T>(static_cast<const T*>(values), n);
finish();
}
template <typename T>
void
InvertedIndexTantivy<T>::build_index(
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas) {
switch (schema_.data_type()) {
case proto::schema::DataType::Bool:
case proto::schema::DataType::Int8:
case proto::schema::DataType::Int16:
case proto::schema::DataType::Int32:
case proto::schema::DataType::Int64:
case proto::schema::DataType::Float:
case proto::schema::DataType::Double:
case proto::schema::DataType::String:
case proto::schema::DataType::VarChar: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<T>(static_cast<const T*>(data->Data()), n);
}
break;
}
if constexpr (std::is_same_v<int16_t, T>) {
cfg.data_type_ = DataType::INT16;
case proto::schema::DataType::Array: {
build_index_for_array(field_datas);
break;
}
if constexpr (std::is_same_v<int32_t, T>) {
cfg.data_type_ = DataType::INT32;
default:
PanicInfo(ErrorCode::NotImplemented,
fmt::format("Inverted index not supported on {}",
schema_.data_type()));
}
}
template <typename T>
void
InvertedIndexTantivy<T>::build_index_for_array(
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas) {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
auto array_column = static_cast<const Array*>(data->Data());
for (int64_t i = 0; i < n; i++) {
assert(array_column[i].get_element_type() ==
static_cast<DataType>(schema_.element_type()));
wrapper_->template add_multi_data(
reinterpret_cast<const T*>(array_column[i].data()),
array_column[i].length());
}
if constexpr (std::is_same_v<int64_t, T>) {
cfg.data_type_ = DataType::INT64;
}
}
template <>
void
InvertedIndexTantivy<std::string>::build_index_for_array(
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas) {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
auto array_column = static_cast<const Array*>(data->Data());
for (int64_t i = 0; i < n; i++) {
assert(array_column[i].get_element_type() ==
static_cast<DataType>(schema_.element_type()));
std::vector<std::string> output;
for (int64_t j = 0; j < array_column[i].length(); j++) {
output.push_back(
array_column[i].template get_data<std::string>(j));
}
wrapper_->template add_multi_data(output.data(), output.size());
}
if constexpr (std::is_same_v<std::string, T>) {
cfg.data_type_ = DataType::VARCHAR;
}
boost::uuids::random_generator generator;
auto uuid = generator();
auto prefix = boost::uuids::to_string(uuid);
path_ = fmt::format("/tmp/{}", prefix);
boost::filesystem::create_directories(path_);
cfg_ = cfg;
d_type_ = cfg_.to_tantivy_data_type();
std::string field = "test_inverted_index";
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field.c_str(), d_type_, path_.c_str());
wrapper_->add_data<T>(static_cast<const T*>(values), n);
finish();
} else {
boost::uuids::random_generator generator;
auto uuid = generator();
auto prefix = boost::uuids::to_string(uuid);
path_ = fmt::format("/tmp/{}", prefix);
boost::filesystem::create_directories(path_);
cfg_ = TantivyConfig{
.data_type_ = DataType::VARCHAR,
};
d_type_ = cfg_.to_tantivy_data_type();
std::string field = "test_inverted_index";
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field.c_str(), d_type_, path_.c_str());
wrapper_->add_data<std::string>(static_cast<const std::string*>(values),
n);
finish();
}
}

View File

@ -18,7 +18,6 @@
#include "tantivy-binding.h"
#include "tantivy-wrapper.h"
#include "index/StringIndex.h"
#include "index/TantivyConfig.h"
#include "storage/space.h"
namespace milvus::index {
@ -36,13 +35,11 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
InvertedIndexTantivy() = default;
explicit InvertedIndexTantivy(const TantivyConfig& cfg,
const storage::FileManagerContext& ctx)
: InvertedIndexTantivy(cfg, ctx, nullptr) {
explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx)
: InvertedIndexTantivy(ctx, nullptr) {
}
explicit InvertedIndexTantivy(const TantivyConfig& cfg,
const storage::FileManagerContext& ctx,
explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx,
std::shared_ptr<milvus_storage::Space> space);
~InvertedIndexTantivy();
@ -160,11 +157,18 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
void
finish();
void
build_index(const std::vector<std::shared_ptr<FieldDataBase>>& field_datas);
void
build_index_for_array(
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas);
private:
std::shared_ptr<TantivyIndexWrapper> wrapper_;
TantivyConfig cfg_;
TantivyDataType d_type_;
std::string path_;
proto::schema::FieldSchema schema_;
/*
* To avoid IO amplification, we use both mem file manager & disk file manager

View File

@ -1,51 +0,0 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include "storage/Types.h"
#include "tantivy-binding.h"
namespace milvus::index {
// Configuration for a Tantivy-backed inverted index: the Milvus data type of
// the indexed field plus its translation to a Tantivy field type.
struct TantivyConfig {
    // Defaults to DataType::NONE so that a config whose type was never
    // assigned hits the panic in to_tantivy_data_type() deterministically
    // instead of reading an indeterminate value.
    DataType data_type_ = DataType::NONE;

    // Translates data_type_ into the Tantivy field type. All integer widths
    // share I64 and both floating-point widths share F64. Panics with
    // NotImplemented for any type without a mapping. Does not modify the
    // config, hence const.
    TantivyDataType
    to_tantivy_data_type() const {
        switch (data_type_) {
            case DataType::BOOL: {
                return TantivyDataType::Bool;
            }

            case DataType::INT8:
            case DataType::INT16:
            case DataType::INT32:
            case DataType::INT64: {
                return TantivyDataType::I64;
            }

            case DataType::FLOAT:
            case DataType::DOUBLE: {
                return TantivyDataType::F64;
            }

            case DataType::VARCHAR: {
                return TantivyDataType::Keyword;
            }

            default:
                PanicInfo(
                    ErrorCode::NotImplemented,
                    fmt::format("not implemented data type: {}", data_type_));
        }
    }
};
} // namespace milvus::index

View File

@ -60,6 +60,7 @@ class IndexFactory {
case DataType::DOUBLE:
case DataType::VARCHAR:
case DataType::STRING:
case DataType::ARRAY:
return CreateScalarIndex(type, config, context);
case DataType::VECTOR_FLOAT:

View File

@ -190,7 +190,8 @@ CreateIndex(CIndex* res_index,
build_index_info->collectionid(),
build_index_info->partitionid(),
build_index_info->segmentid(),
build_index_info->field_schema().fieldid()};
build_index_info->field_schema().fieldid(),
build_index_info->field_schema()};
milvus::storage::IndexMeta index_meta{
build_index_info->segmentid(),

View File

@ -11,12 +11,10 @@
find_package(Protobuf REQUIRED)
file(GLOB_RECURSE milvus_proto_srcs
"${CMAKE_CURRENT_SOURCE_DIR}/*.cc")
add_library(milvus_proto STATIC
common.pb.cc
index_cgo_msg.pb.cc
plan.pb.cc
schema.pb.cc
segcore.pb.cc
${milvus_proto_srcs}
)
message(STATUS "milvus proto sources: " ${milvus_proto_srcs})

View File

@ -46,6 +46,7 @@ struct LoadIndexInfo {
std::string uri;
int64_t index_store_version;
IndexVersion index_engine_version;
proto::schema::FieldSchema schema;
};
} // namespace milvus::segcore

View File

@ -25,6 +25,7 @@
#include "storage/Util.h"
#include "storage/RemoteChunkManagerSingleton.h"
#include "storage/LocalChunkManagerSingleton.h"
#include "pb/cgo_msg.pb.h"
bool
IsLoadWithDisk(const char* index_type, int index_engine_version) {
@ -258,7 +259,8 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) {
load_index_info->collection_id,
load_index_info->partition_id,
load_index_info->segment_id,
load_index_info->field_id};
load_index_info->field_id,
load_index_info->schema};
milvus::storage::IndexMeta index_meta{load_index_info->segment_id,
load_index_info->field_id,
load_index_info->index_build_id,
@ -484,3 +486,50 @@ AppendStorageInfo(CLoadIndexInfo c_load_index_info,
load_index_info->uri = uri;
load_index_info->index_store_version = version;
}
// Fills the LoadIndexInfo behind `c_load_index_info` from a serialized
// milvus::proto::cgo::LoadIndexInfo message.
//
// Parameters:
//   c_load_index_info          - opaque handle to a milvus::segcore::LoadIndexInfo.
//   serialized_load_index_info - bytes of the serialized protobuf message.
//   len                        - number of bytes in the buffer.
//
// Returns a CStatus with milvus::Success on success. On failure (oversized or
// unparsable payload, or any std::exception) returns milvus::UnexpectedError
// with a strdup'ed message; in the payload-error cases the target
// LoadIndexInfo is left untouched.
CStatus
FinishLoadIndexInfo(CLoadIndexInfo c_load_index_info,
                    const uint8_t* serialized_load_index_info,
                    const uint64_t len) {
    // Builds the failure status used by every error path below.
    auto make_error = [](const char* msg) {
        auto status = CStatus();
        status.error_code = milvus::UnexpectedError;
        status.error_msg = strdup(msg);
        return status;
    };

    try {
        // ParseFromArray takes the size as an int; reject lengths that do not
        // survive the conversion instead of silently truncating them.
        const int payload_size = static_cast<int>(len);
        if (payload_size < 0 ||
            static_cast<uint64_t>(payload_size) != len) {
            return make_error("serialized load index info is too large");
        }

        auto info_proto = std::make_unique<milvus::proto::cgo::LoadIndexInfo>();
        // A failed parse leaves the message partially/default filled, so it
        // must not be copied into the load info as if it were valid.
        if (!info_proto->ParseFromArray(serialized_load_index_info,
                                        payload_size)) {
            return make_error("failed to parse serialized load index info");
        }

        auto load_index_info =
            static_cast<milvus::segcore::LoadIndexInfo*>(c_load_index_info);
        // TODO: keep this since LoadIndexInfo is used by SegmentSealed.
        {
            load_index_info->collection_id = info_proto->collectionid();
            load_index_info->partition_id = info_proto->partitionid();
            load_index_info->segment_id = info_proto->segmentid();
            load_index_info->field_id = info_proto->field().fieldid();
            load_index_info->field_type =
                static_cast<milvus::DataType>(info_proto->field().data_type());
            load_index_info->enable_mmap = info_proto->enable_mmap();
            load_index_info->mmap_dir_path = info_proto->mmap_dir_path();
            load_index_info->index_id = info_proto->indexid();
            load_index_info->index_build_id = info_proto->index_buildid();
            load_index_info->index_version = info_proto->index_version();
            for (const auto& [k, v] : info_proto->index_params()) {
                load_index_info->index_params[k] = v;
            }
            load_index_info->index_files.assign(
                info_proto->index_files().begin(),
                info_proto->index_files().end());
            load_index_info->uri = info_proto->uri();
            load_index_info->index_store_version =
                info_proto->index_store_version();
            load_index_info->index_engine_version =
                info_proto->index_engine_version();
            // Keep the full field schema as well (it also carries the
            // element type of the field).
            load_index_info->schema = info_proto->field();
        }
        auto status = CStatus();
        status.error_code = milvus::Success;
        status.error_msg = "";
        return status;
    } catch (const std::exception& e) {
        return make_error(e.what());
    }
}

View File

@ -76,6 +76,11 @@ void
AppendStorageInfo(CLoadIndexInfo c_load_index_info,
const char* uri,
int64_t version);
// Populates the LoadIndexInfo behind `c_load_index_info` from a serialized
// milvus::proto::cgo::LoadIndexInfo message of `len` bytes.
// Returns a CStatus: milvus::Success on success, or milvus::UnexpectedError
// with the exception text when a std::exception is caught.
CStatus
FinishLoadIndexInfo(CLoadIndexInfo c_load_index_info,
const uint8_t* serialized_load_index_info,
const uint64_t len);
#ifdef __cplusplus
}
#endif

View File

@ -64,6 +64,7 @@ struct FieldDataMeta {
int64_t partition_id;
int64_t segment_id;
int64_t field_id;
proto::schema::FieldSchema schema;
};
enum CodecType {

View File

@ -71,3 +71,9 @@ target_link_libraries(bench_tantivy
boost_filesystem
dl
)
add_executable(ffi_demo ffi_demo.cpp)
target_link_libraries(ffi_demo
tantivy_binding
dl
)

View File

@ -0,0 +1,17 @@
#include <string>
#include <vector>
#include "tantivy-binding.h"
// Small FFI smoke test: passes an array of C strings to the Rust side, which
// prints each one.
int
main(int argc, char* argv[]) {
    const std::vector<std::string> words{"data1", "data2", "data3"};

    // The binding expects `const char* const*`; the pointers stay valid
    // because `words` outlives the call.
    std::vector<const char*> c_strs;
    for (const auto& word : words) {
        c_strs.push_back(word.c_str());
    }

    print_vector_of_strings(c_strs.data(), c_strs.size());
    return 0;
}

View File

@ -97,6 +97,24 @@ void tantivy_index_add_bools(void *ptr, const bool *array, uintptr_t len);
void tantivy_index_add_keyword(void *ptr, const char *s);
void tantivy_index_add_multi_int8s(void *ptr, const int8_t *array, uintptr_t len);
void tantivy_index_add_multi_int16s(void *ptr, const int16_t *array, uintptr_t len);
void tantivy_index_add_multi_int32s(void *ptr, const int32_t *array, uintptr_t len);
void tantivy_index_add_multi_int64s(void *ptr, const int64_t *array, uintptr_t len);
void tantivy_index_add_multi_f32s(void *ptr, const float *array, uintptr_t len);
void tantivy_index_add_multi_f64s(void *ptr, const double *array, uintptr_t len);
void tantivy_index_add_multi_bools(void *ptr, const bool *array, uintptr_t len);
void tantivy_index_add_multi_keywords(void *ptr, const char *const *array, uintptr_t len);
bool tantivy_index_exist(const char *path);
void print_vector_of_strings(const char *const *ptr, uintptr_t len);
} // extern "C"

View File

@ -0,0 +1,14 @@
use std::{ffi::{c_char, CStr}, slice};
/// Prints every NUL-terminated C string of a caller-provided array on its own line.
///
/// `ptr` must point to `len` consecutive `*const c_char` values, each of which
/// is read with `CStr::from_ptr`. A null `ptr` or a `len` of 0 prints nothing.
#[no_mangle]
pub extern "C" fn print_vector_of_strings(ptr: *const *const c_char, len: usize) {
    // `slice::from_raw_parts` requires a non-null pointer even for zero-length
    // slices, and an empty C++ `std::vector` may report a null `data()`.
    if ptr.is_null() || len == 0 {
        return;
    }
    let arr: &[*const c_char] = unsafe { slice::from_raw_parts(ptr, len) };
    for element in arr {
        let c_str = unsafe { CStr::from_ptr(*element) };
        // Lossy conversion instead of `unwrap()`: a non-UTF-8 string must not
        // make this `extern "C"` function panic.
        println!("{}", c_str.to_string_lossy());
    }
}

View File

@ -1,10 +1,11 @@
use futures::executor::block_on;
use std::ffi::CStr;
use libc::c_char;
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED};
use tantivy::{doc, tokenizer, Index, IndexWriter, SingleSegmentIndexWriter};
use tantivy::{doc, tokenizer, Index, SingleSegmentIndexWriter, Document};
use crate::data_type::TantivyDataType;
use crate::index_writer;
use crate::log::init_log;
pub struct IndexWriterWrapper {
@ -98,7 +99,74 @@ impl IndexWriterWrapper {
.unwrap();
}
pub fn finish(mut self) {
/// Adds a single document whose field carries every value in `datas`,
/// each widened to `i64`. Panics if the writer rejects the document.
pub fn add_multi_i8s(&mut self, datas: &[i8]) {
    let mut doc = Document::default();
    for &value in datas {
        doc.add_field_value(self.field, i64::from(value));
    }
    self.index_writer.add_document(doc).unwrap();
}
/// Adds a single document whose field carries every value in `datas`,
/// each widened to `i64`. Panics if the writer rejects the document.
pub fn add_multi_i16s(&mut self, datas: &[i16]) {
    let mut doc = Document::default();
    for &value in datas {
        doc.add_field_value(self.field, i64::from(value));
    }
    self.index_writer.add_document(doc).unwrap();
}
/// Adds a single document whose field carries every value in `datas`,
/// each widened to `i64`. Panics if the writer rejects the document.
pub fn add_multi_i32s(&mut self, datas: &[i32]) {
    let mut doc = Document::default();
    for &value in datas {
        doc.add_field_value(self.field, i64::from(value));
    }
    self.index_writer.add_document(doc).unwrap();
}
/// Adds a single document whose field carries every `i64` in `datas`.
/// Panics if the writer rejects the document.
pub fn add_multi_i64s(&mut self, datas: &[i64]) {
    let mut doc = Document::default();
    for &value in datas {
        doc.add_field_value(self.field, value);
    }
    self.index_writer.add_document(doc).unwrap();
}
/// Adds a single document whose field carries every value in `datas`,
/// each widened to `f64`. Panics if the writer rejects the document.
pub fn add_multi_f32s(&mut self, datas: &[f32]) {
    let mut doc = Document::default();
    for &value in datas {
        doc.add_field_value(self.field, f64::from(value));
    }
    self.index_writer.add_document(doc).unwrap();
}
/// Adds a single document whose field carries every `f64` in `datas`.
/// Panics if the writer rejects the document.
pub fn add_multi_f64s(&mut self, datas: &[f64]) {
    let mut doc = Document::default();
    for &value in datas {
        doc.add_field_value(self.field, value);
    }
    self.index_writer.add_document(doc).unwrap();
}
/// Adds a single document whose field carries every `bool` in `datas`.
/// Panics if the writer rejects the document.
pub fn add_multi_bools(&mut self, datas: &[bool]) {
    let mut doc = Document::default();
    for &flag in datas {
        doc.add_field_value(self.field, flag);
    }
    self.index_writer.add_document(doc).unwrap();
}
/// Adds a single document whose field carries every C string in `datas`.
///
/// Each pointer is read with `CStr::from_ptr`; the call panics if a string is
/// not valid UTF-8 or if the writer rejects the document.
pub fn add_multi_keywords(&mut self, datas: &[*const c_char]) {
    let mut doc = Document::default();
    for &raw in datas {
        let c_str = unsafe { CStr::from_ptr(raw) };
        let keyword = c_str.to_str().unwrap();
        doc.add_field_value(self.field, keyword);
    }
    self.index_writer.add_document(doc).unwrap();
}
pub fn finish(self) {
self.index_writer
.finalize()
.expect("failed to build inverted index");

View File

@ -122,3 +122,77 @@ pub extern "C" fn tantivy_index_add_keyword(ptr: *mut c_void, s: *const c_char)
let c_str = unsafe { CStr::from_ptr(s) };
unsafe { (*real).add_keyword(c_str.to_str().unwrap()) }
}
// --------------------------------------------- array ------------------------------------------
/// FFI entry point: forwards `len` `i8` values starting at `array` to
/// `IndexWriterWrapper::add_multi_i8s` on the writer behind `ptr`.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int8s(ptr: *mut c_void, array: *const i8, len: usize) {
    let real = ptr as *mut IndexWriterWrapper;
    unsafe {
        // `slice::from_raw_parts` requires a non-null pointer even for zero
        // length, and an empty C++ container may report a null `data()`.
        let arr: &[i8] = if len == 0 {
            &[]
        } else {
            slice::from_raw_parts(array, len)
        };
        (*real).add_multi_i8s(arr)
    }
}
/// FFI entry point: forwards `len` `i16` values starting at `array` to
/// `IndexWriterWrapper::add_multi_i16s` on the writer behind `ptr`.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int16s(ptr: *mut c_void, array: *const i16, len: usize) {
    let real = ptr as *mut IndexWriterWrapper;
    unsafe {
        // `slice::from_raw_parts` requires a non-null pointer even for zero
        // length, and an empty C++ container may report a null `data()`.
        let arr: &[i16] = if len == 0 {
            &[]
        } else {
            slice::from_raw_parts(array, len)
        };
        (*real).add_multi_i16s(arr);
    }
}
/// FFI entry point: forwards `len` `i32` values starting at `array` to
/// `IndexWriterWrapper::add_multi_i32s` on the writer behind `ptr`.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int32s(ptr: *mut c_void, array: *const i32, len: usize) {
    let real = ptr as *mut IndexWriterWrapper;
    unsafe {
        // `slice::from_raw_parts` requires a non-null pointer even for zero
        // length, and an empty C++ container may report a null `data()`.
        let arr: &[i32] = if len == 0 {
            &[]
        } else {
            slice::from_raw_parts(array, len)
        };
        (*real).add_multi_i32s(arr);
    }
}
/// FFI entry point: forwards `len` `i64` values starting at `array` to
/// `IndexWriterWrapper::add_multi_i64s` on the writer behind `ptr`.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int64s(ptr: *mut c_void, array: *const i64, len: usize) {
    let real = ptr as *mut IndexWriterWrapper;
    unsafe {
        // `slice::from_raw_parts` requires a non-null pointer even for zero
        // length, and an empty C++ container may report a null `data()`.
        let arr: &[i64] = if len == 0 {
            &[]
        } else {
            slice::from_raw_parts(array, len)
        };
        (*real).add_multi_i64s(arr);
    }
}
/// FFI entry point: forwards `len` `f32` values starting at `array` to
/// `IndexWriterWrapper::add_multi_f32s` on the writer behind `ptr`.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_f32s(ptr: *mut c_void, array: *const f32, len: usize) {
    let real = ptr as *mut IndexWriterWrapper;
    unsafe {
        // `slice::from_raw_parts` requires a non-null pointer even for zero
        // length, and an empty C++ container may report a null `data()`.
        let arr: &[f32] = if len == 0 {
            &[]
        } else {
            slice::from_raw_parts(array, len)
        };
        (*real).add_multi_f32s(arr);
    }
}
/// FFI entry point: forwards `len` `f64` values starting at `array` to
/// `IndexWriterWrapper::add_multi_f64s` on the writer behind `ptr`.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_f64s(ptr: *mut c_void, array: *const f64, len: usize) {
    let real = ptr as *mut IndexWriterWrapper;
    unsafe {
        // `slice::from_raw_parts` requires a non-null pointer even for zero
        // length, and an empty C++ container may report a null `data()`.
        let arr: &[f64] = if len == 0 {
            &[]
        } else {
            slice::from_raw_parts(array, len)
        };
        (*real).add_multi_f64s(arr);
    }
}
/// FFI entry point: forwards `len` `bool` values starting at `array` to
/// `IndexWriterWrapper::add_multi_bools` on the writer behind `ptr`.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_bools(ptr: *mut c_void, array: *const bool, len: usize) {
    let real = ptr as *mut IndexWriterWrapper;
    unsafe {
        // `slice::from_raw_parts` requires a non-null pointer even for zero
        // length, and an empty C++ container may report a null `data()`.
        let arr: &[bool] = if len == 0 {
            &[]
        } else {
            slice::from_raw_parts(array, len)
        };
        (*real).add_multi_bools(arr);
    }
}
/// FFI entry point: forwards `len` C-string pointers starting at `array` to
/// `IndexWriterWrapper::add_multi_keywords` on the writer behind `ptr`.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_keywords(ptr: *mut c_void, array: *const *const c_char, len: usize) {
    let real = ptr as *mut IndexWriterWrapper;
    unsafe {
        // `slice::from_raw_parts` requires a non-null pointer even for zero
        // length, and an empty C++ container may report a null `data()`.
        let arr: &[*const c_char] = if len == 0 {
            &[]
        } else {
            slice::from_raw_parts(array, len)
        };
        (*real).add_multi_keywords(arr)
    }
}

View File

@ -10,6 +10,7 @@ mod log;
mod util;
mod util_c;
mod vec_collector;
mod demo_c;
pub fn add(left: usize, right: usize) -> usize {
left + right

View File

@ -1,5 +1,7 @@
#include <sstream>
#include <fmt/format.h>
#include <set>
#include <iostream>
#include "tantivy-binding.h"
namespace milvus::tantivy {
@ -49,6 +51,15 @@ struct RustArrayWrapper {
std::cout << ss.str() << std::endl;
}
// Collects the first `array_.len` elements of `array_.array` into an ordered
// set, dropping duplicates.
std::set<uint32_t>
to_set() {
    std::set<uint32_t> s;
    // Index with the length's own type: an `int` counter compared signed
    // against unsigned and could overflow before reaching a large `len`.
    for (decltype(array_.len) i = 0; i < array_.len; ++i) {
        s.insert(array_.array[i]);
    }
    return s;
}
RustArray array_;
private:
@ -186,6 +197,60 @@ struct TantivyIndexWrapper {
typeid(T).name());
}
// Adds one multi-valued entry made of the `len` elements at `array` by
// dispatching to the tantivy binding function that matches `T`.
//
// Supported `T`: bool, int8_t, int16_t, int32_t, int64_t, float, double and
// std::string. Any other `T` throws the formatted message (the value returned
// by fmt::format, not a std::exception subclass). Asserts that the index has
// not been finished yet.
template <typename T>
void
add_multi_data(const T* array, uintptr_t len) {
    assert(!finished_);

    if constexpr (std::is_same_v<T, bool>) {
        tantivy_index_add_multi_bools(writer_, array, len);
    } else if constexpr (std::is_same_v<T, int8_t>) {
        tantivy_index_add_multi_int8s(writer_, array, len);
    } else if constexpr (std::is_same_v<T, int16_t>) {
        tantivy_index_add_multi_int16s(writer_, array, len);
    } else if constexpr (std::is_same_v<T, int32_t>) {
        tantivy_index_add_multi_int32s(writer_, array, len);
    } else if constexpr (std::is_same_v<T, int64_t>) {
        tantivy_index_add_multi_int64s(writer_, array, len);
    } else if constexpr (std::is_same_v<T, float>) {
        tantivy_index_add_multi_f32s(writer_, array, len);
    } else if constexpr (std::is_same_v<T, double>) {
        tantivy_index_add_multi_f64s(writer_, array, len);
    } else if constexpr (std::is_same_v<T, std::string>) {
        // The binding expects an array of C strings; the pointers borrow from
        // `array` and are only used for the duration of the call.
        std::vector<const char*> views;
        views.reserve(len);  // size is known up front, avoid regrowth
        for (uintptr_t i = 0; i < len; i++) {
            views.push_back(array[i].c_str());
        }
        tantivy_index_add_multi_keywords(writer_, views.data(), len);
    } else {
        throw fmt::format(
            "InvertedIndex.add_multi_data: unsupported data type: {}",
            typeid(T).name());
    }
}
inline void
finish() {
if (!finished_) {

View File

@ -200,6 +200,77 @@ test_32717() {
}
}
// Builds a reference inverted index: maps every term to the set of row
// offsets (positions in `vec_of_array`) whose array contains that term.
template <typename T>
std::map<T, std::set<uint32_t>>
build_inverted_index(const std::vector<std::vector<T>>& vec_of_array) {
    std::map<T, std::set<uint32_t>> postings;
    uint32_t row_id = 0;
    for (const auto& row : vec_of_array) {
        for (const auto& term : row) {
            postings[term].insert(row_id);
        }
        ++row_id;
    }
    return postings;
}
void
test_array_int() {
using T = int64_t;
auto path = "/tmp/inverted-index/test-binding/";
boost::filesystem::remove_all(path);
boost::filesystem::create_directories(path);
auto w = TantivyIndexWrapper("test_field_name", guess_data_type<T>(), path);
std::vector<std::vector<T>> vec_of_array{
{10, 40, 50},
{20, 50},
{10, 50, 60},
};
for (const auto& arr : vec_of_array) {
w.add_multi_data(arr.data(), arr.size());
}
w.finish();
assert(w.count() == vec_of_array.size());
auto inverted_index = build_inverted_index(vec_of_array);
for (const auto& [term, posting_list] : inverted_index) {
auto hits = w.term_query(term).to_set();
assert(posting_list == hits);
}
}
void
test_array_string() {
using T = std::string;
auto path = "/tmp/inverted-index/test-binding/";
boost::filesystem::remove_all(path);
boost::filesystem::create_directories(path);
auto w =
TantivyIndexWrapper("test_field_name", TantivyDataType::Keyword, path);
std::vector<std::vector<T>> vec_of_array{
{"10", "40", "50"},
{"20", "50"},
{"10", "50", "60"},
};
for (const auto& arr : vec_of_array) {
w.add_multi_data(arr.data(), arr.size());
}
w.finish();
assert(w.count() == vec_of_array.size());
auto inverted_index = build_inverted_index(vec_of_array);
for (const auto& [term, posting_list] : inverted_index) {
auto hits = w.term_query(term).to_set();
assert(posting_list == hits);
}
}
int
main(int argc, char* argv[]) {
test_32717();
@ -216,5 +287,8 @@ main(int argc, char* argv[]) {
run<std::string>();
test_array_int();
test_array_string();
return 0;
}

View File

@ -32,13 +32,20 @@ auto
gen_field_meta(int64_t collection_id = 1,
int64_t partition_id = 2,
int64_t segment_id = 3,
int64_t field_id = 101) -> storage::FieldDataMeta {
return storage::FieldDataMeta{
int64_t field_id = 101,
DataType data_type = DataType::NONE,
DataType element_type = DataType::NONE)
-> storage::FieldDataMeta {
auto meta = storage::FieldDataMeta{
.collection_id = collection_id,
.partition_id = partition_id,
.segment_id = segment_id,
.field_id = field_id,
};
meta.schema.set_data_type(static_cast<proto::schema::DataType>(data_type));
meta.schema.set_element_type(
static_cast<proto::schema::DataType>(element_type));
return meta;
}
auto
@ -86,7 +93,7 @@ struct ChunkManagerWrapper {
};
} // namespace milvus::test
template <typename T, DataType dtype>
template <typename T, DataType dtype, DataType element_type = DataType::NONE>
void
test_run() {
int64_t collection_id = 1;
@ -96,8 +103,8 @@ test_run() {
int64_t index_build_id = 1000;
int64_t index_version = 10000;
auto field_meta =
test::gen_field_meta(collection_id, partition_id, segment_id, field_id);
auto field_meta = test::gen_field_meta(
collection_id, partition_id, segment_id, field_id, dtype, element_type);
auto index_meta = test::gen_index_meta(
segment_id, field_id, index_build_id, index_version);
@ -305,8 +312,12 @@ test_string() {
int64_t index_build_id = 1000;
int64_t index_version = 10000;
auto field_meta =
test::gen_field_meta(collection_id, partition_id, segment_id, field_id);
auto field_meta = test::gen_field_meta(collection_id,
partition_id,
segment_id,
field_id,
dtype,
DataType::NONE);
auto index_meta = test::gen_index_meta(
segment_id, field_id, index_build_id, index_version);

View File

@ -53,6 +53,14 @@ TYPED_TEST_P(TypedScalarIndexTest, Dummy) {
std::cout << milvus::GetDType<T>() << std::endl;
}
// Returns a value-initialized FileManagerContext whose field schema data type
// is set to `data_type` (cast to the proto schema enum).
auto
GetTempFileManagerCtx(CDataType data_type) {
    milvus::storage::FileManagerContext file_manager_ctx{};
    const auto schema_type =
        static_cast<milvus::proto::schema::DataType>(data_type);
    file_manager_ctx.fieldDataMeta.schema.set_data_type(schema_type);
    return file_manager_ctx;
}
TYPED_TEST_P(TypedScalarIndexTest, Constructor) {
using T = TypeParam;
auto dtype = milvus::GetDType<T>();
@ -63,7 +71,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Constructor) {
create_index_info.index_type = index_type;
auto index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info);
create_index_info, GetTempFileManagerCtx(dtype));
}
}
@ -77,7 +85,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Count) {
create_index_info.index_type = index_type;
auto index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info);
create_index_info, GetTempFileManagerCtx(dtype));
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenSortedArr<T>(nb);
@ -96,7 +104,7 @@ TYPED_TEST_P(TypedScalarIndexTest, HasRawData) {
create_index_info.index_type = index_type;
auto index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info);
create_index_info, GetTempFileManagerCtx(dtype));
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenSortedArr<T>(nb);
@ -116,7 +124,7 @@ TYPED_TEST_P(TypedScalarIndexTest, In) {
create_index_info.index_type = index_type;
auto index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info);
create_index_info, GetTempFileManagerCtx(dtype));
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenSortedArr<T>(nb);
@ -135,7 +143,7 @@ TYPED_TEST_P(TypedScalarIndexTest, NotIn) {
create_index_info.index_type = index_type;
auto index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info);
create_index_info, GetTempFileManagerCtx(dtype));
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenSortedArr<T>(nb);
@ -154,7 +162,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Reverse) {
create_index_info.index_type = index_type;
auto index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info);
create_index_info, GetTempFileManagerCtx(dtype));
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenSortedArr<T>(nb);
@ -173,7 +181,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Range) {
create_index_info.index_type = index_type;
auto index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info);
create_index_info, GetTempFileManagerCtx(dtype));
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenSortedArr<T>(nb);
@ -192,7 +200,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Codec) {
create_index_info.index_type = index_type;
auto index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info);
create_index_info, GetTempFileManagerCtx(dtype));
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenSortedArr<T>(nb);
@ -201,7 +209,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Codec) {
auto binary_set = index->Serialize(nullptr);
auto copy_index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info);
create_index_info, GetTempFileManagerCtx(dtype));
copy_index->Load(binary_set);
auto copy_scalar_index =
@ -372,6 +380,8 @@ TYPED_TEST_P(TypedScalarIndexTestV2, Base) {
auto space = TestSpace<T>(temp_path, vec_size, dataset, scalars);
milvus::storage::FileManagerContext file_manager_context(
{}, {.field_name = "scalar"}, chunk_manager, space);
file_manager_context.fieldDataMeta.schema.set_data_type(
static_cast<milvus::proto::schema::DataType>(dtype));
auto index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info, file_manager_context, space);

View File

@ -0,0 +1,23 @@
syntax = "proto3";
package milvus.proto.cgo;
option go_package="github.com/milvus-io/milvus/internal/proto/cgopb";
import "schema.proto";
// LoadIndexInfo is the message the Go segment loader fills in, marshals and
// passes through cgo to FinishLoadIndexInfo when loading an index.
// Field number 4 is not used by this message.
message LoadIndexInfo {
int64 collectionID = 1;
int64 partitionID = 2;
int64 segmentID = 3;
// Schema of the indexed field (the Go caller looks it up by field ID).
schema.FieldSchema field = 5;
bool enable_mmap = 6;
// The Go caller fills this from the query node's mmap directory setting.
string mmap_dir_path = 7;
int64 indexID = 8;
int64 index_buildID = 9;
int64 index_version = 10;
map<string, string> index_params = 11;
repeated string index_files = 12;
// Storage URI; the Go caller only sets it when storage v2 is enabled.
string uri = 13;
int64 index_store_version = 14;
int32 index_engine_version = 15;
}

View File

@ -29,11 +29,13 @@ import (
"runtime"
"unsafe"
"github.com/golang/protobuf/proto"
"github.com/pingcap/log"
"go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/datacoord"
"github.com/milvus-io/milvus/internal/proto/cgopb"
"github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/pkg/common"
@ -245,3 +247,33 @@ func (li *LoadIndexInfo) appendIndexEngineVersion(ctx context.Context, indexEngi
return HandleCStatus(ctx, &status, "AppendIndexEngineVersion failed")
}
// finish marshals info, hands the bytes to C.FinishLoadIndexInfo on the
// dynamic pool, and then appends the index on the load pool (AppendIndexV3
// when storage v2 is enabled, AppendIndexV2 otherwise).
//
// It returns the marshal error, a pool submission error, or the error derived
// from the C status of whichever step failed.
func (li *LoadIndexInfo) finish(ctx context.Context, info *cgopb.LoadIndexInfo) error {
	marshaled, err := proto.Marshal(info)
	if err != nil {
		return err
	}

	// A message with no populated fields marshals to a zero-length slice;
	// taking &marshaled[0] would panic, so pass a nil pointer with length 0.
	var serialized *C.uint8_t
	if len(marshaled) > 0 {
		serialized = (*C.uint8_t)(unsafe.Pointer(&marshaled[0]))
	}

	var status C.CStatus
	// If the task never ran, status is still zero-valued and must not be
	// read as success, so surface the pool error instead.
	if _, err := GetDynamicPool().Submit(func() (any, error) {
		status = C.FinishLoadIndexInfo(li.cLoadIndexInfo, serialized, (C.uint64_t)(len(marshaled)))
		return nil, nil
	}).Await(); err != nil {
		return err
	}

	if err := HandleCStatus(ctx, &status, "FinishLoadIndexInfo failed"); err != nil {
		return err
	}

	if _, err := GetLoadPool().Submit(func() (any, error) {
		if paramtable.Get().CommonCfg.EnableStorageV2.GetAsBool() {
			status = C.AppendIndexV3(li.cLoadIndexInfo)
		} else {
			traceCtx := ParseCTraceContext(ctx)
			status = C.AppendIndexV2(traceCtx.ctx, li.cLoadIndexInfo)
			// Keep traceCtx alive until the C call has returned.
			runtime.KeepAlive(traceCtx)
		}
		return nil, nil
	}).Await(); err != nil {
		return err
	}

	return HandleCStatus(ctx, &status, "AppendIndex failed")
}

View File

@ -45,6 +45,7 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
milvus_storage "github.com/milvus-io/milvus-storage/go/storage"
"github.com/milvus-io/milvus-storage/go/storage/options"
"github.com/milvus-io/milvus/internal/proto/cgopb"
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/proto/segcorepb"
@ -56,6 +57,9 @@ import (
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/metrics"
"github.com/milvus-io/milvus/pkg/util/funcutil"
"github.com/milvus-io/milvus/pkg/util/indexparamcheck"
"github.com/milvus-io/milvus/pkg/util/indexparams"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/metautil"
"github.com/milvus-io/milvus/pkg/util/paramtable"
@ -1266,18 +1270,58 @@ func (s *LocalSegment) LoadIndex(ctx context.Context, indexInfo *querypb.FieldIn
return err
}
defer deleteLoadIndexInfo(loadIndexInfo)
schema, err := typeutil.CreateSchemaHelper(s.GetCollection().Schema())
if err != nil {
return err
}
fieldSchema, err := schema.GetFieldFromID(indexInfo.GetFieldID())
if err != nil {
return err
}
indexParams := funcutil.KeyValuePair2Map(indexInfo.IndexParams)
// as Knowhere reports error if encounter an unknown param, we need to delete it
delete(indexParams, common.MmapEnabledKey)
// some build params also exist in indexParams, which are useless during loading process
if indexParams["index_type"] == indexparamcheck.IndexDISKANN {
if err := indexparams.SetDiskIndexLoadParams(paramtable.Get(), indexParams, indexInfo.GetNumRows()); err != nil {
return err
}
}
if err := indexparams.AppendPrepareLoadParams(paramtable.Get(), indexParams); err != nil {
return err
}
indexInfoProto := &cgopb.LoadIndexInfo{
CollectionID: s.Collection(),
PartitionID: s.Partition(),
SegmentID: s.ID(),
Field: fieldSchema,
EnableMmap: isIndexMmapEnable(indexInfo),
MmapDirPath: paramtable.Get().QueryNodeCfg.MmapDirPath.GetValue(),
IndexID: indexInfo.GetIndexID(),
IndexBuildID: indexInfo.GetBuildID(),
IndexVersion: indexInfo.GetIndexVersion(),
IndexParams: indexParams,
IndexFiles: indexInfo.GetIndexFilePaths(),
IndexEngineVersion: indexInfo.GetCurrentIndexVersion(),
IndexStoreVersion: indexInfo.GetIndexStoreVersion(),
}
if paramtable.Get().CommonCfg.EnableStorageV2.GetAsBool() {
uri, err := typeutil_internal.GetStorageURI(paramtable.Get().CommonCfg.StorageScheme.GetValue(), paramtable.Get().CommonCfg.StoragePathPrefix.GetValue(), s.ID())
if err != nil {
return err
}
loadIndexInfo.appendStorageInfo(uri, indexInfo.IndexStoreVersion)
indexInfoProto.Uri = uri
}
newLoadIndexInfoSpan := tr.RecordSpan()
// 2.
err = loadIndexInfo.appendLoadIndexInfo(ctx, indexInfo, s.Collection(), s.Partition(), s.ID(), fieldType)
if err != nil {
if err := loadIndexInfo.finish(ctx, indexInfoProto); err != nil {
if loadIndexInfo.cleanLocalData(ctx) != nil {
log.Warn("failed to clean cached data on disk after append index failed",
zap.Int64("buildID", indexInfo.BuildID),

View File

@ -17,7 +17,8 @@ func (c *INVERTEDChecker) CheckTrain(params map[string]string) error {
}
func (c *INVERTEDChecker) CheckValidDataType(dType schemapb.DataType) error {
if !typeutil.IsBoolType(dType) && !typeutil.IsArithmetic(dType) && !typeutil.IsStringType(dType) {
if !typeutil.IsBoolType(dType) && !typeutil.IsArithmetic(dType) && !typeutil.IsStringType(dType) &&
!typeutil.IsArrayType(dType) {
return fmt.Errorf("INVERTED are not supported on %s field", dType.String())
}
return nil

View File

@ -18,8 +18,8 @@ func Test_INVERTEDIndexChecker(t *testing.T) {
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Bool))
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Int64))
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Float))
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Array))
assert.Error(t, c.CheckValidDataType(schemapb.DataType_JSON))
assert.Error(t, c.CheckValidDataType(schemapb.DataType_Array))
assert.Error(t, c.CheckValidDataType(schemapb.DataType_FloatVector))
}

View File

@ -44,6 +44,7 @@ pushd ${PROTO_DIR}
mkdir -p etcdpb
mkdir -p indexcgopb
mkdir -p cgopb
mkdir -p internalpb
mkdir -p rootcoordpb
@ -62,6 +63,7 @@ protoc_opt="${PROTOC_BIN} --proto_path=${API_PROTO_DIR} --proto_path=."
${protoc_opt} --go_out=plugins=grpc,paths=source_relative:./etcdpb etcd_meta.proto || { echo 'generate etcd_meta.proto failed'; exit 1; }
${protoc_opt} --go_out=plugins=grpc,paths=source_relative:./indexcgopb index_cgo_msg.proto || { echo 'generate index_cgo_msg failed '; exit 1; }
${protoc_opt} --go_out=plugins=grpc,paths=source_relative:./cgopb cgo_msg.proto || { echo 'generate cgo_msg failed '; exit 1; }
${protoc_opt} --go_out=plugins=grpc,paths=source_relative:./rootcoordpb root_coord.proto || { echo 'generate root_coord.proto failed'; exit 1; }
${protoc_opt} --go_out=plugins=grpc,paths=source_relative:./internalpb internal.proto || { echo 'generate internal.proto failed'; exit 1; }
${protoc_opt} --go_out=plugins=grpc,paths=source_relative:./proxypb proxy.proto|| { echo 'generate proxy.proto failed'; exit 1; }
@ -78,6 +80,7 @@ ${protoc_opt} --cpp_out=$CPP_SRC_DIR/src/pb schema.proto|| { echo 'generate sche
${protoc_opt} --cpp_out=$CPP_SRC_DIR/src/pb common.proto|| { echo 'generate common.proto failed'; exit 1; }
${protoc_opt} --cpp_out=$CPP_SRC_DIR/src/pb segcore.proto|| { echo 'generate segcore.proto failed'; exit 1; }
${protoc_opt} --cpp_out=$CPP_SRC_DIR/src/pb index_cgo_msg.proto|| { echo 'generate index_cgo_msg.proto failed'; exit 1; }
${protoc_opt} --cpp_out=$CPP_SRC_DIR/src/pb cgo_msg.proto|| { echo 'generate cgo_msg.proto failed'; exit 1; }
${protoc_opt} --cpp_out=$CPP_SRC_DIR/src/pb plan.proto|| { echo 'generate plan.proto failed'; exit 1; }
popd

View File

@ -1309,10 +1309,7 @@ class TestIndexInvalid(TestcaseBase):
collection_w = self.init_collection_wrap(schema=schema)
# 2. create index
scalar_index_params = {"index_type": "INVERTED"}
collection_w.create_index(ct.default_int32_array_field_name, index_params=scalar_index_params,
check_task=CheckTasks.err_res,
check_items={ct.err_code: 1100,
ct.err_msg: "create index on Array field is not supported"})
collection_w.create_index(ct.default_int32_array_field_name, index_params=scalar_index_params)
@pytest.mark.tags(CaseLabel.L1)
def test_create_inverted_index_no_vector_index(self):