mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
issue: https://github.com/milvus-io/milvus/issues/35528 This PR introduces a JSON flat index that allows indexing JSON fields and dynamic fields in the same way as other field types. In a previous PR (#36750), we implemented a JSON index that requires specifying a JSON path and casting a type. The only distinction lies in the json_cast_type parameter. When json_cast_type is set to JSON type, Milvus automatically creates a JSON flat index. For details on how Tantivy interprets JSON data, refer to the [tantivy documentation](https://github.com/quickwit-oss/tantivy/blob/main/doc/src/json.md#pitfalls-limitation-and-corner-cases). Limitations Array handling: Arrays do not function as nested objects. See the [limitations section](https://github.com/quickwit-oss/tantivy/blob/main/doc/src/json.md#arrays-do-not-work-like-nested-object) for more details. --------- Signed-off-by: sunby <sunbingyi1992@gmail.com>
547 lines
22 KiB
C++
547 lines
22 KiB
C++
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "index/IndexFactory.h"
|
|
#include <cstdlib>
|
|
#include <memory>
|
|
#include <string>
|
|
#include "common/EasyAssert.h"
|
|
#include "common/FieldDataInterface.h"
|
|
#include "common/JsonCastType.h"
|
|
#include "common/Types.h"
|
|
#include "index/Index.h"
|
|
#include "index/JsonFlatIndex.h"
|
|
#include "index/VectorMemIndex.h"
|
|
#include "index/Utils.h"
|
|
#include "index/Meta.h"
|
|
#include "index/JsonInvertedIndex.h"
|
|
#include "knowhere/utils.h"
|
|
|
|
#include "index/VectorDiskIndex.h"
|
|
#include "index/ScalarIndexSort.h"
|
|
#include "index/StringIndexMarisa.h"
|
|
#include "index/BoolIndex.h"
|
|
#include "index/InvertedIndexTantivy.h"
|
|
#include "index/HybridScalarIndex.h"
|
|
#include "knowhere/comp/knowhere_check.h"
|
|
#include "log/Log.h"
|
|
#include "pb/schema.pb.h"
|
|
|
|
namespace milvus::index {
|
|
|
|
template <typename T>
|
|
ScalarIndexPtr<T>
|
|
IndexFactory::CreatePrimitiveScalarIndex(
|
|
const CreateIndexInfo& create_index_info,
|
|
const storage::FileManagerContext& file_manager_context) {
|
|
auto index_type = create_index_info.index_type;
|
|
if (index_type == INVERTED_INDEX_TYPE) {
|
|
assert(create_index_info.tantivy_index_version != 0);
|
|
// scalar_index_engine_version 0 means we should built tantivy index within single segment
|
|
return std::make_unique<InvertedIndexTantivy<T>>(
|
|
create_index_info.tantivy_index_version,
|
|
file_manager_context,
|
|
create_index_info.scalar_index_engine_version == 0);
|
|
}
|
|
if (index_type == BITMAP_INDEX_TYPE) {
|
|
return std::make_unique<BitmapIndex<T>>(file_manager_context);
|
|
}
|
|
if (index_type == HYBRID_INDEX_TYPE) {
|
|
return std::make_unique<HybridScalarIndex<T>>(
|
|
create_index_info.tantivy_index_version, file_manager_context);
|
|
}
|
|
return CreateScalarIndexSort<T>(file_manager_context);
|
|
}
|
|
|
|
// template <>
|
|
// inline ScalarIndexPtr<bool>
|
|
// IndexFactory::CreateScalarIndex(const IndexType& index_type) {
|
|
// return CreateBoolIndex();
|
|
//}
|
|
//
|
|
|
|
template <>
|
|
ScalarIndexPtr<std::string>
|
|
IndexFactory::CreatePrimitiveScalarIndex<std::string>(
|
|
const CreateIndexInfo& create_index_info,
|
|
const storage::FileManagerContext& file_manager_context) {
|
|
auto index_type = create_index_info.index_type;
|
|
#if defined(__linux__) || defined(__APPLE__)
|
|
if (index_type == INVERTED_INDEX_TYPE) {
|
|
assert(create_index_info.tantivy_index_version != 0);
|
|
// scalar_index_engine_version 0 means we should built tantivy index within single segment
|
|
return std::make_unique<InvertedIndexTantivy<std::string>>(
|
|
create_index_info.tantivy_index_version,
|
|
file_manager_context,
|
|
create_index_info.scalar_index_engine_version == 0);
|
|
}
|
|
if (index_type == BITMAP_INDEX_TYPE) {
|
|
return std::make_unique<BitmapIndex<std::string>>(file_manager_context);
|
|
}
|
|
if (index_type == HYBRID_INDEX_TYPE) {
|
|
return std::make_unique<HybridScalarIndex<std::string>>(
|
|
create_index_info.tantivy_index_version, file_manager_context);
|
|
}
|
|
return CreateStringIndexMarisa(file_manager_context);
|
|
#else
|
|
PanicInfo(Unsupported, "unsupported platform");
|
|
#endif
|
|
}
|
|
|
|
LoadResourceRequest
|
|
IndexFactory::IndexLoadResource(
|
|
DataType field_type,
|
|
IndexVersion index_version,
|
|
float index_size,
|
|
const std::map<std::string, std::string>& index_params,
|
|
bool mmap_enable) {
|
|
if (milvus::IsVectorDataType(field_type)) {
|
|
return VecIndexLoadResource(
|
|
field_type, index_version, index_size, index_params, mmap_enable);
|
|
} else {
|
|
return ScalarIndexLoadResource(
|
|
field_type, index_version, index_size, index_params, mmap_enable);
|
|
}
|
|
}
|
|
|
|
LoadResourceRequest
|
|
IndexFactory::VecIndexLoadResource(
|
|
DataType field_type,
|
|
IndexVersion index_version,
|
|
float index_size,
|
|
const std::map<std::string, std::string>& index_params,
|
|
bool mmap_enable) {
|
|
auto config = milvus::index::ParseConfigFromIndexParams(index_params);
|
|
|
|
AssertInfo(index_params.find("index_type") != index_params.end(),
|
|
"index type is empty");
|
|
std::string index_type = index_params.at("index_type");
|
|
|
|
bool mmaped = false;
|
|
if (mmap_enable &&
|
|
knowhere::KnowhereCheck::SupportMmapIndexTypeCheck(index_type)) {
|
|
config["enable_mmap"] = true;
|
|
mmaped = true;
|
|
}
|
|
|
|
knowhere::expected<knowhere::Resource> resource;
|
|
float index_size_gb = index_size * 1.0 / 1024.0 / 1024.0 / 1024.0;
|
|
float download_buffer_size_gb =
|
|
DEFAULT_FIELD_MAX_MEMORY_LIMIT * 1.0 / 1024.0 / 1024.0 / 1024.0;
|
|
|
|
bool has_raw_data = false;
|
|
switch (field_type) {
|
|
case milvus::DataType::VECTOR_BINARY:
|
|
resource = knowhere::IndexStaticFaced<
|
|
knowhere::bin1>::EstimateLoadResource(index_type,
|
|
index_version,
|
|
index_size_gb,
|
|
config);
|
|
has_raw_data =
|
|
knowhere::IndexStaticFaced<knowhere::bin1>::HasRawData(
|
|
index_type, index_version, config);
|
|
break;
|
|
case milvus::DataType::VECTOR_FLOAT:
|
|
resource = knowhere::IndexStaticFaced<
|
|
knowhere::fp32>::EstimateLoadResource(index_type,
|
|
index_version,
|
|
index_size_gb,
|
|
config);
|
|
has_raw_data =
|
|
knowhere::IndexStaticFaced<knowhere::fp32>::HasRawData(
|
|
index_type, index_version, config);
|
|
break;
|
|
case milvus::DataType::VECTOR_FLOAT16:
|
|
resource = knowhere::IndexStaticFaced<
|
|
knowhere::fp16>::EstimateLoadResource(index_type,
|
|
index_version,
|
|
index_size_gb,
|
|
config);
|
|
has_raw_data =
|
|
knowhere::IndexStaticFaced<knowhere::fp16>::HasRawData(
|
|
index_type, index_version, config);
|
|
break;
|
|
case milvus::DataType::VECTOR_BFLOAT16:
|
|
resource = knowhere::IndexStaticFaced<
|
|
knowhere::bf16>::EstimateLoadResource(index_type,
|
|
index_version,
|
|
index_size_gb,
|
|
config);
|
|
has_raw_data =
|
|
knowhere::IndexStaticFaced<knowhere::bf16>::HasRawData(
|
|
index_type, index_version, config);
|
|
break;
|
|
case milvus::DataType::VECTOR_SPARSE_FLOAT:
|
|
resource = knowhere::IndexStaticFaced<
|
|
knowhere::fp32>::EstimateLoadResource(index_type,
|
|
index_version,
|
|
index_size_gb,
|
|
config);
|
|
has_raw_data =
|
|
knowhere::IndexStaticFaced<knowhere::fp32>::HasRawData(
|
|
index_type, index_version, config);
|
|
break;
|
|
case milvus::DataType::VECTOR_INT8:
|
|
resource = knowhere::IndexStaticFaced<
|
|
knowhere::int8>::EstimateLoadResource(index_type,
|
|
index_version,
|
|
index_size_gb,
|
|
config);
|
|
has_raw_data =
|
|
knowhere::IndexStaticFaced<knowhere::int8>::HasRawData(
|
|
index_type, index_version, config);
|
|
break;
|
|
default:
|
|
LOG_ERROR("invalid data type to estimate index load resource: {}",
|
|
field_type);
|
|
return LoadResourceRequest{0, 0, 0, 0, true};
|
|
}
|
|
|
|
LoadResourceRequest request{};
|
|
|
|
request.has_raw_data = has_raw_data;
|
|
request.final_disk_cost = resource.value().diskCost;
|
|
request.final_memory_cost = resource.value().memoryCost;
|
|
if (knowhere::UseDiskLoad(index_type, index_version) || mmaped) {
|
|
request.max_disk_cost = resource.value().diskCost;
|
|
request.max_memory_cost =
|
|
std::max(resource.value().memoryCost, download_buffer_size_gb);
|
|
} else {
|
|
request.max_disk_cost = 0;
|
|
request.max_memory_cost = 2 * resource.value().memoryCost;
|
|
}
|
|
return request;
|
|
}
|
|
|
|
LoadResourceRequest
|
|
IndexFactory::ScalarIndexLoadResource(
|
|
DataType field_type,
|
|
IndexVersion index_version,
|
|
float index_size,
|
|
const std::map<std::string, std::string>& index_params,
|
|
bool mmap_enable) {
|
|
auto config = milvus::index::ParseConfigFromIndexParams(index_params);
|
|
|
|
AssertInfo(index_params.find("index_type") != index_params.end(),
|
|
"index type is empty");
|
|
std::string index_type = index_params.at("index_type");
|
|
|
|
knowhere::expected<knowhere::Resource> resource;
|
|
float index_size_gb = index_size * 1.0 / 1024.0 / 1024.0 / 1024.0;
|
|
|
|
LoadResourceRequest request{};
|
|
request.has_raw_data = false;
|
|
|
|
if (index_type == milvus::index::ASCENDING_SORT) {
|
|
request.final_memory_cost = index_size_gb;
|
|
request.final_disk_cost = 0;
|
|
request.max_memory_cost = 2 * index_size_gb;
|
|
request.max_disk_cost = 0;
|
|
request.has_raw_data = true;
|
|
} else if (index_type == milvus::index::MARISA_TRIE ||
|
|
index_type == milvus::index::MARISA_TRIE_UPPER) {
|
|
if (mmap_enable) {
|
|
request.final_memory_cost = 0;
|
|
request.final_disk_cost = index_size_gb;
|
|
request.max_memory_cost = index_size_gb;
|
|
request.max_disk_cost = index_size_gb;
|
|
} else {
|
|
request.final_memory_cost = index_size_gb;
|
|
request.final_disk_cost = 0;
|
|
request.max_memory_cost = 2 * index_size_gb;
|
|
request.max_disk_cost = 0;
|
|
}
|
|
request.has_raw_data = true;
|
|
} else if (index_type == milvus::index::INVERTED_INDEX_TYPE) {
|
|
request.final_memory_cost = 0;
|
|
request.final_disk_cost = index_size_gb;
|
|
request.max_memory_cost = index_size_gb;
|
|
request.max_disk_cost = index_size_gb;
|
|
|
|
request.has_raw_data = false;
|
|
} else if (index_type == milvus::index::BITMAP_INDEX_TYPE) {
|
|
if (mmap_enable) {
|
|
request.final_memory_cost = 0;
|
|
request.final_disk_cost = index_size_gb;
|
|
request.max_memory_cost = index_size_gb;
|
|
request.max_disk_cost = index_size_gb;
|
|
} else {
|
|
request.final_memory_cost = index_size_gb;
|
|
request.final_disk_cost = 0;
|
|
request.max_memory_cost = 2 * index_size_gb;
|
|
request.max_disk_cost = 0;
|
|
}
|
|
|
|
request.has_raw_data = false;
|
|
} else if (index_type == milvus::index::HYBRID_INDEX_TYPE) {
|
|
request.final_memory_cost = index_size_gb;
|
|
request.final_disk_cost = index_size_gb;
|
|
request.max_memory_cost = 2 * index_size_gb;
|
|
request.max_disk_cost = index_size_gb;
|
|
request.has_raw_data = false;
|
|
} else {
|
|
LOG_ERROR(
|
|
"invalid index type to estimate scalar index load resource: {}",
|
|
index_type);
|
|
return LoadResourceRequest{0, 0, 0, 0, false};
|
|
}
|
|
return request;
|
|
}
|
|
|
|
IndexBasePtr
|
|
IndexFactory::CreateIndex(
|
|
const CreateIndexInfo& create_index_info,
|
|
const storage::FileManagerContext& file_manager_context,
|
|
bool use_build_pool) {
|
|
if (IsVectorDataType(create_index_info.field_type)) {
|
|
return CreateVectorIndex(
|
|
create_index_info, file_manager_context, use_build_pool);
|
|
}
|
|
|
|
return CreateScalarIndex(create_index_info, file_manager_context);
|
|
}
|
|
|
|
IndexBasePtr
|
|
IndexFactory::CreatePrimitiveScalarIndex(
|
|
DataType data_type,
|
|
const CreateIndexInfo& create_index_info,
|
|
const storage::FileManagerContext& file_manager_context) {
|
|
switch (data_type) {
|
|
// create scalar index
|
|
case DataType::BOOL:
|
|
return CreatePrimitiveScalarIndex<bool>(create_index_info,
|
|
file_manager_context);
|
|
case DataType::INT8:
|
|
return CreatePrimitiveScalarIndex<int8_t>(create_index_info,
|
|
file_manager_context);
|
|
case DataType::INT16:
|
|
return CreatePrimitiveScalarIndex<int16_t>(create_index_info,
|
|
file_manager_context);
|
|
case DataType::INT32:
|
|
return CreatePrimitiveScalarIndex<int32_t>(create_index_info,
|
|
file_manager_context);
|
|
case DataType::INT64:
|
|
return CreatePrimitiveScalarIndex<int64_t>(create_index_info,
|
|
file_manager_context);
|
|
case DataType::FLOAT:
|
|
return CreatePrimitiveScalarIndex<float>(create_index_info,
|
|
file_manager_context);
|
|
case DataType::DOUBLE:
|
|
return CreatePrimitiveScalarIndex<double>(create_index_info,
|
|
file_manager_context);
|
|
|
|
// create string index
|
|
case DataType::STRING:
|
|
case DataType::VARCHAR:
|
|
return CreatePrimitiveScalarIndex<std::string>(
|
|
create_index_info, file_manager_context);
|
|
default:
|
|
PanicInfo(
|
|
DataTypeInvalid,
|
|
fmt::format("invalid data type to build index: {}", data_type));
|
|
}
|
|
}
|
|
|
|
IndexBasePtr
|
|
IndexFactory::CreateCompositeScalarIndex(
|
|
const CreateIndexInfo& create_index_info,
|
|
const storage::FileManagerContext& file_manager_context) {
|
|
auto index_type = create_index_info.index_type;
|
|
if (index_type == HYBRID_INDEX_TYPE || index_type == BITMAP_INDEX_TYPE ||
|
|
index_type == INVERTED_INDEX_TYPE) {
|
|
auto element_type = static_cast<DataType>(
|
|
file_manager_context.fieldDataMeta.field_schema.element_type());
|
|
return CreatePrimitiveScalarIndex(
|
|
element_type, create_index_info, file_manager_context);
|
|
} else {
|
|
PanicInfo(
|
|
Unsupported,
|
|
fmt::format("index type: {} for composite scalar not supported now",
|
|
index_type));
|
|
}
|
|
}
|
|
|
|
IndexBasePtr
|
|
IndexFactory::CreateComplexScalarIndex(
|
|
IndexType index_type,
|
|
const storage::FileManagerContext& file_manager_context) {
|
|
PanicInfo(Unsupported, "Complex index not supported now");
|
|
}
|
|
|
|
IndexBasePtr
|
|
IndexFactory::CreateJsonIndex(
|
|
IndexType index_type,
|
|
JsonCastType cast_dtype,
|
|
const std::string& nested_path,
|
|
const storage::FileManagerContext& file_manager_context,
|
|
const std::string& json_cast_function) {
|
|
AssertInfo(index_type == INVERTED_INDEX_TYPE,
|
|
"Invalid index type for json index");
|
|
|
|
switch (cast_dtype.element_type()) {
|
|
case JsonCastType::DataType::BOOL:
|
|
return std::make_unique<index::JsonInvertedIndex<bool>>(
|
|
cast_dtype,
|
|
nested_path,
|
|
file_manager_context,
|
|
JsonCastFunction::FromString(json_cast_function));
|
|
case JsonCastType::DataType::DOUBLE:
|
|
return std::make_unique<index::JsonInvertedIndex<double>>(
|
|
cast_dtype,
|
|
nested_path,
|
|
file_manager_context,
|
|
JsonCastFunction::FromString(json_cast_function));
|
|
case JsonCastType::DataType::VARCHAR:
|
|
return std::make_unique<index::JsonInvertedIndex<std::string>>(
|
|
cast_dtype,
|
|
nested_path,
|
|
file_manager_context,
|
|
JsonCastFunction::FromString(json_cast_function));
|
|
case JsonCastType::DataType::JSON:
|
|
return std::make_unique<JsonFlatIndex>(file_manager_context,
|
|
nested_path);
|
|
default:
|
|
PanicInfo(DataTypeInvalid, "Invalid data type:{}", cast_dtype);
|
|
}
|
|
}
|
|
|
|
IndexBasePtr
|
|
IndexFactory::CreateScalarIndex(
|
|
const CreateIndexInfo& create_index_info,
|
|
const storage::FileManagerContext& file_manager_context) {
|
|
auto data_type = create_index_info.field_type;
|
|
switch (data_type) {
|
|
case DataType::BOOL:
|
|
case DataType::INT8:
|
|
case DataType::INT16:
|
|
case DataType::INT32:
|
|
case DataType::INT64:
|
|
case DataType::FLOAT:
|
|
case DataType::DOUBLE:
|
|
case DataType::VARCHAR:
|
|
case DataType::STRING:
|
|
return CreatePrimitiveScalarIndex(
|
|
data_type, create_index_info, file_manager_context);
|
|
case DataType::ARRAY: {
|
|
return CreateCompositeScalarIndex(create_index_info,
|
|
file_manager_context);
|
|
}
|
|
case DataType::JSON: {
|
|
return CreateJsonIndex(create_index_info.index_type,
|
|
create_index_info.json_cast_type,
|
|
create_index_info.json_path,
|
|
file_manager_context,
|
|
create_index_info.json_cast_function);
|
|
}
|
|
default:
|
|
PanicInfo(DataTypeInvalid, "Invalid data type:{}", data_type);
|
|
}
|
|
}
|
|
|
|
IndexBasePtr
|
|
IndexFactory::CreateVectorIndex(
|
|
const CreateIndexInfo& create_index_info,
|
|
const storage::FileManagerContext& file_manager_context,
|
|
bool use_knowhere_build_pool) {
|
|
auto index_type = create_index_info.index_type;
|
|
auto metric_type = create_index_info.metric_type;
|
|
auto version = create_index_info.index_engine_version;
|
|
// create disk index
|
|
auto data_type = create_index_info.field_type;
|
|
if (knowhere::UseDiskLoad(index_type, version)) {
|
|
switch (data_type) {
|
|
case DataType::VECTOR_FLOAT: {
|
|
return std::make_unique<VectorDiskAnnIndex<float>>(
|
|
index_type, metric_type, version, file_manager_context);
|
|
}
|
|
case DataType::VECTOR_FLOAT16: {
|
|
return std::make_unique<VectorDiskAnnIndex<float16>>(
|
|
index_type, metric_type, version, file_manager_context);
|
|
}
|
|
case DataType::VECTOR_BFLOAT16: {
|
|
return std::make_unique<VectorDiskAnnIndex<bfloat16>>(
|
|
index_type, metric_type, version, file_manager_context);
|
|
}
|
|
case DataType::VECTOR_BINARY: {
|
|
return std::make_unique<VectorDiskAnnIndex<bin1>>(
|
|
index_type, metric_type, version, file_manager_context);
|
|
}
|
|
case DataType::VECTOR_SPARSE_FLOAT: {
|
|
return std::make_unique<VectorDiskAnnIndex<float>>(
|
|
index_type, metric_type, version, file_manager_context);
|
|
}
|
|
case DataType::VECTOR_INT8: {
|
|
// TODO caiyd, not support yet
|
|
}
|
|
default:
|
|
PanicInfo(
|
|
DataTypeInvalid,
|
|
fmt::format("invalid data type to build disk index: {}",
|
|
data_type));
|
|
}
|
|
} else { // create mem index
|
|
switch (data_type) {
|
|
case DataType::VECTOR_FLOAT:
|
|
case DataType::VECTOR_SPARSE_FLOAT: {
|
|
return std::make_unique<VectorMemIndex<float>>(
|
|
index_type,
|
|
metric_type,
|
|
version,
|
|
use_knowhere_build_pool,
|
|
file_manager_context);
|
|
}
|
|
case DataType::VECTOR_BINARY: {
|
|
return std::make_unique<VectorMemIndex<bin1>>(
|
|
index_type,
|
|
metric_type,
|
|
version,
|
|
use_knowhere_build_pool,
|
|
file_manager_context);
|
|
}
|
|
case DataType::VECTOR_FLOAT16: {
|
|
return std::make_unique<VectorMemIndex<float16>>(
|
|
index_type,
|
|
metric_type,
|
|
version,
|
|
use_knowhere_build_pool,
|
|
file_manager_context);
|
|
}
|
|
case DataType::VECTOR_BFLOAT16: {
|
|
return std::make_unique<VectorMemIndex<bfloat16>>(
|
|
index_type,
|
|
metric_type,
|
|
version,
|
|
use_knowhere_build_pool,
|
|
file_manager_context);
|
|
}
|
|
case DataType::VECTOR_INT8: {
|
|
return std::make_unique<VectorMemIndex<int8>>(
|
|
index_type,
|
|
metric_type,
|
|
version,
|
|
use_knowhere_build_pool,
|
|
file_manager_context);
|
|
}
|
|
default:
|
|
PanicInfo(
|
|
DataTypeInvalid,
|
|
fmt::format("invalid data type to build mem index: {}",
|
|
data_type));
|
|
}
|
|
}
|
|
}
|
|
} // namespace milvus::index
|