mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
fix: remove std::string support for stlsort index (#43355)
fix: https://github.com/milvus-io/milvus/issues/43354 The current implementation of the stlsort index does not support std::string. Remove the code. Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
This commit is contained in:
parent
5d90b65342
commit
d750816ba0
@ -415,5 +415,4 @@ template class ScalarIndexSort<int32_t>;
|
||||
template class ScalarIndexSort<int64_t>;
|
||||
template class ScalarIndexSort<float>;
|
||||
template class ScalarIndexSort<double>;
|
||||
template class ScalarIndexSort<std::string>;
|
||||
} // namespace milvus::index
|
||||
|
||||
@ -31,6 +31,9 @@ namespace milvus::index {
|
||||
|
||||
template <typename T>
|
||||
class ScalarIndexSort : public ScalarIndex<T> {
|
||||
static_assert(std::is_arithmetic_v<T>,
|
||||
"ScalarIndexSort only supports arithmetic types");
|
||||
|
||||
public:
|
||||
explicit ScalarIndexSort(
|
||||
const storage::FileManagerContext& file_manager_context =
|
||||
|
||||
@ -1,65 +0,0 @@
|
||||
// Licensed to the LF AI & Data foundation under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "common/Utils.h"
|
||||
#include "index/ScalarIndexSort.h"
|
||||
#include "index/StringIndex.h"
|
||||
|
||||
namespace milvus::index {
|
||||
// TODO: should inherit from StringIndex?
|
||||
// Sort-based scalar index over std::string values that adds a prefix-match
// path on top of ScalarIndexSort<std::string>.
// TODO: should inherit from StringIndex?
class StringIndexSort : public ScalarIndexSort<std::string> {
 public:
    // Answers the query described by `dataset`.
    // When the OPERATOR_TYPE entry is OpType::PrefixMatch, the MATCH_VALUE
    // string is looked up and handed to PrefixMatch(); every other operator
    // is forwarded to ScalarIndex<std::string>::Query.
    const TargetBitmap
    Query(const DatasetPtr& dataset) override {
        auto op = dataset->Get<OpType>(OPERATOR_TYPE);
        if (op == OpType::PrefixMatch) {
            auto prefix = dataset->Get<std::string>(MATCH_VALUE);
            return PrefixMatch(prefix);
        }
        return ScalarIndex<std::string>::Query(dataset);
    }

    // Returns a bitmap with one bit per entry of GetData(); the bit at an
    // entry's idx_ is set when milvus::PrefixMatch(a_, prefix) holds.
    // Relies on GetData() being ordered by a_, as required by the
    // std::lower_bound call below.
    const TargetBitmap
    PrefixMatch(std::string_view prefix) {
        // Bind by reference so the index entries are not copied on every
        // query; this is valid whether GetData() returns a reference or a
        // value (a temporary's lifetime is extended by the binding).
        const auto& data = GetData();
        TargetBitmap bitset(data.size());
        // First entry whose value is not less than the prefix: all values
        // starting with `prefix` form a contiguous run beginning here.
        auto it = std::lower_bound(
            data.begin(),
            data.end(),
            prefix,
            [](const IndexStructure<std::string>& value,
               std::string_view prefix) { return value.a_ < prefix; });
        for (; it != data.end(); ++it) {
            // Stop at the first entry that no longer matches the prefix.
            if (!milvus::PrefixMatch(it->a_, prefix)) {
                break;
            }
            bitset[it->idx_] = true;
        }
        return bitset;
    }
};
|
||||
using StringIndexSortPtr = std::unique_ptr<StringIndexSort>;
|
||||
|
||||
// Factory: returns a newly default-constructed StringIndexSort owned by a
// StringIndexSortPtr.
inline StringIndexSortPtr
CreateStringIndexSort() {
    StringIndexSortPtr string_index = std::make_unique<StringIndexSort>();
    return string_index;
}
|
||||
} // namespace milvus::index
|
||||
@ -14,7 +14,6 @@
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "index/ScalarIndexSort.h"
|
||||
#include "index/StringIndexSort.h"
|
||||
|
||||
#include "common/FieldMeta.h"
|
||||
#include "common/Span.h"
|
||||
@ -30,14 +29,6 @@ generate_scalar_index(Span<T> data) {
|
||||
return indexing;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline index::ScalarIndexPtr<std::string>
|
||||
generate_scalar_index(Span<std::string> data) {
|
||||
auto indexing = index::CreateStringIndexSort();
|
||||
indexing->Build(data.row_count(), data.data(), data.valid_data());
|
||||
return indexing;
|
||||
}
|
||||
|
||||
inline index::IndexBasePtr
|
||||
generate_scalar_index(SpanBase data, DataType data_type) {
|
||||
Assert(!IsVectorDataType(data_type));
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
#include "common/EasyAssert.h"
|
||||
#include "fmt/format.h"
|
||||
#include "index/ScalarIndexSort.h"
|
||||
#include "index/StringIndexSort.h"
|
||||
|
||||
#include "common/SystemProperty.h"
|
||||
#include "segcore/FieldIndexing.h"
|
||||
@ -101,50 +100,6 @@ VectorFieldIndexing::recreate_index(DataType data_type,
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
VectorFieldIndexing::BuildIndexRange(int64_t ack_beg,
|
||||
int64_t ack_end,
|
||||
const VectorBase* vec_base) {
|
||||
// No BuildIndexRange support for sparse vector.
|
||||
AssertInfo(field_meta_.get_data_type() == DataType::VECTOR_FLOAT ||
|
||||
field_meta_.get_data_type() == DataType::VECTOR_FLOAT16 ||
|
||||
field_meta_.get_data_type() == DataType::VECTOR_BFLOAT16,
|
||||
"Data type of vector field is not in (VECTOR_FLOAT, "
|
||||
"VECTOR_FLOAT16,VECTOR_BFLOAT16)");
|
||||
auto dim = field_meta_.get_dim();
|
||||
AssertInfo(
|
||||
ConcurrentDenseVectorCheck(vec_base, field_meta_.get_data_type()),
|
||||
"vec_base can't cast to ConcurrentVector type");
|
||||
auto num_chunk = vec_base->num_chunk();
|
||||
AssertInfo(ack_end <= num_chunk, "ack_end is bigger than num_chunk");
|
||||
auto conf = get_build_params(field_meta_.get_data_type());
|
||||
data_.grow_to_at_least(ack_end);
|
||||
for (int chunk_id = ack_beg; chunk_id < ack_end; chunk_id++) {
|
||||
const auto& chunk_data = vec_base->get_chunk_data(chunk_id);
|
||||
std::unique_ptr<index::VectorIndex> indexing = nullptr;
|
||||
if (field_meta_.get_data_type() == DataType::VECTOR_FLOAT) {
|
||||
indexing = std::make_unique<index::VectorMemIndex<float>>(
|
||||
knowhere::IndexEnum::INDEX_FAISS_IVFFLAT,
|
||||
knowhere::metric::L2,
|
||||
knowhere::Version::GetCurrentVersion().VersionNumber());
|
||||
} else if (field_meta_.get_data_type() == DataType::VECTOR_FLOAT16) {
|
||||
indexing = std::make_unique<index::VectorMemIndex<float16>>(
|
||||
knowhere::IndexEnum::INDEX_FAISS_IVFFLAT,
|
||||
knowhere::metric::L2,
|
||||
knowhere::Version::GetCurrentVersion().VersionNumber());
|
||||
} else {
|
||||
indexing = std::make_unique<index::VectorMemIndex<bfloat16>>(
|
||||
knowhere::IndexEnum::INDEX_FAISS_IVFFLAT,
|
||||
knowhere::metric::L2,
|
||||
knowhere::Version::GetCurrentVersion().VersionNumber());
|
||||
}
|
||||
auto dataset = knowhere::GenDataSet(
|
||||
vec_base->get_size_per_chunk(), dim, chunk_data);
|
||||
indexing->BuildWithDataset(dataset, conf);
|
||||
data_[chunk_id] = std::move(indexing);
|
||||
}
|
||||
}
|
||||
|
||||
// for sparse float vector:
|
||||
// * element_size is not used
|
||||
// * output_raw points at a milvus::schema::proto::SparseFloatArray.
|
||||
@ -366,35 +321,6 @@ VectorFieldIndexing::has_raw_data() const {
|
||||
return index_->HasRawData();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
ScalarFieldIndexing<T>::BuildIndexRange(int64_t ack_beg,
|
||||
int64_t ack_end,
|
||||
const VectorBase* vec_base) {
|
||||
auto source = dynamic_cast<const ConcurrentVector<T>*>(vec_base);
|
||||
AssertInfo(source, "vec_base can't cast to ConcurrentVector type");
|
||||
auto num_chunk = source->num_chunk();
|
||||
AssertInfo(ack_end <= num_chunk, "Ack_end is bigger than num_chunk");
|
||||
data_.grow_to_at_least(ack_end);
|
||||
for (int chunk_id = ack_beg; chunk_id < ack_end; chunk_id++) {
|
||||
auto chunk_data = source->get_chunk_data(chunk_id);
|
||||
// build index for chunk
|
||||
// seem no lint, not pass valid_data here
|
||||
// TODO
|
||||
if constexpr (std::is_same_v<T, std::string>) {
|
||||
auto indexing = index::CreateStringIndexSort();
|
||||
indexing->Build(vec_base->get_size_per_chunk(),
|
||||
static_cast<const T*>(chunk_data));
|
||||
data_[chunk_id] = std::move(indexing);
|
||||
} else {
|
||||
auto indexing = index::CreateScalarIndexSort<T>();
|
||||
indexing->Build(vec_base->get_size_per_chunk(),
|
||||
static_cast<const T*>(chunk_data));
|
||||
data_[chunk_id] = std::move(indexing);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<FieldIndexing>
|
||||
CreateIndex(const FieldMeta& field_meta,
|
||||
const FieldIndexMeta& field_index_meta,
|
||||
|
||||
@ -47,12 +47,6 @@ class FieldIndexing {
|
||||
operator=(const FieldIndexing&) = delete;
|
||||
virtual ~FieldIndexing() = default;
|
||||
|
||||
// Do this in parallel
|
||||
virtual void
|
||||
BuildIndexRange(int64_t ack_beg,
|
||||
int64_t ack_end,
|
||||
const VectorBase* vec_base) = 0;
|
||||
|
||||
virtual void
|
||||
AppendSegmentIndexDense(int64_t reserved_offset,
|
||||
int64_t size,
|
||||
@ -111,11 +105,6 @@ class ScalarFieldIndexing : public FieldIndexing {
|
||||
public:
|
||||
using FieldIndexing::FieldIndexing;
|
||||
|
||||
void
|
||||
BuildIndexRange(int64_t ack_beg,
|
||||
int64_t ack_end,
|
||||
const VectorBase* vec_base) override;
|
||||
|
||||
void
|
||||
AppendSegmentIndexDense(int64_t reserved_offset,
|
||||
int64_t size,
|
||||
@ -180,11 +169,6 @@ class VectorFieldIndexing : public FieldIndexing {
|
||||
const SegcoreConfig& segcore_config,
|
||||
const VectorBase* field_raw_data);
|
||||
|
||||
void
|
||||
BuildIndexRange(int64_t ack_beg,
|
||||
int64_t ack_end,
|
||||
const VectorBase* vec_base) override;
|
||||
|
||||
void
|
||||
AppendSegmentIndexDense(int64_t reserved_offset,
|
||||
int64_t size,
|
||||
|
||||
@ -6125,7 +6125,7 @@ TEST_P(ExprTest, TestCompareWithScalarIndexMaris) {
|
||||
|
||||
// load index for the str1 (VARCHAR) field
|
||||
auto str1_col = raw_data.get_col<std::string>(str1_fid);
|
||||
auto str1_index = milvus::index::CreateScalarIndexSort<std::string>();
|
||||
auto str1_index = milvus::index::CreateStringIndexMarisa();
|
||||
str1_index->Build(N, str1_col.data());
|
||||
load_index_info.field_id = str1_fid.get();
|
||||
load_index_info.field_type = DataType::VARCHAR;
|
||||
@ -6281,7 +6281,7 @@ TEST_P(ExprTest, TestCompareWithScalarIndexMarisNullable) {
|
||||
|
||||
// load index for the str1 (VARCHAR) field
|
||||
auto str1_col = raw_data.get_col<std::string>(str1_fid);
|
||||
auto str1_index = milvus::index::CreateScalarIndexSort<std::string>();
|
||||
auto str1_index = milvus::index::CreateStringIndexMarisa();
|
||||
str1_index->Build(N, str1_col.data());
|
||||
load_index_info.field_id = str1_fid.get();
|
||||
load_index_info.field_type = DataType::VARCHAR;
|
||||
@ -6438,7 +6438,7 @@ TEST_P(ExprTest, TestCompareWithScalarIndexMarisNullable2) {
|
||||
|
||||
// load index for the str1 (VARCHAR) field
|
||||
auto str1_col = raw_data.get_col<std::string>(str1_fid);
|
||||
auto str1_index = milvus::index::CreateScalarIndexSort<std::string>();
|
||||
auto str1_index = milvus::index::CreateStringIndexMarisa();
|
||||
str1_index->Build(N, str1_col.data());
|
||||
load_index_info.field_id = str1_fid.get();
|
||||
load_index_info.field_type = DataType::VARCHAR;
|
||||
|
||||
@ -25,6 +25,8 @@
|
||||
#include "query/ExecPlanNodeVisitor.h"
|
||||
#include "index/InvertedIndexTantivy.h"
|
||||
#include "test_utils/storage_test_utils.h"
|
||||
#include "index/StringIndexMarisa.h"
|
||||
|
||||
using namespace milvus;
|
||||
using namespace milvus::query;
|
||||
using namespace milvus::segcore;
|
||||
@ -178,7 +180,7 @@ TEST_F(GrowingSegmentRegexQueryTest, RegexQueryOnJsonField) {
|
||||
ASSERT_TRUE(final[4]);
|
||||
}
|
||||
|
||||
struct MockStringIndex : index::StringIndexSort {
|
||||
struct MockStringIndex : index::StringIndexMarisa {
|
||||
const bool
|
||||
HasRawData() const override {
|
||||
return true;
|
||||
@ -242,34 +244,14 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
|
||||
|
||||
void
|
||||
LoadStlSortIndex() {
|
||||
{
|
||||
proto::schema::StringArray arr;
|
||||
for (int64_t i = 0; i < N; i++) {
|
||||
*(arr.mutable_data()->Add()) = raw_str[i];
|
||||
}
|
||||
auto index = index::CreateStringIndexSort();
|
||||
std::vector<uint8_t> buffer(arr.ByteSizeLong());
|
||||
ASSERT_TRUE(
|
||||
arr.SerializeToArray(buffer.data(), arr.ByteSizeLong()));
|
||||
index->BuildWithRawDataForUT(arr.ByteSizeLong(), buffer.data());
|
||||
LoadIndexInfo info{
|
||||
.field_id = schema->get_field_id(FieldName("str")).get(),
|
||||
.index_params = GenIndexParams(index.get()),
|
||||
.cache_index = CreateTestCacheIndex("test", std::move(index)),
|
||||
};
|
||||
seg->LoadIndex(info);
|
||||
}
|
||||
{
|
||||
auto index = index::CreateScalarIndexSort<int64_t>();
|
||||
index->BuildWithRawDataForUT(N, raw_int.data());
|
||||
LoadIndexInfo info{
|
||||
.field_id =
|
||||
schema->get_field_id(FieldName("another_int64")).get(),
|
||||
.index_params = GenIndexParams(index.get()),
|
||||
.cache_index = CreateTestCacheIndex("test", std::move(index)),
|
||||
};
|
||||
seg->LoadIndex(info);
|
||||
}
|
||||
auto index = index::CreateScalarIndexSort<int64_t>();
|
||||
index->BuildWithRawDataForUT(N, raw_int.data());
|
||||
LoadIndexInfo info{
|
||||
.field_id = schema->get_field_id(FieldName("another_int64")).get(),
|
||||
.index_params = GenIndexParams(index.get()),
|
||||
.cache_index = CreateTestCacheIndex("test", std::move(index)),
|
||||
};
|
||||
seg->LoadIndex(info);
|
||||
}
|
||||
|
||||
void
|
||||
@ -407,35 +389,6 @@ TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnIndexedNonStringField) {
|
||||
ASSERT_ANY_THROW(ExecuteQueryExpr(parsed, segpromote, N, MAX_TIMESTAMP));
|
||||
}
|
||||
|
||||
// Runs an OpType::Match filter with pattern "a%" on the "str" field after
// LoadStlSortIndex() and checks the first five result bits: row 0 must not
// match, rows 1-4 must match.
TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnStlSortStringField) {
    std::string operand = "a%";

    // Describe the VarChar column the expression applies to.
    const auto& field_meta = (*schema)[FieldName("str")];
    auto col_info = test::GenColumnInfo(field_meta.get_id().get(),
                                        proto::schema::DataType::VarChar,
                                        false,
                                        false);

    // Assemble the unary range expression and wrap it in an Expr proto.
    auto match_expr = test::GenUnaryRangeExpr(OpType::Match, operand);
    match_expr->set_allocated_column_info(col_info);
    auto expr_proto = test::GenExpr();
    expr_proto->set_allocated_unary_range_expr(match_expr);

    // Parse the proto into a filter plan node.
    ProtoParser parser(schema);
    auto typed_expr = parser.ParseExprs(*expr_proto);
    auto filter_node =
        std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);

    LoadStlSortIndex();

    auto sealed_segment = dynamic_cast<ChunkedSegmentSealedImpl*>(seg.get());
    BitsetType final;
    final = ExecuteQueryExpr(filter_node, sealed_segment, N, MAX_TIMESTAMP);
    ASSERT_FALSE(final[0]);
    ASSERT_TRUE(final[1]);
    ASSERT_TRUE(final[2]);
    ASSERT_TRUE(final[3]);
    ASSERT_TRUE(final[4]);
}
|
||||
|
||||
TEST_F(SealedSegmentRegexQueryTest, PrefixMatchOnInvertedIndexStringField) {
|
||||
std::string operand = "a";
|
||||
const auto& str_meta = schema->operator[](FieldName("str"));
|
||||
|
||||
@ -376,10 +376,11 @@ TestBuildIndex(int N, int cardinality, int index_type) {
|
||||
auto index = std::make_unique<milvus::index::StringIndexMarisa>();
|
||||
index->Build(N, raw_data.data());
|
||||
return std::move(index);
|
||||
} else {
|
||||
auto index = milvus::index::CreateScalarIndexSort<T>();
|
||||
index->Build(N, raw_data.data());
|
||||
return std::move(index);
|
||||
}
|
||||
auto index = milvus::index::CreateScalarIndexSort<T>();
|
||||
index->Build(N, raw_data.data());
|
||||
return std::move(index);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -26,7 +26,6 @@
|
||||
#include "common/Schema.h"
|
||||
#include "common/Types.h"
|
||||
#include "index/ScalarIndexSort.h"
|
||||
#include "index/StringIndexSort.h"
|
||||
#include "index/VectorMemIndex.h"
|
||||
#include "segcore/Collection.h"
|
||||
#include "segcore/SegmentGrowingImpl.h"
|
||||
@ -357,7 +356,8 @@ GenerateRandomSparseFloatVector(size_t rows,
|
||||
return tensor;
|
||||
}
|
||||
|
||||
inline SchemaPtr CreateTestSchema() {
|
||||
inline SchemaPtr
|
||||
CreateTestSchema() {
|
||||
auto schema = std::make_shared<milvus::Schema>();
|
||||
auto bool_field =
|
||||
schema->AddDebugField("bool", milvus::DataType::BOOL, true);
|
||||
@ -1382,15 +1382,11 @@ GenVecIndexing(int64_t N,
|
||||
template <typename T>
|
||||
inline index::IndexBasePtr
|
||||
GenScalarIndexing(int64_t N, const T* data) {
|
||||
if constexpr (std::is_same_v<T, std::string>) {
|
||||
auto indexing = index::CreateStringIndexSort();
|
||||
indexing->Build(N, data);
|
||||
return indexing;
|
||||
} else {
|
||||
auto indexing = index::CreateScalarIndexSort<T>();
|
||||
indexing->Build(N, data);
|
||||
return indexing;
|
||||
}
|
||||
static_assert(std::is_arithmetic_v<T>,
|
||||
"ScalarIndexSort only supports arithmetic types");
|
||||
auto indexing = index::CreateScalarIndexSort<T>();
|
||||
indexing->Build(N, data);
|
||||
return indexing;
|
||||
}
|
||||
|
||||
inline std::vector<char>
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user