fix: remove std::string support for stlsort index (#43355)

fix: https://github.com/milvus-io/milvus/issues/43354

The current implementation of the stlsort index does not support
std::string, so remove the related code.

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
This commit is contained in:
Spade A 2025-07-16 17:46:51 +08:00 committed by GitHub
parent 5d90b65342
commit d750816ba0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 28 additions and 240 deletions

View File

@ -415,5 +415,4 @@ template class ScalarIndexSort<int32_t>;
template class ScalarIndexSort<int64_t>;
template class ScalarIndexSort<float>;
template class ScalarIndexSort<double>;
template class ScalarIndexSort<std::string>;
} // namespace milvus::index

View File

@ -31,6 +31,9 @@ namespace milvus::index {
template <typename T>
class ScalarIndexSort : public ScalarIndex<T> {
static_assert(std::is_arithmetic_v<T>,
"ScalarIndexSort only supports arithmetic types");
public:
explicit ScalarIndexSort(
const storage::FileManagerContext& file_manager_context =

View File

@ -1,65 +0,0 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include <string>
#include <string_view>
#include "common/Utils.h"
#include "index/ScalarIndexSort.h"
#include "index/StringIndex.h"
namespace milvus::index {
// TODO: should inherit from StringIndex?
// Sort-based scalar index for std::string values. Extends
// ScalarIndexSort<std::string> with a dedicated prefix-match query path.
class StringIndexSort : public ScalarIndexSort<std::string> {
public:
// Runs the query described by `dataset`.
// If the operator stored under OPERATOR_TYPE is OpType::PrefixMatch, the
// string stored under MATCH_VALUE is used as the prefix and the query is
// answered by PrefixMatch(); any other operator is forwarded to
// ScalarIndex<std::string>::Query.
const TargetBitmap
Query(const DatasetPtr& dataset) override {
auto op = dataset->Get<OpType>(OPERATOR_TYPE);
if (op == OpType::PrefixMatch) {
auto prefix = dataset->Get<std::string>(MATCH_VALUE);
return PrefixMatch(prefix);
}
return ScalarIndex<std::string>::Query(dataset);
}
// Returns a bitmap sized to GetData().size() in which the bit at `idx_` is
// set for every visited entry whose value `a_` satisfies
// milvus::PrefixMatch(a_, prefix).
const TargetBitmap
PrefixMatch(std::string_view prefix) {
auto data = GetData();
TargetBitmap bitset(data.size());
// Locate the first entry whose value is not less than `prefix`.
// NOTE(review): this presumes GetData() is ordered by `a_` — confirm
// against ScalarIndexSort.
auto it = std::lower_bound(
data.begin(),
data.end(),
prefix,
[](const IndexStructure<std::string>& value,
std::string_view prefix) { return value.a_ < prefix; });
// Walk forward from that position and stop at the first entry that no
// longer matches the prefix.
for (; it != data.end(); ++it) {
if (!milvus::PrefixMatch(it->a_, prefix)) {
break;
}
bitset[it->idx_] = true;
}
return bitset;
}
};
using StringIndexSortPtr = std::unique_ptr<StringIndexSort>;
// Factory helper: returns a default-constructed StringIndexSort wrapped in a
// std::unique_ptr (StringIndexSortPtr).
inline StringIndexSortPtr
CreateStringIndexSort() {
return std::make_unique<StringIndexSort>();
}
} // namespace milvus::index

View File

@ -14,7 +14,6 @@
#include <memory>
#include <string>
#include "index/ScalarIndexSort.h"
#include "index/StringIndexSort.h"
#include "common/FieldMeta.h"
#include "common/Span.h"
@ -30,14 +29,6 @@ generate_scalar_index(Span<T> data) {
return indexing;
}
// Specialisation of generate_scalar_index for std::string spans.
// Creates a StringIndexSort and builds it from the span, passing
// data.row_count(), data.data() and data.valid_data() to Build().
// Returns the built index as index::ScalarIndexPtr<std::string>.
template <>
inline index::ScalarIndexPtr<std::string>
generate_scalar_index(Span<std::string> data) {
auto indexing = index::CreateStringIndexSort();
indexing->Build(data.row_count(), data.data(), data.valid_data());
return indexing;
}
inline index::IndexBasePtr
generate_scalar_index(SpanBase data, DataType data_type) {
Assert(!IsVectorDataType(data_type));

View File

@ -15,7 +15,6 @@
#include "common/EasyAssert.h"
#include "fmt/format.h"
#include "index/ScalarIndexSort.h"
#include "index/StringIndexSort.h"
#include "common/SystemProperty.h"
#include "segcore/FieldIndexing.h"
@ -101,50 +100,6 @@ VectorFieldIndexing::recreate_index(DataType data_type,
}
}
// Builds one vector index per chunk for the chunk ids in [ack_beg, ack_end)
// and stores each index in data_[chunk_id].
//
// ack_beg  - first chunk id to index (inclusive).
// ack_end  - one past the last chunk id to index; asserted to be
//            <= vec_base->num_chunk().
// vec_base - source of the raw chunk data; asserted (via
//            ConcurrentDenseVectorCheck) to be a dense ConcurrentVector of the
//            field's data type.
//
// Only VECTOR_FLOAT, VECTOR_FLOAT16 and VECTOR_BFLOAT16 fields are accepted;
// any other data type fails the first AssertInfo.
void
VectorFieldIndexing::BuildIndexRange(int64_t ack_beg,
int64_t ack_end,
const VectorBase* vec_base) {
// No BuildIndexRange support for sparse vector.
AssertInfo(field_meta_.get_data_type() == DataType::VECTOR_FLOAT ||
field_meta_.get_data_type() == DataType::VECTOR_FLOAT16 ||
field_meta_.get_data_type() == DataType::VECTOR_BFLOAT16,
"Data type of vector field is not in (VECTOR_FLOAT, "
"VECTOR_FLOAT16,VECTOR_BFLOAT16)");
auto dim = field_meta_.get_dim();
AssertInfo(
ConcurrentDenseVectorCheck(vec_base, field_meta_.get_data_type()),
"vec_base can't cast to ConcurrentVector type");
auto num_chunk = vec_base->num_chunk();
AssertInfo(ack_end <= num_chunk, "ack_end is bigger than num_chunk");
auto conf = get_build_params(field_meta_.get_data_type());
// Make sure data_ has a slot for every chunk id below ack_end.
data_.grow_to_at_least(ack_end);
// NOTE(review): chunk_id is an int while ack_beg/ack_end are int64_t.
for (int chunk_id = ack_beg; chunk_id < ack_end; chunk_id++) {
const auto& chunk_data = vec_base->get_chunk_data(chunk_id);
std::unique_ptr<index::VectorIndex> indexing = nullptr;
// Pick the VectorMemIndex element type that matches the field's data
// type; all three variants use INDEX_FAISS_IVFFLAT with the L2 metric and
// the current knowhere version number.
if (field_meta_.get_data_type() == DataType::VECTOR_FLOAT) {
indexing = std::make_unique<index::VectorMemIndex<float>>(
knowhere::IndexEnum::INDEX_FAISS_IVFFLAT,
knowhere::metric::L2,
knowhere::Version::GetCurrentVersion().VersionNumber());
} else if (field_meta_.get_data_type() == DataType::VECTOR_FLOAT16) {
indexing = std::make_unique<index::VectorMemIndex<float16>>(
knowhere::IndexEnum::INDEX_FAISS_IVFFLAT,
knowhere::metric::L2,
knowhere::Version::GetCurrentVersion().VersionNumber());
} else {
// Remaining accepted type per the assertion above: VECTOR_BFLOAT16.
indexing = std::make_unique<index::VectorMemIndex<bfloat16>>(
knowhere::IndexEnum::INDEX_FAISS_IVFFLAT,
knowhere::metric::L2,
knowhere::Version::GetCurrentVersion().VersionNumber());
}
// Wrap the chunk (get_size_per_chunk() rows of `dim` dimensions) in a
// knowhere dataset and build the index with the prepared parameters.
auto dataset = knowhere::GenDataSet(
vec_base->get_size_per_chunk(), dim, chunk_data);
indexing->BuildWithDataset(dataset, conf);
data_[chunk_id] = std::move(indexing);
}
}
// for sparse float vector:
// * element_size is not used
// * output_raw points at a milvus::schema::proto::SparseFloatArray.
@ -366,35 +321,6 @@ VectorFieldIndexing::has_raw_data() const {
return index_->HasRawData();
}
// Builds one sort-based scalar index per chunk for the chunk ids in
// [ack_beg, ack_end) and stores each index in data_[chunk_id].
//
// ack_beg  - first chunk id to index (inclusive).
// ack_end  - one past the last chunk id to index; asserted to be
//            <= source->num_chunk().
// vec_base - must be a ConcurrentVector<T>; the dynamic_cast result is
//            asserted to be non-null.
template <typename T>
void
ScalarFieldIndexing<T>::BuildIndexRange(int64_t ack_beg,
int64_t ack_end,
const VectorBase* vec_base) {
auto source = dynamic_cast<const ConcurrentVector<T>*>(vec_base);
AssertInfo(source, "vec_base can't cast to ConcurrentVector type");
auto num_chunk = source->num_chunk();
AssertInfo(ack_end <= num_chunk, "Ack_end is bigger than num_chunk");
// Make sure data_ has a slot for every chunk id below ack_end.
data_.grow_to_at_least(ack_end);
// NOTE(review): chunk_id is an int while ack_beg/ack_end are int64_t.
for (int chunk_id = ack_beg; chunk_id < ack_end; chunk_id++) {
auto chunk_data = source->get_chunk_data(chunk_id);
// Build the index for this chunk.
// Note: valid_data is not passed to Build() here.
// TODO
// std::string chunks use StringIndexSort; every other T uses
// ScalarIndexSort<T>. Both are built over get_size_per_chunk() values.
if constexpr (std::is_same_v<T, std::string>) {
auto indexing = index::CreateStringIndexSort();
indexing->Build(vec_base->get_size_per_chunk(),
static_cast<const T*>(chunk_data));
data_[chunk_id] = std::move(indexing);
} else {
auto indexing = index::CreateScalarIndexSort<T>();
indexing->Build(vec_base->get_size_per_chunk(),
static_cast<const T*>(chunk_data));
data_[chunk_id] = std::move(indexing);
}
}
}
std::unique_ptr<FieldIndexing>
CreateIndex(const FieldMeta& field_meta,
const FieldIndexMeta& field_index_meta,

View File

@ -47,12 +47,6 @@ class FieldIndexing {
operator=(const FieldIndexing&) = delete;
virtual ~FieldIndexing() = default;
// Do this in parallel
virtual void
BuildIndexRange(int64_t ack_beg,
int64_t ack_end,
const VectorBase* vec_base) = 0;
virtual void
AppendSegmentIndexDense(int64_t reserved_offset,
int64_t size,
@ -111,11 +105,6 @@ class ScalarFieldIndexing : public FieldIndexing {
public:
using FieldIndexing::FieldIndexing;
void
BuildIndexRange(int64_t ack_beg,
int64_t ack_end,
const VectorBase* vec_base) override;
void
AppendSegmentIndexDense(int64_t reserved_offset,
int64_t size,
@ -180,11 +169,6 @@ class VectorFieldIndexing : public FieldIndexing {
const SegcoreConfig& segcore_config,
const VectorBase* field_raw_data);
void
BuildIndexRange(int64_t ack_beg,
int64_t ack_end,
const VectorBase* vec_base) override;
void
AppendSegmentIndexDense(int64_t reserved_offset,
int64_t size,

View File

@ -6125,7 +6125,7 @@ TEST_P(ExprTest, TestCompareWithScalarIndexMaris) {
// load index for the str1 (VARCHAR) field
auto str1_col = raw_data.get_col<std::string>(str1_fid);
auto str1_index = milvus::index::CreateScalarIndexSort<std::string>();
auto str1_index = milvus::index::CreateStringIndexMarisa();
str1_index->Build(N, str1_col.data());
load_index_info.field_id = str1_fid.get();
load_index_info.field_type = DataType::VARCHAR;
@ -6281,7 +6281,7 @@ TEST_P(ExprTest, TestCompareWithScalarIndexMarisNullable) {
// load index for the str1 (VARCHAR) field
auto str1_col = raw_data.get_col<std::string>(str1_fid);
auto str1_index = milvus::index::CreateScalarIndexSort<std::string>();
auto str1_index = milvus::index::CreateStringIndexMarisa();
str1_index->Build(N, str1_col.data());
load_index_info.field_id = str1_fid.get();
load_index_info.field_type = DataType::VARCHAR;
@ -6438,7 +6438,7 @@ TEST_P(ExprTest, TestCompareWithScalarIndexMarisNullable2) {
// load index for the str1 (VARCHAR) field
auto str1_col = raw_data.get_col<std::string>(str1_fid);
auto str1_index = milvus::index::CreateScalarIndexSort<std::string>();
auto str1_index = milvus::index::CreateStringIndexMarisa();
str1_index->Build(N, str1_col.data());
load_index_info.field_id = str1_fid.get();
load_index_info.field_type = DataType::VARCHAR;

View File

@ -25,6 +25,8 @@
#include "query/ExecPlanNodeVisitor.h"
#include "index/InvertedIndexTantivy.h"
#include "test_utils/storage_test_utils.h"
#include "index/StringIndexMarisa.h"
using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;
@ -178,7 +180,7 @@ TEST_F(GrowingSegmentRegexQueryTest, RegexQueryOnJsonField) {
ASSERT_TRUE(final[4]);
}
struct MockStringIndex : index::StringIndexSort {
struct MockStringIndex : index::StringIndexMarisa {
const bool
HasRawData() const override {
return true;
@ -242,34 +244,14 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
void
LoadStlSortIndex() {
{
proto::schema::StringArray arr;
for (int64_t i = 0; i < N; i++) {
*(arr.mutable_data()->Add()) = raw_str[i];
}
auto index = index::CreateStringIndexSort();
std::vector<uint8_t> buffer(arr.ByteSizeLong());
ASSERT_TRUE(
arr.SerializeToArray(buffer.data(), arr.ByteSizeLong()));
index->BuildWithRawDataForUT(arr.ByteSizeLong(), buffer.data());
LoadIndexInfo info{
.field_id = schema->get_field_id(FieldName("str")).get(),
.index_params = GenIndexParams(index.get()),
.cache_index = CreateTestCacheIndex("test", std::move(index)),
};
seg->LoadIndex(info);
}
{
auto index = index::CreateScalarIndexSort<int64_t>();
index->BuildWithRawDataForUT(N, raw_int.data());
LoadIndexInfo info{
.field_id =
schema->get_field_id(FieldName("another_int64")).get(),
.index_params = GenIndexParams(index.get()),
.cache_index = CreateTestCacheIndex("test", std::move(index)),
};
seg->LoadIndex(info);
}
auto index = index::CreateScalarIndexSort<int64_t>();
index->BuildWithRawDataForUT(N, raw_int.data());
LoadIndexInfo info{
.field_id = schema->get_field_id(FieldName("another_int64")).get(),
.index_params = GenIndexParams(index.get()),
.cache_index = CreateTestCacheIndex("test", std::move(index)),
};
seg->LoadIndex(info);
}
void
@ -407,35 +389,6 @@ TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnIndexedNonStringField) {
ASSERT_ANY_THROW(ExecuteQueryExpr(parsed, segpromote, N, MAX_TIMESTAMP));
}
// Runs a Match query with operand "a%" against the "str" VarChar field after
// loading the indexes via LoadStlSortIndex(), and checks the resulting bitset:
// row 0 must not match, rows 1-4 must match.
TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnStlSortStringField) {
std::string operand = "a%";
const auto& str_meta = schema->operator[](FieldName("str"));
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
// Build the unary-range expression proto: str Match "a%".
auto unary_range_expr = test::GenUnaryRangeExpr(OpType::Match, operand);
unary_range_expr->set_allocated_column_info(column_info);
auto expr = test::GenExpr();
expr->set_allocated_unary_range_expr(unary_range_expr);
// Parse the proto into a typed expression and wrap it in a filter plan node.
auto parser = ProtoParser(schema);
auto typed_expr = parser.ParseExprs(*expr);
auto parsed =
std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);
LoadStlSortIndex();
auto segpromote = dynamic_cast<ChunkedSegmentSealedImpl*>(seg.get());
BitsetType final;
final = ExecuteQueryExpr(parsed, segpromote, N, MAX_TIMESTAMP);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_TRUE(final[2]);
ASSERT_TRUE(final[3]);
ASSERT_TRUE(final[4]);
}
TEST_F(SealedSegmentRegexQueryTest, PrefixMatchOnInvertedIndexStringField) {
std::string operand = "a";
const auto& str_meta = schema->operator[](FieldName("str"));

View File

@ -376,10 +376,11 @@ TestBuildIndex(int N, int cardinality, int index_type) {
auto index = std::make_unique<milvus::index::StringIndexMarisa>();
index->Build(N, raw_data.data());
return std::move(index);
} else {
auto index = milvus::index::CreateScalarIndexSort<T>();
index->Build(N, raw_data.data());
return std::move(index);
}
auto index = milvus::index::CreateScalarIndexSort<T>();
index->Build(N, raw_data.data());
return std::move(index);
}
}

View File

@ -26,7 +26,6 @@
#include "common/Schema.h"
#include "common/Types.h"
#include "index/ScalarIndexSort.h"
#include "index/StringIndexSort.h"
#include "index/VectorMemIndex.h"
#include "segcore/Collection.h"
#include "segcore/SegmentGrowingImpl.h"
@ -357,7 +356,8 @@ GenerateRandomSparseFloatVector(size_t rows,
return tensor;
}
inline SchemaPtr CreateTestSchema() {
inline SchemaPtr
CreateTestSchema() {
auto schema = std::make_shared<milvus::Schema>();
auto bool_field =
schema->AddDebugField("bool", milvus::DataType::BOOL, true);
@ -1382,15 +1382,11 @@ GenVecIndexing(int64_t N,
template <typename T>
inline index::IndexBasePtr
GenScalarIndexing(int64_t N, const T* data) {
if constexpr (std::is_same_v<T, std::string>) {
auto indexing = index::CreateStringIndexSort();
indexing->Build(N, data);
return indexing;
} else {
auto indexing = index::CreateScalarIndexSort<T>();
indexing->Build(N, data);
return indexing;
}
static_assert(std::is_arithmetic_v<T>,
"ScalarIndexSort only supports arithmetic types");
auto indexing = index::CreateScalarIndexSort<T>();
indexing->Build(N, data);
return indexing;
}
inline std::vector<char>