diff --git a/internal/core/src/index/ScalarIndexSort.cpp b/internal/core/src/index/ScalarIndexSort.cpp index c070300c2c..c8be7fdfca 100644 --- a/internal/core/src/index/ScalarIndexSort.cpp +++ b/internal/core/src/index/ScalarIndexSort.cpp @@ -415,5 +415,4 @@ template class ScalarIndexSort; template class ScalarIndexSort; template class ScalarIndexSort; template class ScalarIndexSort; -template class ScalarIndexSort; } // namespace milvus::index diff --git a/internal/core/src/index/ScalarIndexSort.h b/internal/core/src/index/ScalarIndexSort.h index 0133fd7113..480ab87f49 100644 --- a/internal/core/src/index/ScalarIndexSort.h +++ b/internal/core/src/index/ScalarIndexSort.h @@ -31,6 +31,9 @@ namespace milvus::index { template class ScalarIndexSort : public ScalarIndex { + static_assert(std::is_arithmetic_v, + "ScalarIndexSort only supports arithmetic types"); + public: explicit ScalarIndexSort( const storage::FileManagerContext& file_manager_context = diff --git a/internal/core/src/index/StringIndexSort.h b/internal/core/src/index/StringIndexSort.h deleted file mode 100644 index 1fcc3d1260..0000000000 --- a/internal/core/src/index/StringIndexSort.h +++ /dev/null @@ -1,65 +0,0 @@ -// Licensed to the LF AI & Data foundation under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "common/Utils.h" -#include "index/ScalarIndexSort.h" -#include "index/StringIndex.h" - -namespace milvus::index { -// TODO: should inherit from StringIndex? -class StringIndexSort : public ScalarIndexSort { - public: - const TargetBitmap - Query(const DatasetPtr& dataset) override { - auto op = dataset->Get(OPERATOR_TYPE); - if (op == OpType::PrefixMatch) { - auto prefix = dataset->Get(MATCH_VALUE); - return PrefixMatch(prefix); - } - return ScalarIndex::Query(dataset); - } - - const TargetBitmap - PrefixMatch(std::string_view prefix) { - auto data = GetData(); - TargetBitmap bitset(data.size()); - auto it = std::lower_bound( - data.begin(), - data.end(), - prefix, - [](const IndexStructure& value, - std::string_view prefix) { return value.a_ < prefix; }); - for (; it != data.end(); ++it) { - if (!milvus::PrefixMatch(it->a_, prefix)) { - break; - } - bitset[it->idx_] = true; - } - return bitset; - } -}; -using StringIndexSortPtr = std::unique_ptr; - -inline StringIndexSortPtr -CreateStringIndexSort() { - return std::make_unique(); -} -} // namespace milvus::index diff --git a/internal/core/src/query/ScalarIndex.h b/internal/core/src/query/ScalarIndex.h index b72a68c6d5..6416fffc9c 100644 --- a/internal/core/src/query/ScalarIndex.h +++ b/internal/core/src/query/ScalarIndex.h @@ -14,7 +14,6 @@ #include #include #include "index/ScalarIndexSort.h" -#include "index/StringIndexSort.h" #include "common/FieldMeta.h" #include "common/Span.h" @@ -30,14 +29,6 @@ generate_scalar_index(Span data) { return indexing; } -template <> -inline index::ScalarIndexPtr -generate_scalar_index(Span data) { - auto indexing = index::CreateStringIndexSort(); - indexing->Build(data.row_count(), data.data(), data.valid_data()); - return indexing; -} - inline index::IndexBasePtr generate_scalar_index(SpanBase data, DataType data_type) { Assert(!IsVectorDataType(data_type)); diff --git a/internal/core/src/segcore/FieldIndexing.cpp b/internal/core/src/segcore/FieldIndexing.cpp index ce6834fcf9..06a8c812ba 100644 --- a/internal/core/src/segcore/FieldIndexing.cpp +++ b/internal/core/src/segcore/FieldIndexing.cpp @@ -15,7 +15,6 @@ #include "common/EasyAssert.h" #include "fmt/format.h" #include "index/ScalarIndexSort.h" -#include "index/StringIndexSort.h" #include "common/SystemProperty.h" #include "segcore/FieldIndexing.h" @@ -101,50 +100,6 @@ VectorFieldIndexing::recreate_index(DataType data_type, } } -void -VectorFieldIndexing::BuildIndexRange(int64_t ack_beg, - int64_t ack_end, - const VectorBase* vec_base) { - // No BuildIndexRange support for sparse vector. - AssertInfo(field_meta_.get_data_type() == DataType::VECTOR_FLOAT || - field_meta_.get_data_type() == DataType::VECTOR_FLOAT16 || - field_meta_.get_data_type() == DataType::VECTOR_BFLOAT16, - "Data type of vector field is not in (VECTOR_FLOAT, " - "VECTOR_FLOAT16,VECTOR_BFLOAT16)"); - auto dim = field_meta_.get_dim(); - AssertInfo( - ConcurrentDenseVectorCheck(vec_base, field_meta_.get_data_type()), - "vec_base can't cast to ConcurrentVector type"); - auto num_chunk = vec_base->num_chunk(); - AssertInfo(ack_end <= num_chunk, "ack_end is bigger than num_chunk"); - auto conf = get_build_params(field_meta_.get_data_type()); - data_.grow_to_at_least(ack_end); - for (int chunk_id = ack_beg; chunk_id < ack_end; chunk_id++) { - const auto& chunk_data = vec_base->get_chunk_data(chunk_id); - std::unique_ptr indexing = nullptr; - if (field_meta_.get_data_type() == DataType::VECTOR_FLOAT) { - indexing = std::make_unique>( - knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, - knowhere::metric::L2, - knowhere::Version::GetCurrentVersion().VersionNumber()); - } else if (field_meta_.get_data_type() == DataType::VECTOR_FLOAT16) { - indexing = std::make_unique>( - knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, - knowhere::metric::L2, - knowhere::Version::GetCurrentVersion().VersionNumber()); - } else { - indexing = std::make_unique>( - knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, - knowhere::metric::L2, - knowhere::Version::GetCurrentVersion().VersionNumber()); - } - auto dataset = knowhere::GenDataSet( - vec_base->get_size_per_chunk(), dim, chunk_data); - indexing->BuildWithDataset(dataset, conf); - data_[chunk_id] = std::move(indexing); - } -} - // for sparse float vector: // * element_size is not used // * output_raw pooints at a milvus::schema::proto::SparseFloatArray. @@ -366,35 +321,6 @@ VectorFieldIndexing::has_raw_data() const { return index_->HasRawData(); } -template -void -ScalarFieldIndexing::BuildIndexRange(int64_t ack_beg, - int64_t ack_end, - const VectorBase* vec_base) { - auto source = dynamic_cast*>(vec_base); - AssertInfo(source, "vec_base can't cast to ConcurrentVector type"); - auto num_chunk = source->num_chunk(); - AssertInfo(ack_end <= num_chunk, "Ack_end is bigger than num_chunk"); - data_.grow_to_at_least(ack_end); - for (int chunk_id = ack_beg; chunk_id < ack_end; chunk_id++) { - auto chunk_data = source->get_chunk_data(chunk_id); - // build index for chunk - // seem no lint, not pass valid_data here - // TODO - if constexpr (std::is_same_v) { - auto indexing = index::CreateStringIndexSort(); - indexing->Build(vec_base->get_size_per_chunk(), - static_cast(chunk_data)); - data_[chunk_id] = std::move(indexing); - } else { - auto indexing = index::CreateScalarIndexSort(); - indexing->Build(vec_base->get_size_per_chunk(), - static_cast(chunk_data)); - data_[chunk_id] = std::move(indexing); - } - } -} - std::unique_ptr CreateIndex(const FieldMeta& field_meta, const FieldIndexMeta& field_index_meta, diff --git a/internal/core/src/segcore/FieldIndexing.h b/internal/core/src/segcore/FieldIndexing.h index 95193b5df7..f8c57f820b 100644 --- a/internal/core/src/segcore/FieldIndexing.h +++ b/internal/core/src/segcore/FieldIndexing.h @@ -47,12 +47,6 @@ class FieldIndexing { operator=(const FieldIndexing&) = delete; virtual ~FieldIndexing() = default; - // Do this in parallel - virtual void - BuildIndexRange(int64_t ack_beg, - int64_t ack_end, - const VectorBase* vec_base) = 0; - virtual void AppendSegmentIndexDense(int64_t reserved_offset, int64_t size, @@ -111,11 +105,6 @@ class ScalarFieldIndexing : public FieldIndexing { public: using FieldIndexing::FieldIndexing; - void - BuildIndexRange(int64_t ack_beg, - int64_t ack_end, - const VectorBase* vec_base) override; - void AppendSegmentIndexDense(int64_t reserved_offset, int64_t size, @@ -180,11 +169,6 @@ class VectorFieldIndexing : public FieldIndexing { const SegcoreConfig& segcore_config, const VectorBase* field_raw_data); - void - BuildIndexRange(int64_t ack_beg, - int64_t ack_end, - const VectorBase* vec_base) override; - void AppendSegmentIndexDense(int64_t reserved_offset, int64_t size, diff --git a/internal/core/unittest/test_expr.cpp b/internal/core/unittest/test_expr.cpp index 21202fd8bf..41b1ce66cb 100644 --- a/internal/core/unittest/test_expr.cpp +++ b/internal/core/unittest/test_expr.cpp @@ -6125,7 +6125,7 @@ TEST_P(ExprTest, TestCompareWithScalarIndexMaris) { // load index for int32 field auto str1_col = raw_data.get_col(str1_fid); - auto str1_index = milvus::index::CreateScalarIndexSort(); + auto str1_index = milvus::index::CreateStringIndexMarisa(); str1_index->Build(N, str1_col.data()); load_index_info.field_id = str1_fid.get(); load_index_info.field_type = DataType::VARCHAR; @@ -6281,7 +6281,7 @@ TEST_P(ExprTest, TestCompareWithScalarIndexMarisNullable) { // load index for int32 field auto str1_col = raw_data.get_col(str1_fid); - auto str1_index = milvus::index::CreateScalarIndexSort(); + auto str1_index = milvus::index::CreateStringIndexMarisa(); str1_index->Build(N, str1_col.data()); load_index_info.field_id = str1_fid.get(); load_index_info.field_type = DataType::VARCHAR; @@ -6438,7 +6438,7 @@ TEST_P(ExprTest, TestCompareWithScalarIndexMarisNullable2) { // load index for int32 field auto str1_col = raw_data.get_col(str1_fid); - auto str1_index = milvus::index::CreateScalarIndexSort(); + auto str1_index = milvus::index::CreateStringIndexMarisa(); str1_index->Build(N, str1_col.data()); load_index_info.field_id = str1_fid.get(); load_index_info.field_type = DataType::VARCHAR; diff --git a/internal/core/unittest/test_regex_query.cpp b/internal/core/unittest/test_regex_query.cpp index d2d3224417..4ef44971c8 100644 --- a/internal/core/unittest/test_regex_query.cpp +++ b/internal/core/unittest/test_regex_query.cpp @@ -25,6 +25,8 @@ #include "query/ExecPlanNodeVisitor.h" #include "index/InvertedIndexTantivy.h" #include "test_utils/storage_test_utils.h" +#include "index/StringIndexMarisa.h" + using namespace milvus; using namespace milvus::query; using namespace milvus::segcore; @@ -178,7 +180,7 @@ TEST_F(GrowingSegmentRegexQueryTest, RegexQueryOnJsonField) { ASSERT_TRUE(final[4]); } -struct MockStringIndex : index::StringIndexSort { +struct MockStringIndex : index::StringIndexMarisa { const bool HasRawData() const override { return true; @@ -242,34 +244,14 @@ class SealedSegmentRegexQueryTest : public ::testing::Test { void LoadStlSortIndex() { - { - proto::schema::StringArray arr; - for (int64_t i = 0; i < N; i++) { - *(arr.mutable_data()->Add()) = raw_str[i]; - } - auto index = index::CreateStringIndexSort(); - std::vector buffer(arr.ByteSizeLong()); - ASSERT_TRUE( - arr.SerializeToArray(buffer.data(), arr.ByteSizeLong())); - index->BuildWithRawDataForUT(arr.ByteSizeLong(), buffer.data()); - LoadIndexInfo info{ - .field_id = schema->get_field_id(FieldName("str")).get(), - .index_params = GenIndexParams(index.get()), - .cache_index = CreateTestCacheIndex("test", std::move(index)), - }; - seg->LoadIndex(info); - } - { - auto index = index::CreateScalarIndexSort(); - index->BuildWithRawDataForUT(N, raw_int.data()); - LoadIndexInfo info{ - .field_id = - schema->get_field_id(FieldName("another_int64")).get(), - .index_params = GenIndexParams(index.get()), - .cache_index = CreateTestCacheIndex("test", std::move(index)), - }; - seg->LoadIndex(info); - } + auto index = index::CreateScalarIndexSort(); + index->BuildWithRawDataForUT(N, raw_int.data()); + LoadIndexInfo info{ + .field_id = schema->get_field_id(FieldName("another_int64")).get(), + .index_params = GenIndexParams(index.get()), + .cache_index = CreateTestCacheIndex("test", std::move(index)), + }; + seg->LoadIndex(info); } void @@ -407,35 +389,6 @@ TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnIndexedNonStringField) { ASSERT_ANY_THROW(ExecuteQueryExpr(parsed, segpromote, N, MAX_TIMESTAMP)); } -TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnStlSortStringField) { - std::string operand = "a%"; - const auto& str_meta = schema->operator[](FieldName("str")); - auto column_info = test::GenColumnInfo(str_meta.get_id().get(), - proto::schema::DataType::VarChar, - false, - false); - auto unary_range_expr = test::GenUnaryRangeExpr(OpType::Match, operand); - unary_range_expr->set_allocated_column_info(column_info); - auto expr = test::GenExpr(); - expr->set_allocated_unary_range_expr(unary_range_expr); - - auto parser = ProtoParser(schema); - auto typed_expr = parser.ParseExprs(*expr); - auto parsed = - std::make_shared(DEFAULT_PLANNODE_ID, typed_expr); - - LoadStlSortIndex(); - - auto segpromote = dynamic_cast(seg.get()); - BitsetType final; - final = ExecuteQueryExpr(parsed, segpromote, N, MAX_TIMESTAMP); - ASSERT_FALSE(final[0]); - ASSERT_TRUE(final[1]); - ASSERT_TRUE(final[2]); - ASSERT_TRUE(final[3]); - ASSERT_TRUE(final[4]); -} - TEST_F(SealedSegmentRegexQueryTest, PrefixMatchOnInvertedIndexStringField) { std::string operand = "a"; const auto& str_meta = schema->operator[](FieldName("str")); diff --git a/internal/core/unittest/test_scalar_index.cpp b/internal/core/unittest/test_scalar_index.cpp index 8ca5067d00..1e2cb2352b 100644 --- a/internal/core/unittest/test_scalar_index.cpp +++ b/internal/core/unittest/test_scalar_index.cpp @@ -376,10 +376,11 @@ TestBuildIndex(int N, int cardinality, int index_type) { auto index = std::make_unique(); index->Build(N, raw_data.data()); return std::move(index); + } else { + auto index = milvus::index::CreateScalarIndexSort(); + index->Build(N, raw_data.data()); + return std::move(index); } - auto index = milvus::index::CreateScalarIndexSort(); - index->Build(N, raw_data.data()); - return std::move(index); } } diff --git a/internal/core/unittest/test_utils/DataGen.h b/internal/core/unittest/test_utils/DataGen.h index f54a15fdf7..b81d950505 100644 --- a/internal/core/unittest/test_utils/DataGen.h +++ b/internal/core/unittest/test_utils/DataGen.h @@ -26,7 +26,6 @@ #include "common/Schema.h" #include "common/Types.h" #include "index/ScalarIndexSort.h" -#include "index/StringIndexSort.h" #include "index/VectorMemIndex.h" #include "segcore/Collection.h" #include "segcore/SegmentGrowingImpl.h" @@ -357,7 +356,8 @@ GenerateRandomSparseFloatVector(size_t rows, return tensor; } -inline SchemaPtr CreateTestSchema() { +inline SchemaPtr +CreateTestSchema() { auto schema = std::make_shared(); auto bool_field = schema->AddDebugField("bool", milvus::DataType::BOOL, true); @@ -1382,15 +1382,11 @@ GenVecIndexing(int64_t N, template inline index::IndexBasePtr GenScalarIndexing(int64_t N, const T* data) { - if constexpr (std::is_same_v) { - auto indexing = index::CreateStringIndexSort(); - indexing->Build(N, data); - return indexing; - } else { - auto indexing = index::CreateScalarIndexSort(); - indexing->Build(N, data); - return indexing; - } + static_assert(std::is_arithmetic_v, + "ScalarIndexSort only supports arithmetic types"); + auto indexing = index::CreateScalarIndexSort(); + indexing->Build(N, data); + return indexing; } inline std::vector