From 39e7ad33d7eced450f13a25d09de187b9acea0bc Mon Sep 17 00:00:00 2001 From: zhagnlu <1542303831@qq.com> Date: Thu, 8 May 2025 14:28:52 +0800 Subject: [PATCH] enhance: add optimize for like expr (#41066) #41065 Signed-off-by: luzhang Co-authored-by: luzhang --- internal/core/src/common/Utils.h | 8 + internal/core/src/exec/expression/Expr.cpp | 1 + .../core/src/exec/expression/UnaryExpr.cpp | 69 +++++++++ internal/core/src/exec/expression/UnaryExpr.h | 88 ++++++----- internal/core/src/index/BitmapIndex.cpp | 62 ++++---- internal/core/src/index/BitmapIndex.h | 24 ++- internal/core/src/index/HybridScalarIndex.h | 11 +- .../core/src/index/InvertedIndexTantivy.cpp | 2 +- .../core/src/index/InvertedIndexTantivy.h | 30 +++- internal/core/src/index/Meta.h | 2 +- internal/core/src/index/ScalarIndex.h | 2 +- internal/core/src/index/StringIndex.h | 2 +- internal/core/src/index/StringIndexSort.h | 2 +- internal/core/src/query/Utils.h | 4 + internal/core/unittest/test_expr.cpp | 54 +++++++ .../core/unittest/test_inverted_index.cpp | 2 +- internal/core/unittest/test_regex_query.cpp | 90 +++++++++++ internal/core/unittest/test_string_index.cpp | 2 +- internal/core/unittest/test_utils.cpp | 25 ++- internal/parser/planparserv2/pattern_match.go | 107 +++++++------ .../parser/planparserv2/pattern_match_test.go | 143 +++++++----------- pkg/proto/plan.proto | 1 + pkg/proto/planpb/plan.pb.go | 8 +- tests/python_client/testcases/test_query.py | 122 ++++++++++++++- 24 files changed, 626 insertions(+), 235 deletions(-) diff --git a/internal/core/src/common/Utils.h b/internal/core/src/common/Utils.h index a35cafda60..d30ecd6cc8 100644 --- a/internal/core/src/common/Utils.h +++ b/internal/core/src/common/Utils.h @@ -138,6 +138,14 @@ PostfixMatch(const std::string_view str, const std::string_view postfix) { return true; } +inline bool +InnerMatch(const std::string_view str, const std::string_view pattern) { + if (pattern.length() > str.length()) { + return false; + } + return 
str.find(pattern) != std::string::npos; +} + inline int64_t upper_align(int64_t value, int64_t align) { Assert(align > 0); diff --git a/internal/core/src/exec/expression/Expr.cpp b/internal/core/src/exec/expression/Expr.cpp index ba9ce61cee..b28c7f15ec 100644 --- a/internal/core/src/exec/expression/Expr.cpp +++ b/internal/core/src/exec/expression/Expr.cpp @@ -321,6 +321,7 @@ IsLikeExpr(std::shared_ptr input) { switch (optype) { case proto::plan::PrefixMatch: case proto::plan::PostfixMatch: + case proto::plan::InnerMatch: case proto::plan::Match: return true; default: diff --git a/internal/core/src/exec/expression/UnaryExpr.cpp b/internal/core/src/exec/expression/UnaryExpr.cpp index f0776434d3..0231b13a23 100644 --- a/internal/core/src/exec/expression/UnaryExpr.cpp +++ b/internal/core/src/exec/expression/UnaryExpr.cpp @@ -468,6 +468,40 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray(EvalCtx& context) { offsets); break; } + case proto::plan::PostfixMatch: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + bitmap_input, + processed_cursor, + offsets); + break; + } + case proto::plan::InnerMatch: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + bitmap_input, + processed_cursor, + offsets); + break; + } default: PanicInfo( OpTypeInvalid, @@ -831,6 +865,8 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(EvalCtx& context) { } break; } + case proto::plan::InnerMatch: + case proto::plan::PostfixMatch: case proto::plan::PrefixMatch: { for (size_t i = 0; i < size; ++i) { auto offset = i; @@ -1483,6 +1519,16 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForIndex() { res = std::move(func(index_ptr, val)); break; } + case proto::plan::PostfixMatch: { + UnaryIndexFunc func; + res = std::move(func(index_ptr, val)); + break; + } + case proto::plan::InnerMatch: { + UnaryIndexFunc func; + res = std::move(func(index_ptr, val)); + break; + } case 
proto::plan::Match: { UnaryIndexFunc func; res = std::move(func(index_ptr, val)); @@ -1690,6 +1736,29 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData(EvalCtx& context) { offsets); break; } + case proto::plan::PostfixMatch: { + UnaryElementFunc + func; + func(data, + size, + val, + res, + bitmap_input, + processed_cursor, + offsets); + break; + } + case proto::plan::InnerMatch: { + UnaryElementFunc func; + func(data, + size, + val, + res, + bitmap_input, + processed_cursor, + offsets); + break; + } case proto::plan::Match: { UnaryElementFunc func; func(data, diff --git a/internal/core/src/exec/expression/UnaryExpr.h b/internal/core/src/exec/expression/UnaryExpr.h index a82884af0a..01f4e67823 100644 --- a/internal/core/src/exec/expression/UnaryExpr.h +++ b/internal/core/src/exec/expression/UnaryExpr.h @@ -106,9 +106,10 @@ struct UnaryElementFunc { res[i] = src[offset] >= val; } else if constexpr (op == proto::plan::OpType::LessEqual) { res[i] = src[offset] <= val; - } else if constexpr (op == proto::plan::OpType::PrefixMatch) { - res[i] = milvus::query::Match( - src[offset], val, proto::plan::OpType::PrefixMatch); + } else if constexpr (op == proto::plan::OpType::PrefixMatch || + op == proto::plan::OpType::PostfixMatch || + op == proto::plan::OpType::InnerMatch) { + res[i] = milvus::query::Match(src[offset], val, op); } else { PanicInfo( OpTypeInvalid, @@ -119,12 +120,7 @@ struct UnaryElementFunc { return; } - if constexpr (op == proto::plan::OpType::PrefixMatch) { - for (int i = 0; i < size; ++i) { - res[i] = milvus::query::Match( - src[i], val, proto::plan::OpType::PrefixMatch); - } - } else if constexpr (op == proto::plan::OpType::Equal) { + if constexpr (op == proto::plan::OpType::Equal) { res.inplace_compare_val( src, size, val); } else if constexpr (op == proto::plan::OpType::NotEqual) { @@ -225,7 +221,9 @@ struct UnaryElementFuncForArray { UnaryArrayCompare(array_data >= val); } else if constexpr (op == proto::plan::OpType::LessEqual) { 
UnaryArrayCompare(array_data <= val); - } else if constexpr (op == proto::plan::OpType::PrefixMatch) { + } else if constexpr (op == proto::plan::OpType::PrefixMatch || + op == proto::plan::OpType::PostfixMatch || + op == proto::plan::OpType::InnerMatch) { UnaryArrayCompare(milvus::query::Match(array_data, val, op)); } else if constexpr (op == proto::plan::OpType::Match) { if constexpr (std::is_same_v) { @@ -258,36 +256,57 @@ struct UnaryIndexFuncForMatch { std::conditional_t, std::string, T>; using Index = index::ScalarIndex; TargetBitmap - operator()(Index* index, IndexInnerType val) { - if constexpr (!std::is_same_v && - !std::is_same_v) { - PanicInfo(Unsupported, "regex query is only supported on string"); - } else { - if (index->SupportRegexQuery()) { - return index->PatternMatch(val); + operator()(Index* index, IndexInnerType val, proto::plan::OpType op) { + AssertInfo(op == proto::plan::OpType::Match || + op == proto::plan::OpType::PostfixMatch || + op == proto::plan::OpType::InnerMatch || + op == proto::plan::OpType::PrefixMatch, + "op must be one of the following: Match, PrefixMatch, " + "PostfixMatch, InnerMatch"); + + if constexpr (std::is_same_v || + std::is_same_v) { + if (index->SupportPatternMatch()) { + return index->PatternMatch(val, op); } + if (!index->HasRawData()) { PanicInfo(Unsupported, "index don't support regex query and don't have " "raw data"); } - // retrieve raw data to do brute force query, may be very slow. 
auto cnt = index->Count(); TargetBitmap res(cnt); - PatternMatchTranslator translator; - auto regex_pattern = translator(val); - RegexMatcher matcher(regex_pattern); - for (int64_t i = 0; i < cnt; i++) { - auto raw = index->Reverse_Lookup(i); - if (!raw.has_value()) { - res[i] = false; - continue; + if (op == proto::plan::OpType::InnerMatch || + op == proto::plan::OpType::PostfixMatch || + op == proto::plan::OpType::PrefixMatch) { + for (int64_t i = 0; i < cnt; i++) { + auto raw = index->Reverse_Lookup(i); + if (!raw.has_value()) { + res[i] = false; + continue; + } + res[i] = milvus::query::Match(raw.value(), val, op); } - res[i] = matcher(raw.value()); + return res; + } else { + PatternMatchTranslator translator; + auto regex_pattern = translator(val); + RegexMatcher matcher(regex_pattern); + for (int64_t i = 0; i < cnt; i++) { + auto raw = index->Reverse_Lookup(i); + if (!raw.has_value()) { + res[i] = false; + continue; + } + res[i] = matcher(raw.value()); + } + return res; } - return res; } + PanicInfo(ErrorCode::Unsupported, + "UnaryIndexFuncForMatch is only supported on string types"); } }; @@ -310,15 +329,12 @@ struct UnaryIndexFunc { return index->Range(val, OpType::GreaterEqual); } else if constexpr (op == proto::plan::OpType::LessEqual) { return index->Range(val, OpType::LessEqual); - } else if constexpr (op == proto::plan::OpType::PrefixMatch) { - auto dataset = std::make_unique(); - dataset->Set(milvus::index::OPERATOR_TYPE, - proto::plan::OpType::PrefixMatch); - dataset->Set(milvus::index::PREFIX_VALUE, val); - return index->Query(std::move(dataset)); - } else if constexpr (op == proto::plan::OpType::Match) { + } else if constexpr (op == proto::plan::OpType::PrefixMatch || + op == proto::plan::OpType::Match || + op == proto::plan::OpType::PostfixMatch || + op == proto::plan::OpType::InnerMatch) { UnaryIndexFuncForMatch func; - return func(index, val); + return func(index, val, op); } else { PanicInfo( OpTypeInvalid, diff --git 
a/internal/core/src/index/BitmapIndex.cpp b/internal/core/src/index/BitmapIndex.cpp index 8734778360..e18dcc9720 100644 --- a/internal/core/src/index/BitmapIndex.cpp +++ b/internal/core/src/index/BitmapIndex.cpp @@ -1235,45 +1235,39 @@ BitmapIndex::Query(const DatasetPtr& dataset) { AssertInfo(is_built_, "index has not been built"); auto op = dataset->Get(OPERATOR_TYPE); - if (op == OpType::PrefixMatch) { - auto prefix = dataset->Get(PREFIX_VALUE); - TargetBitmap res(total_num_rows_, false); - if (is_mmap_) { - for (auto it = bitmap_info_map_.begin(); - it != bitmap_info_map_.end(); - ++it) { - const auto& key = it->first; - if (milvus::query::Match(key, prefix, op)) { - for (const auto& v : it->second) { - res.set(v); - } - } - } - return res; - } - if (build_mode_ == BitmapIndexBuildMode::ROARING) { - for (auto it = data_.begin(); it != data_.end(); ++it) { - const auto& key = it->first; - if (milvus::query::Match(key, prefix, op)) { - for (const auto& v : it->second) { - res.set(v); - } - } - } - } else { - for (auto it = bitsets_.begin(); it != bitsets_.end(); ++it) { - const auto& key = it->first; - if (milvus::query::Match(key, prefix, op)) { - res |= it->second; + auto val = dataset->Get(MATCH_VALUE); + TargetBitmap res(total_num_rows_, false); + if (is_mmap_) { + for (auto it = bitmap_info_map_.begin(); it != bitmap_info_map_.end(); + ++it) { + const auto& key = it->first; + if (milvus::query::Match(key, val, op)) { + for (const auto& v : it->second) { + res.set(v); } } } - return res; - } else { - PanicInfo(OpTypeInvalid, - fmt::format("unsupported op_type:{} for bitmap query", op)); } + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + for (auto it = data_.begin(); it != data_.end(); ++it) { + const auto& key = it->first; + if (milvus::query::Match(key, val, op)) { + for (const auto& v : it->second) { + res.set(v); + } + } + } + } else { + for (auto it = bitsets_.begin(); it != bitsets_.end(); ++it) { + const auto& key = it->first; + if 
(milvus::query::Match(key, val, op)) { + res |= it->second; + } + } + } + + return res; } template diff --git a/internal/core/src/index/BitmapIndex.h b/internal/core/src/index/BitmapIndex.h index f146522785..f5afa6e1e4 100644 --- a/internal/core/src/index/BitmapIndex.h +++ b/internal/core/src/index/BitmapIndex.h @@ -138,10 +138,26 @@ class BitmapIndex : public ScalarIndex { } const TargetBitmap - PatternMatch(const std::string& pattern) override { - PatternMatchTranslator translator; - auto regex_pattern = translator(pattern); - return RegexQuery(regex_pattern); + PatternMatch(const std::string& pattern, proto::plan::OpType op) override { + switch (op) { + case proto::plan::OpType::PrefixMatch: + case proto::plan::OpType::PostfixMatch: + case proto::plan::OpType::InnerMatch: { + auto dataset = std::make_unique(); + dataset->Set(milvus::index::OPERATOR_TYPE, op); + dataset->Set(milvus::index::MATCH_VALUE, pattern); + return Query(std::move(dataset)); + } + case proto::plan::OpType::Match: { + PatternMatchTranslator translator; + auto regex_pattern = translator(pattern); + return RegexQuery(regex_pattern); + } + default: + PanicInfo(ErrorCode::OpTypeInvalid, + "not supported op type: {} for index PatterMatch", + op); + } } bool diff --git a/internal/core/src/index/HybridScalarIndex.h b/internal/core/src/index/HybridScalarIndex.h index 3bcc349fea..432c16cc7f 100644 --- a/internal/core/src/index/HybridScalarIndex.h +++ b/internal/core/src/index/HybridScalarIndex.h @@ -105,11 +105,14 @@ class HybridScalarIndex : public ScalarIndex { return internal_index_->Query(dataset); } + bool + SupportPatternMatch() const override { + return internal_index_->SupportPatternMatch(); + } + const TargetBitmap - PatternMatch(const std::string& pattern) override { - PatternMatchTranslator translator; - auto regex_pattern = translator(pattern); - return RegexQuery(regex_pattern); + PatternMatch(const std::string& pattern, proto::plan::OpType op) override { + return 
internal_index_->PatternMatch(pattern, op); } bool diff --git a/internal/core/src/index/InvertedIndexTantivy.cpp b/internal/core/src/index/InvertedIndexTantivy.cpp index 599fca03ff..eeb5466443 100644 --- a/internal/core/src/index/InvertedIndexTantivy.cpp +++ b/internal/core/src/index/InvertedIndexTantivy.cpp @@ -423,7 +423,7 @@ const TargetBitmap InvertedIndexTantivy::Query(const DatasetPtr& dataset) { auto op = dataset->Get(OPERATOR_TYPE); if (op == OpType::PrefixMatch) { - auto prefix = dataset->Get(PREFIX_VALUE); + auto prefix = dataset->Get(MATCH_VALUE); return PrefixMatch(prefix); } return ScalarIndex::Query(dataset); diff --git a/internal/core/src/index/InvertedIndexTantivy.h b/internal/core/src/index/InvertedIndexTantivy.h index 6bd08ea68d..059107a49a 100644 --- a/internal/core/src/index/InvertedIndexTantivy.h +++ b/internal/core/src/index/InvertedIndexTantivy.h @@ -190,10 +190,32 @@ class InvertedIndexTantivy : public ScalarIndex { Query(const DatasetPtr& dataset) override; const TargetBitmap - PatternMatch(const std::string& pattern) override { - PatternMatchTranslator translator; - auto regex_pattern = translator(pattern); - return RegexQuery(regex_pattern); + PatternMatch(const std::string& pattern, proto::plan::OpType op) override { + switch (op) { + case proto::plan::OpType::PrefixMatch: { + return PrefixMatch(pattern); + } + case proto::plan::OpType::PostfixMatch: { + PatternMatchTranslator translator; + auto regex_pattern = translator(fmt::format("%{}", pattern)); + return RegexQuery(regex_pattern); + } + case proto::plan::OpType::InnerMatch: { + PatternMatchTranslator translator; + auto regex_pattern = translator(fmt::format("%{}%", pattern)); + return RegexQuery(regex_pattern); + } + case proto::plan::OpType::Match: { + PatternMatchTranslator translator; + auto regex_pattern = translator(pattern); + return RegexQuery(regex_pattern); + } + default: + PanicInfo( + ErrorCode::OpTypeInvalid, + "not supported op type: {} for inverted index 
PatternMatch", + op); + } } bool diff --git a/internal/core/src/index/Meta.h b/internal/core/src/index/Meta.h index 933faaea52..8010a1c024 100644 --- a/internal/core/src/index/Meta.h +++ b/internal/core/src/index/Meta.h @@ -25,7 +25,7 @@ constexpr const char* LOWER_BOUND_VALUE = "lower_bound_value"; constexpr const char* LOWER_BOUND_INCLUSIVE = "lower_bound_inclusive"; constexpr const char* UPPER_BOUND_VALUE = "upper_bound_value"; constexpr const char* UPPER_BOUND_INCLUSIVE = "upper_bound_inclusive"; -constexpr const char* PREFIX_VALUE = "prefix_value"; +constexpr const char* MATCH_VALUE = "match_value"; // below configurations will be persistent, do not edit them. constexpr const char* MARISA_TRIE_INDEX = "marisa_trie_index"; constexpr const char* MARISA_STR_IDS = "marisa_trie_str_ids"; diff --git a/internal/core/src/index/ScalarIndex.h b/internal/core/src/index/ScalarIndex.h index 0ac613f76a..4dc58b586a 100644 --- a/internal/core/src/index/ScalarIndex.h +++ b/internal/core/src/index/ScalarIndex.h @@ -130,7 +130,7 @@ class ScalarIndex : public IndexBase { } virtual const TargetBitmap - PatternMatch(const std::string& pattern) { + PatternMatch(const std::string& pattern, proto::plan::OpType op) { PanicInfo(Unsupported, "pattern match is not supported"); } diff --git a/internal/core/src/index/StringIndex.h b/internal/core/src/index/StringIndex.h index 3aa84927b1..2a8ae951ac 100644 --- a/internal/core/src/index/StringIndex.h +++ b/internal/core/src/index/StringIndex.h @@ -37,7 +37,7 @@ class StringIndex : public ScalarIndex { Query(const DatasetPtr& dataset) override { auto op = dataset->Get(OPERATOR_TYPE); if (op == OpType::PrefixMatch) { - auto prefix = dataset->Get(PREFIX_VALUE); + auto prefix = dataset->Get(MATCH_VALUE); return PrefixMatch(prefix); } return ScalarIndex::Query(dataset); diff --git a/internal/core/src/index/StringIndexSort.h b/internal/core/src/index/StringIndexSort.h index 30d4343a0b..1fcc3d1260 100644 --- 
a/internal/core/src/index/StringIndexSort.h +++ b/internal/core/src/index/StringIndexSort.h @@ -31,7 +31,7 @@ class StringIndexSort : public ScalarIndexSort { Query(const DatasetPtr& dataset) override { auto op = dataset->Get(OPERATOR_TYPE); if (op == OpType::PrefixMatch) { - auto prefix = dataset->Get(PREFIX_VALUE); + auto prefix = dataset->Get(MATCH_VALUE); return PrefixMatch(prefix); } return ScalarIndex::Query(dataset); diff --git a/internal/core/src/query/Utils.h b/internal/core/src/query/Utils.h index 8eb535d72d..67657b8575 100644 --- a/internal/core/src/query/Utils.h +++ b/internal/core/src/query/Utils.h @@ -32,6 +32,8 @@ Match(const std::string& str, const std::string& val, OpType op) { return PrefixMatch(str, val); case OpType::PostfixMatch: return PostfixMatch(str, val); + case OpType::InnerMatch: + return InnerMatch(str, val); default: PanicInfo(OpTypeInvalid, "not supported"); } @@ -47,6 +49,8 @@ Match(const std::string_view& str, return PrefixMatch(str, val); case OpType::PostfixMatch: return PostfixMatch(str, val); + case OpType::InnerMatch: + return InnerMatch(str, val); default: PanicInfo(OpTypeInvalid, "not supported"); } diff --git a/internal/core/unittest/test_expr.cpp b/internal/core/unittest/test_expr.cpp index 865e0f33c0..91c83f001a 100644 --- a/internal/core/unittest/test_expr.cpp +++ b/internal/core/unittest/test_expr.cpp @@ -34,6 +34,7 @@ #include "gtest/gtest.h" #include "index/Meta.h" #include "index/JsonInvertedIndex.h" +#include "index/BitmapIndex.h" #include "knowhere/comp/index_param.h" #include "mmap/Types.h" #include "pb/plan.pb.h" @@ -5514,6 +5515,59 @@ TEST_P(ExprTest, TestBinaryArithOpEvalRangeBenchExpr) { } } +TEST(BitmapIndexTest, PatternMatchTest) { + // Initialize bitmap index + using namespace milvus::index; + BitmapIndex index; + + // Add test data + std::vector data = {"apple", "banana", "orange", "pear"}; + + // Build index + index.Build(data.size(), data.data(), nullptr); + + // Create test datasets with different 
operators + auto prefix_dataset = std::make_shared(); + prefix_dataset->Set(OPERATOR_TYPE, OpType::PrefixMatch); + prefix_dataset->Set(MATCH_VALUE, std::string("a")); + + auto contains_dataset = std::make_shared(); + contains_dataset->Set(OPERATOR_TYPE, OpType::InnerMatch); + contains_dataset->Set(MATCH_VALUE, std::string("an")); + + auto posix_dataset = std::make_shared(); + posix_dataset->Set(OPERATOR_TYPE, OpType::PostfixMatch); + posix_dataset->Set(MATCH_VALUE, std::string("a")); + + // Execute queries + auto prefix_result = index.Query(prefix_dataset); + auto contains_result = index.Query(contains_dataset); + auto posix_result = index.Query(posix_dataset); + + // Verify results + EXPECT_TRUE(prefix_result[0]); + EXPECT_FALSE(prefix_result[2]); + + EXPECT_FALSE(contains_result[0]); + EXPECT_TRUE(contains_result[1]); + EXPECT_TRUE(contains_result[2]); + + EXPECT_FALSE(posix_result[0]); + EXPECT_TRUE(posix_result[1]); + EXPECT_FALSE(posix_result[2]); + + auto prefix_result2 = + index.PatternMatch(std::string("a"), OpType::PrefixMatch); + auto contains_result2 = + index.PatternMatch(std::string("an"), OpType::InnerMatch); + auto posix_result2 = + index.PatternMatch(std::string("a"), OpType::PostfixMatch); + + EXPECT_TRUE(prefix_result == prefix_result2); + EXPECT_TRUE(contains_result == contains_result2); + EXPECT_TRUE(posix_result == posix_result2); +} + TEST(Expr, TestExprNull) { auto schema = std::make_shared(); auto bool_fid = schema->AddDebugField("bool", DataType::BOOL, true); diff --git a/internal/core/unittest/test_inverted_index.cpp b/internal/core/unittest/test_inverted_index.cpp index 9cac17400b..f196f1b79e 100644 --- a/internal/core/unittest/test_inverted_index.cpp +++ b/internal/core/unittest/test_inverted_index.cpp @@ -795,7 +795,7 @@ test_string() { auto dataset = std::make_shared(); auto prefix = data[0]; dataset->Set(index::OPERATOR_TYPE, OpType::PrefixMatch); - dataset->Set(index::PREFIX_VALUE, prefix); + dataset->Set(index::MATCH_VALUE, prefix); 
auto bitset = real_index->Query(dataset); ASSERT_EQ(cnt, bitset.size()); size_t start = 0; diff --git a/internal/core/unittest/test_regex_query.cpp b/internal/core/unittest/test_regex_query.cpp index 01f2dcd83f..336f85af7b 100644 --- a/internal/core/unittest/test_regex_query.cpp +++ b/internal/core/unittest/test_regex_query.cpp @@ -435,6 +435,66 @@ TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnStlSortStringField) { ASSERT_TRUE(final[4]); } +TEST_F(SealedSegmentRegexQueryTest, PrefixMatchOnInvertedIndexStringField) { + std::string operand = "a"; + const auto& str_meta = schema->operator[](FieldName("str")); + auto column_info = test::GenColumnInfo(str_meta.get_id().get(), + proto::schema::DataType::VarChar, + false, + false); + auto unary_range_expr = + test::GenUnaryRangeExpr(OpType::PrefixMatch, operand); + unary_range_expr->set_allocated_column_info(column_info); + auto expr = test::GenExpr(); + expr->set_allocated_unary_range_expr(unary_range_expr); + + auto parser = ProtoParser(*schema); + auto typed_expr = parser.ParseExprs(*expr); + auto parsed = + std::make_shared(DEFAULT_PLANNODE_ID, typed_expr); + + LoadInvertedIndex(); + + auto segpromote = dynamic_cast(seg.get()); + BitsetType final; + final = ExecuteQueryExpr(parsed, segpromote, N, MAX_TIMESTAMP); + ASSERT_FALSE(final[0]); + ASSERT_TRUE(final[1]); + ASSERT_TRUE(final[2]); + ASSERT_TRUE(final[3]); + ASSERT_TRUE(final[4]); +} + +TEST_F(SealedSegmentRegexQueryTest, InnerMatchOnInvertedIndexStringField) { + std::string operand = "a"; + const auto& str_meta = schema->operator[](FieldName("str")); + auto column_info = test::GenColumnInfo(str_meta.get_id().get(), + proto::schema::DataType::VarChar, + false, + false); + auto unary_range_expr = + test::GenUnaryRangeExpr(OpType::InnerMatch, operand); + unary_range_expr->set_allocated_column_info(column_info); + auto expr = test::GenExpr(); + expr->set_allocated_unary_range_expr(unary_range_expr); + + auto parser = ProtoParser(*schema); + auto typed_expr = 
parser.ParseExprs(*expr); + auto parsed = + std::make_shared(DEFAULT_PLANNODE_ID, typed_expr); + + LoadInvertedIndex(); + + auto segpromote = dynamic_cast(seg.get()); + BitsetType final; + final = ExecuteQueryExpr(parsed, segpromote, N, MAX_TIMESTAMP); + ASSERT_FALSE(final[0]); + ASSERT_TRUE(final[1]); + ASSERT_TRUE(final[2]); + ASSERT_TRUE(final[3]); + ASSERT_TRUE(final[4]); +} + TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnInvertedIndexStringField) { std::string operand = "a%"; const auto& str_meta = schema->operator[](FieldName("str")); @@ -464,6 +524,36 @@ TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnInvertedIndexStringField) { ASSERT_TRUE(final[4]); } +TEST_F(SealedSegmentRegexQueryTest, PostfixMatchOnInvertedIndexStringField) { + std::string operand = "a"; + const auto& str_meta = schema->operator[](FieldName("str")); + auto column_info = test::GenColumnInfo(str_meta.get_id().get(), + proto::schema::DataType::VarChar, + false, + false); + auto unary_range_expr = + test::GenUnaryRangeExpr(OpType::PostfixMatch, operand); + unary_range_expr->set_allocated_column_info(column_info); + auto expr = test::GenExpr(); + expr->set_allocated_unary_range_expr(unary_range_expr); + + auto parser = ProtoParser(*schema); + auto typed_expr = parser.ParseExprs(*expr); + auto parsed = + std::make_shared(DEFAULT_PLANNODE_ID, typed_expr); + + LoadInvertedIndex(); + + auto segpromote = dynamic_cast(seg.get()); + BitsetType final; + final = ExecuteQueryExpr(parsed, segpromote, N, MAX_TIMESTAMP); + ASSERT_FALSE(final[0]); + ASSERT_FALSE(final[1]); + ASSERT_FALSE(final[2]); + ASSERT_FALSE(final[3]); + ASSERT_FALSE(final[4]); +} + TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnUnsupportedIndex) { std::string operand = "a%"; const auto& str_meta = schema->operator[](FieldName("str")); diff --git a/internal/core/unittest/test_string_index.cpp b/internal/core/unittest/test_string_index.cpp index 20b8002a93..1ddee32fb7 100644 --- a/internal/core/unittest/test_string_index.cpp +++ 
b/internal/core/unittest/test_string_index.cpp @@ -221,7 +221,7 @@ TEST_F(StringIndexMarisaTest, Query) { auto ds = std::make_shared(); ds->Set(milvus::index::OPERATOR_TYPE, milvus::OpType::PrefixMatch); - ds->Set(milvus::index::PREFIX_VALUE, + ds->Set(milvus::index::MATCH_VALUE, std::move(strs[i])); auto bitset = index->Query(ds); ASSERT_EQ(bitset.size(), strs.size()); diff --git a/internal/core/unittest/test_utils.cpp b/internal/core/unittest/test_utils.cpp index b0e496cb97..6a5fa987b2 100644 --- a/internal/core/unittest/test_utils.cpp +++ b/internal/core/unittest/test_utils.cpp @@ -40,16 +40,39 @@ TEST(Util, StringMatch) { ASSERT_TRUE(PrefixMatch("prefix1", "prefix")); ASSERT_TRUE(PostfixMatch("1postfix", "postfix")); + ASSERT_TRUE(InnerMatch("xxinner1xx", "inner")); ASSERT_TRUE(Match( std::string("prefix1"), std::string("prefix"), OpType::PrefixMatch)); ASSERT_TRUE(Match( std::string("1postfix"), std::string("postfix"), OpType::PostfixMatch)); + ASSERT_TRUE(Match(std::string("xxpostfixxx"), + std::string("postfix"), + OpType::InnerMatch)); ASSERT_FALSE(PrefixMatch("", "longer")); ASSERT_FALSE(PostfixMatch("", "longer")); + ASSERT_FALSE(InnerMatch("", "longer")); ASSERT_FALSE(PrefixMatch("dontmatch", "prefix")); - ASSERT_FALSE(PostfixMatch("dontmatch", "postfix")); + ASSERT_FALSE(InnerMatch("dontmatch", "postfix")); + + ASSERT_TRUE(Match(std::string_view("prefix1"), + std::string("prefix"), + OpType::PrefixMatch)); + + ASSERT_TRUE(Match(std::string_view("1postfix"), + std::string("postfix"), + OpType::PostfixMatch)); + + ASSERT_TRUE(Match(std::string_view("xxpostfixxx"), + std::string("postfix"), + OpType::InnerMatch)); + ASSERT_TRUE( + Match(std::string_view("x"), std::string("x"), OpType::PrefixMatch)); + ASSERT_FALSE( + Match(std::string_view(""), std::string("x"), OpType::InnerMatch)); + ASSERT_TRUE( + Match(std::string_view("x"), std::string(""), OpType::InnerMatch)); } TEST(Util, GetDeleteBitmap) { diff --git a/internal/parser/planparserv2/pattern_match.go 
b/internal/parser/planparserv2/pattern_match.go index f524e4b369..7eafa0ccc7 100644 --- a/internal/parser/planparserv2/pattern_match.go +++ b/internal/parser/planparserv2/pattern_match.go @@ -13,65 +13,74 @@ var wildcards = map[byte]struct{}{ var escapeCharacter byte = '\\' -// hasWildcards returns true if pattern contains any wildcard. -func hasWildcards(pattern string) (string, bool) { - var result strings.Builder - hasWildcard := false - - for i := 0; i < len(pattern); i++ { - if pattern[i] == escapeCharacter && i+1 < len(pattern) { - next := pattern[i+1] - if _, ok := wildcards[next]; ok { - result.WriteByte(next) - i++ - continue - } - } - - if _, ok := wildcards[pattern[i]]; ok { - hasWildcard = true - } - result.WriteByte(pattern[i]) +func optimizeLikePattern(pattern string) (planpb.OpType, string, bool) { + if len(pattern) == 0 { + return planpb.OpType_Equal, "", true } - return result.String(), hasWildcard -} + if pattern == "%" || pattern == "%%" { + return planpb.OpType_PrefixMatch, "", true + } -// findLastNotOfWildcards find the last location not of last wildcard. 
-func findLastNotOfWildcards(pattern string) int { - loc := len(pattern) - 1 - for ; loc >= 0; loc-- { - _, ok := wildcards[pattern[loc]] - if !ok { - break - } - if ok { - if loc > 0 && pattern[loc-1] == escapeCharacter { - break + process := func(s string) (string, bool) { + var buf strings.Builder + for i := 0; i < len(s); i++ { + c := s[i] + if c == escapeCharacter && i+1 < len(s) { + next := s[i+1] + if _, ok := wildcards[next]; ok { + buf.WriteByte(next) + i++ + continue + } } + if _, ok := wildcards[c]; ok { + return "", false + } + buf.WriteByte(c) + } + return buf.String(), true + } + + leading := pattern[0] == '%' + trailing := pattern[len(pattern)-1] == '%' + + switch { + case leading && trailing: + inner := pattern[1 : len(pattern)-1] + trimmed := strings.TrimLeft(inner, "%") + trimmed = strings.TrimRight(trimmed, "%") + if subStr, valid := process(trimmed); valid { + // if subStr is empty, it means the pattern is all %, + // return prefix match and empty operand, means all match + if len(subStr) == 0 { + return planpb.OpType_PrefixMatch, "", true + } + return planpb.OpType_InnerMatch, subStr, true + } + case leading: + trimmed := strings.TrimLeft(pattern[1:], "%") + if subStr, valid := process(trimmed); valid { + return planpb.OpType_PostfixMatch, subStr, true + } + case trailing: + trimmed := strings.TrimRight(pattern[:len(pattern)-1], "%") + if subStr, valid := process(trimmed); valid { + return planpb.OpType_PrefixMatch, subStr, true + } + default: + if subStr, valid := process(pattern); valid { + return planpb.OpType_Equal, subStr, true } } - return loc + return planpb.OpType_Invalid, "", false } // translatePatternMatch translates pattern to related op type and operand. func translatePatternMatch(pattern string) (op planpb.OpType, operand string, err error) { - l := len(pattern) - loc := findLastNotOfWildcards(pattern) - - if loc < 0 { - // always match. 
- return planpb.OpType_PrefixMatch, "", nil - } - - newPattern, exist := hasWildcards(pattern[:loc+1]) - if loc >= l-1 && !exist { - // equal match. - return planpb.OpType_Equal, newPattern, nil - } - if !exist { - // prefix match. - return planpb.OpType_PrefixMatch, newPattern, nil + op, operand, ok := optimizeLikePattern(pattern) + if ok { + return op, operand, nil } return planpb.OpType_Match, pattern, nil diff --git a/internal/parser/planparserv2/pattern_match_test.go b/internal/parser/planparserv2/pattern_match_test.go index d39a4b6bca..a75847e91a 100644 --- a/internal/parser/planparserv2/pattern_match_test.go +++ b/internal/parser/planparserv2/pattern_match_test.go @@ -6,92 +6,6 @@ import ( "github.com/milvus-io/milvus/pkg/v2/proto/planpb" ) -func Test_hasWildcards(t *testing.T) { - type args struct { - pattern string - } - tests := []struct { - name string - args args - want bool - target string - }{ - { - args: args{ - pattern: "no-wildcards", - }, - want: false, - target: "no-wildcards", - }, - { - args: args{ - pattern: "has\\%", - }, - want: false, - target: "has%", - }, - { - args: args{ - pattern: "%", - }, - want: true, - target: "%", - }, - { - args: args{ - pattern: "has%", - }, - want: true, - target: "has%", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - patten, got := hasWildcards(tt.args.pattern) - if got != tt.want || patten != tt.target { - t.Errorf("hasWildcards(%s) = %v, want %v", tt.args.pattern, got, tt.want) - } - }) - } -} - -func Test_findLocOfLastWildcard(t *testing.T) { - type args struct { - pattern string - } - tests := []struct { - name string - args args - want int - }{ - { - args: args{ - pattern: "no-wildcards", - }, - want: 11, - }, - { - args: args{ - pattern: "only\\%", - }, - want: 5, - }, - { - args: args{ - pattern: "prefix%%", - }, - want: 5, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := findLastNotOfWildcards(tt.args.pattern); got != tt.want { - 
t.Errorf("findLastNotOfWildcards(%s) = %v, want %v", tt.args.pattern, got, tt.want) - } - }) - } -} - func Test_translatePatternMatch(t *testing.T) { type args struct { pattern string @@ -150,3 +64,60 @@ func Test_translatePatternMatch(t *testing.T) { }) } } + +func TestOptimizeLikePattern(t *testing.T) { + tests := []struct { + pattern string + expectedType planpb.OpType + expectedStr string + expectedOk bool + }{ + // inner match + {"%abc%", planpb.OpType_InnerMatch, "abc", true}, + {"%a\\%b%", planpb.OpType_InnerMatch, "a%b", true}, + {"%a\\_b%", planpb.OpType_InnerMatch, "a_b", true}, + {"%a\\\\%", planpb.OpType_InnerMatch, "a\\\\", true}, + {"%a\t%", planpb.OpType_InnerMatch, "a\t", true}, + {"%", planpb.OpType_PrefixMatch, "", true}, + {"%%", planpb.OpType_PrefixMatch, "", true}, + {"%a%b%", planpb.OpType_Invalid, "", false}, + {"%a_b%", planpb.OpType_Invalid, "", false}, + {"%abc\\", planpb.OpType_PostfixMatch, "abc\\", true}, + {"%核心%", planpb.OpType_InnerMatch, "核心", true}, + {"%核%", planpb.OpType_InnerMatch, "核", true}, + {"%\u6838%", planpb.OpType_InnerMatch, "核", true}, + {"%\u6838%", planpb.OpType_InnerMatch, "\u6838", true}, + + // prefix match + {"abc%", planpb.OpType_PrefixMatch, "abc", true}, + {"a\\%bc%", planpb.OpType_PrefixMatch, "a%bc", true}, + {"a\\_bc%", planpb.OpType_PrefixMatch, "a_bc", true}, + {"_abc%", planpb.OpType_Invalid, "", false}, + + // postfix match + {"%abc", planpb.OpType_PostfixMatch, "abc", true}, + {"%a\\_bc", planpb.OpType_PostfixMatch, "a_bc", true}, + {"%abc_", planpb.OpType_Invalid, "", false}, + {"%臥蜜", planpb.OpType_PostfixMatch, "臥蜜", true}, + {"%%臥蜜", planpb.OpType_PostfixMatch, "臥蜜", true}, + {"%\u81e5\u871c", planpb.OpType_PostfixMatch, "臥蜜", true}, + + // equal match + {"abc", planpb.OpType_Equal, "abc", true}, + {"a\\%bc", planpb.OpType_Equal, "a%bc", true}, + {"a\\_bc", planpb.OpType_Equal, "a_bc", true}, + {"abc_", planpb.OpType_Invalid, "", false}, + + // null pattern + {"", planpb.OpType_Equal, "", true}, + } 
+ + for _, test := range tests { + actualType, actualStr, actualOk := optimizeLikePattern(test.pattern) + if actualType != test.expectedType || actualStr != test.expectedStr || actualOk != test.expectedOk { + t.Errorf("optimizeLikePattern(%q) = (%q, %q, %v), expected (%q, %q, %v)", + test.pattern, actualType, actualStr, actualOk, + test.expectedType, test.expectedStr, test.expectedOk) + } + } +} diff --git a/pkg/proto/plan.proto b/pkg/proto/plan.proto index 33e05ec510..0228beb961 100644 --- a/pkg/proto/plan.proto +++ b/pkg/proto/plan.proto @@ -20,6 +20,7 @@ enum OpType { NotIn = 12; TextMatch = 13; // text match PhraseMatch = 14; // phrase match + InnerMatch = 15; // substring (e.g., "%value%") }; enum ArithOpType { diff --git a/pkg/proto/planpb/plan.pb.go b/pkg/proto/planpb/plan.pb.go index 2c4dd9f308..eba324a557 100644 --- a/pkg/proto/planpb/plan.pb.go +++ b/pkg/proto/planpb/plan.pb.go @@ -39,6 +39,7 @@ const ( OpType_NotIn OpType = 12 OpType_TextMatch OpType = 13 // text match OpType_PhraseMatch OpType = 14 // phrase match + OpType_InnerMatch OpType = 15 // substring (e.g., "%value%") ) // Enum value maps for OpType. 
@@ -59,6 +60,7 @@ var ( 12: "NotIn", 13: "TextMatch", 14: "PhraseMatch", + 15: "InnerMatch", } OpType_value = map[string]int32{ "Invalid": 0, @@ -76,6 +78,7 @@ var ( "NotIn": 12, "TextMatch": 13, "PhraseMatch": 14, + "InnerMatch": 15, } ) @@ -2962,7 +2965,7 @@ var file_plan_proto_rawDesc = []byte{ 0x75, 0x74, 0x70, 0x75, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x49, 0x64, 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x64, 0x79, 0x6e, 0x61, 0x6d, 0x69, 0x63, 0x5f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0d, 0x64, 0x79, 0x6e, 0x61, 0x6d, 0x69, 0x63, 0x46, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x42, 0x06, 0x0a, 0x04, 0x6e, 0x6f, 0x64, 0x65, 0x2a, 0xda, 0x01, 0x0a, + 0x65, 0x6c, 0x64, 0x73, 0x42, 0x06, 0x0a, 0x04, 0x6e, 0x6f, 0x64, 0x65, 0x2a, 0xea, 0x01, 0x0a, 0x06, 0x4f, 0x70, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x49, 0x6e, 0x76, 0x61, 0x6c, 0x69, 0x64, 0x10, 0x00, 0x12, 0x0f, 0x0a, 0x0b, 0x47, 0x72, 0x65, 0x61, 0x74, 0x65, 0x72, 0x54, 0x68, 0x61, 0x6e, 0x10, 0x01, 0x12, 0x10, 0x0a, 0x0c, 0x47, 0x72, 0x65, 0x61, 0x74, 0x65, 0x72, @@ -2976,7 +2979,8 @@ var file_plan_proto_rawDesc = []byte{ 0x61, 0x6e, 0x67, 0x65, 0x10, 0x0a, 0x12, 0x06, 0x0a, 0x02, 0x49, 0x6e, 0x10, 0x0b, 0x12, 0x09, 0x0a, 0x05, 0x4e, 0x6f, 0x74, 0x49, 0x6e, 0x10, 0x0c, 0x12, 0x0d, 0x0a, 0x09, 0x54, 0x65, 0x78, 0x74, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x10, 0x0d, 0x12, 0x0f, 0x0a, 0x0b, 0x50, 0x68, 0x72, 0x61, - 0x73, 0x65, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x10, 0x0e, 0x2a, 0x58, 0x0a, 0x0b, 0x41, 0x72, 0x69, + 0x73, 0x65, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x10, 0x0e, 0x12, 0x0e, 0x0a, 0x0a, 0x49, 0x6e, 0x6e, + 0x65, 0x72, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x10, 0x0f, 0x2a, 0x58, 0x0a, 0x0b, 0x41, 0x72, 0x69, 0x74, 0x68, 0x4f, 0x70, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x10, 0x00, 0x12, 0x07, 0x0a, 0x03, 0x41, 0x64, 0x64, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x75, 0x62, 0x10, 0x02, 0x12, 0x07, 0x0a, 0x03, 0x4d, 0x75, 0x6c, 0x10, 
0x03, diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py index 9c72a64d99..2eb669190a 100644 --- a/tests/python_client/testcases/test_query.py +++ b/tests/python_client/testcases/test_query.py @@ -2764,6 +2764,38 @@ class TestQueryString(TestcaseBase): output_fields = [default_int_field_name, default_float_field_name, default_string_field_name] collection_w.query(expression, output_fields=output_fields, check_task=CheckTasks.check_query_results, check_items={exp_res: res}) + + @pytest.mark.tags(CaseLabel.L1) + def test_query_string_expr_with_suffix(self): + """ + target: test query with suffix string expression + method: specify string is primary field, use suffix string expr + expected: verify query successfully + """ + collection_w, vectors = self.init_collection_general(prefix, insert_data=True, + primary_field=ct.default_string_field_name)[0:2] + expression = 'varchar like "%0"' + filtered_data = vectors[0][vectors[0][default_string_field_name].str.endswith('0')] + res = filtered_data.iloc[:, :3].to_dict('records') + output_fields = [default_int_field_name, default_float_field_name, default_string_field_name] + collection_w.query(expression, output_fields=output_fields, + check_task=CheckTasks.check_query_results, check_items={exp_res: res}) + + @pytest.mark.tags(CaseLabel.L1) + def test_query_string_expr_with_inner_match(self): + """ + target: test query with inner match (substring) string expression + method: specify string is primary field, use inner match string expr + expected: verify query successfully + """ + collection_w, vectors = self.init_collection_general(prefix, insert_data=True, + primary_field=ct.default_string_field_name)[0:2] + expression = 'varchar like "%0%"' + filtered_data = vectors[0][vectors[0][default_string_field_name].str.contains('0')] + res = filtered_data.iloc[:, :3].to_dict('records') + output_fields = [default_int_field_name, default_float_field_name, default_string_field_name] +
collection_w.query(expression, output_fields=output_fields, + check_task=CheckTasks.check_query_results, check_items={exp_res: res}) @pytest.mark.tags(CaseLabel.L1) def test_bitmap_alter_offset_cache_param(self): @@ -2800,9 +2832,10 @@ class TestQueryString(TestcaseBase): collection_w.release() @pytest.mark.tags(CaseLabel.L1) - def test_query_string_expr_with_prefixes_auto_index(self): + @pytest.mark.parametrize("expression", ['varchar like "0%"', 'varchar like "%0"','varchar like "%0%"']) + def test_query_string_expr_with_like_auto_index(self, expression): """ - target: test query with prefix string expression and indexed with auto index + target: test query with like string expression and indexed with auto index expected: verify query successfully """ collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False, @@ -2812,8 +2845,7 @@ class TestQueryString(TestcaseBase): index_name="query_expr_pre_index") collection_w.create_index("varchar", index_name="varchar_auto_index") time.sleep(1) - collection_w.load() - expression = 'varchar like "0%"' + collection_w.load() result, _ = collection_w.query(expression, output_fields=['varchar']) res_len = len(result) collection_w.release() @@ -2824,7 +2856,8 @@ class TestQueryString(TestcaseBase): assert res_len_1 == res_len @pytest.mark.tags(CaseLabel.L1) - def test_query_string_expr_with_prefixes_bitmap(self): + @pytest.mark.parametrize("expression", ['varchar like "0%"', 'varchar like "%0"','varchar like "%0%"']) + def test_query_string_expr_with_prefixes_bitmap(self, expression): """ target: test query with prefix string expression and indexed with bitmap expected: verify query successfully @@ -2837,7 +2870,6 @@ class TestQueryString(TestcaseBase): collection_w.create_index("varchar", index_name="bitmap_auto_index", index_params={"index_type": "BITMAP"}) time.sleep(1) collection_w.load() - expression = 'varchar like "0%"' result, _ = collection_w.query(expression, 
output_fields=['varchar']) res_len = len(result) collection_w.release() @@ -2848,7 +2880,8 @@ class TestQueryString(TestcaseBase): assert res_len_1 == res_len @pytest.mark.tags(CaseLabel.L1) - def test_query_string_expr_with_match_auto_index(self): + @pytest.mark.parametrize("expression", ['varchar like "0%"', 'varchar like "%0"','varchar like "%0%"']) + def test_query_string_expr_with_match_auto_index(self, expression): """ target: test query with match string expression and indexed with auto index expected: verify query successfully @@ -2861,7 +2894,6 @@ class TestQueryString(TestcaseBase): collection_w.create_index("varchar", index_name="varchar_auto_index") time.sleep(1) collection_w.load() - expression = 'varchar like "%0%"' result, _ = collection_w.query(expression, output_fields=['varchar']) res_len = len(result) collection_w.release() @@ -3165,6 +3197,80 @@ class TestQueryArray(TestcaseBase): for i in range(len(res)): assert res[i]["id"] == ground_truth[i] + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("use_index", [True, False]) + @pytest.mark.parametrize("index_type", ["INVERTED", "BITMAP"]) + def test_query_array_with_prefix_like(self, use_index, index_type): + # 1. create a collection + schema = cf.gen_array_collection_schema() + collection_w = self.init_collection_wrap(schema=schema) + + # 2. insert data + string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)] + data = cf.gen_array_dataframe_data() + data[ct.default_string_array_field_name] = string_field_value + collection_w.insert(data) + collection_w.create_index(ct.default_float_vec_field_name, {}) + if use_index: + collection_w.create_index(ct.default_string_array_field_name, {"index_type": index_type}) + + # 3. 
query + collection_w.load() + expression = 'string_array[0] like "0%"' + res = collection_w.query(limit=ct.default_nb, expr=expression)[0] + log.info(res) + filter_data = [row for row in string_field_value if row[0].startswith('0')] + assert len(res) == len(filter_data) + + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("use_index", [True, False]) + @pytest.mark.parametrize("index_type", ["INVERTED", "BITMAP"]) + def test_query_array_with_suffix_like(self, use_index, index_type): + # 1. create a collection + schema = cf.gen_array_collection_schema() + collection_w = self.init_collection_wrap(schema=schema) + + # 2. insert data + string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)] + data = cf.gen_array_dataframe_data() + data[ct.default_string_array_field_name] = string_field_value + collection_w.insert(data) + collection_w.create_index(ct.default_float_vec_field_name, {}) + if use_index: + collection_w.create_index(ct.default_string_array_field_name, {"index_type": index_type}) + + # 3. query + collection_w.load() + expression = 'string_array[0] like "%0"' + res = collection_w.query(limit=ct.default_nb, expr=expression)[0] + log.info(res) + filter_data = [row for row in string_field_value if row[0].endswith('0')] + assert len(res) == len(filter_data) + + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("use_index", [True, False]) + @pytest.mark.parametrize("index_type", ["INVERTED", "BITMAP"]) + def test_query_array_with_inner_like(self, use_index, index_type): + # 1. create a collection + schema = cf.gen_array_collection_schema() + collection_w = self.init_collection_wrap(schema=schema) + + # 2. 
insert data + string_field_value = [[str(j) for j in range(i, i + 3)] for i in range(ct.default_nb)] + data = cf.gen_array_dataframe_data() + data[ct.default_string_array_field_name] = string_field_value + collection_w.insert(data) + collection_w.create_index(ct.default_float_vec_field_name, {}) + if use_index: + collection_w.create_index(ct.default_string_array_field_name, {"index_type": index_type}) + + # 3. query + collection_w.load() + expression = 'string_array[0] like "%0%"' + res = collection_w.query(limit=ct.default_nb, expr=expression)[0] + log.info(res) + filter_data = [row for row in string_field_value if '0' in row[0]] + assert len(res) == len(filter_data) class TestQueryCount(TestcaseBase): """