diff --git a/internal/core/src/common/RegexQuery.cpp b/internal/core/src/common/RegexQuery.cpp index 9fe99022de..7d14cbf00c 100644 --- a/internal/core/src/common/RegexQuery.cpp +++ b/internal/core/src/common/RegexQuery.cpp @@ -60,4 +60,28 @@ translate_pattern_match_to_regex(const std::string& pattern) { } return r; } + +std::string +extract_fixed_prefix_from_pattern(const std::string& pattern) { + std::string prefix; + prefix.reserve(pattern.size()); + bool escape_mode = false; + + for (char c : pattern) { + if (escape_mode) { + prefix += c; + escape_mode = false; + } else { + if (c == '\\') { + escape_mode = true; + } else if (c == '%' || c == '_') { + break; // stop at first wildcard + } else { + prefix += c; + } + } + } + return prefix; +} + } // namespace milvus diff --git a/internal/core/src/common/RegexQuery.h b/internal/core/src/common/RegexQuery.h index efce0a82d8..9de12eef61 100644 --- a/internal/core/src/common/RegexQuery.h +++ b/internal/core/src/common/RegexQuery.h @@ -73,4 +73,10 @@ inline bool RegexMatcher::operator()(const std::string_view& operand) { return boost::regex_match(operand.begin(), operand.end(), r_); } + +// Extract fixed prefix from LIKE pattern (before first % or _) +// Examples: "abc%def" -> "abc", "ab_cd%" -> "ab", "%abc" -> "" +std::string +extract_fixed_prefix_from_pattern(const std::string& pattern); + } // namespace milvus diff --git a/internal/core/src/common/RegexQueryUtilTest.cpp b/internal/core/src/common/RegexQueryUtilTest.cpp index f087600a5b..a5dcfa030c 100644 --- a/internal/core/src/common/RegexQueryUtilTest.cpp +++ b/internal/core/src/common/RegexQueryUtilTest.cpp @@ -151,3 +151,79 @@ TEST(RegexMatcherTest, PatternMatchWithNewLine) { EXPECT_TRUE(matcher(std::string("Hello\n"))); } + +// ============== extract_fixed_prefix_from_pattern Tests ============== + +TEST(ExtractFixedPrefixTest, SimplePrefix) { + using namespace milvus; + // Pattern "abc%" -> prefix "abc" + EXPECT_EQ(extract_fixed_prefix_from_pattern("abc%"), "abc"); + // Pattern "abc%def" -> prefix "abc" + EXPECT_EQ(extract_fixed_prefix_from_pattern("abc%def"), "abc"); + // Pattern "hello%world%" -> prefix "hello" + EXPECT_EQ(extract_fixed_prefix_from_pattern("hello%world%"), "hello"); +} + +TEST(ExtractFixedPrefixTest, UnderscoreWildcard) { + using namespace milvus; + // Pattern "a_c" -> prefix "a" (stops at _) + EXPECT_EQ(extract_fixed_prefix_from_pattern("a_c"), "a"); + // Pattern "ab_cd%" -> prefix "ab" + EXPECT_EQ(extract_fixed_prefix_from_pattern("ab_cd%"), "ab"); + // Pattern "_abc" -> prefix "" (starts with _) + EXPECT_EQ(extract_fixed_prefix_from_pattern("_abc"), ""); +} + +TEST(ExtractFixedPrefixTest, NoPrefix) { + using namespace milvus; + // Pattern "%abc" -> prefix "" + EXPECT_EQ(extract_fixed_prefix_from_pattern("%abc"), ""); + // Pattern "%abc%" -> prefix "" + EXPECT_EQ(extract_fixed_prefix_from_pattern("%abc%"), ""); + // Pattern "%" -> prefix "" + EXPECT_EQ(extract_fixed_prefix_from_pattern("%"), ""); + // Pattern "_" -> prefix "" + EXPECT_EQ(extract_fixed_prefix_from_pattern("_"), ""); +} + +TEST(ExtractFixedPrefixTest, EscapedPercent) { + using namespace milvus; + // Pattern "100\%" -> prefix "100%" (escaped % is literal) + EXPECT_EQ(extract_fixed_prefix_from_pattern("100\\%"), "100%"); + // Pattern "a\%b%" -> prefix "a%b" + EXPECT_EQ(extract_fixed_prefix_from_pattern("a\\%b%"), "a%b"); + // Pattern "100\%\%" -> prefix "100%%" + EXPECT_EQ(extract_fixed_prefix_from_pattern("100\\%\\%"), "100%%"); +} + +TEST(ExtractFixedPrefixTest, EscapedUnderscore) { + using namespace milvus; + // Pattern "a\_b" -> prefix "a_b" (escaped _ is literal) + EXPECT_EQ(extract_fixed_prefix_from_pattern("a\\_b"), "a_b"); + // Pattern "a\_b%" -> prefix "a_b" + EXPECT_EQ(extract_fixed_prefix_from_pattern("a\\_b%"), "a_b"); + // Pattern "a\_b_c" -> prefix "a_b" (stops at unescaped _) + EXPECT_EQ(extract_fixed_prefix_from_pattern("a\\_b_c"), "a_b"); +} + +TEST(ExtractFixedPrefixTest, MixedEscape) { + using namespace milvus; + // Pattern "10\%\_off%" -> prefix "10%_off" + EXPECT_EQ(extract_fixed_prefix_from_pattern("10\\%\\_off%"), "10%_off"); + // Pattern "a\%b\_c%d" -> prefix "a%b_c" + EXPECT_EQ(extract_fixed_prefix_from_pattern("a\\%b\\_c%d"), "a%b_c"); +} + +TEST(ExtractFixedPrefixTest, NoWildcard) { + using namespace milvus; + // Pattern "abc" -> prefix "abc" (no wildcard) + EXPECT_EQ(extract_fixed_prefix_from_pattern("abc"), "abc"); + // Pattern "hello world" -> prefix "hello world" + EXPECT_EQ(extract_fixed_prefix_from_pattern("hello world"), "hello world"); +} + +TEST(ExtractFixedPrefixTest, EmptyPattern) { + using namespace milvus; + // Empty pattern -> empty prefix + EXPECT_EQ(extract_fixed_prefix_from_pattern(""), ""); +} diff --git a/internal/core/src/index/StringIndexSort.cpp b/internal/core/src/index/StringIndexSort.cpp index 00b32dd3f1..b0de62ef96 100644 --- a/internal/core/src/index/StringIndexSort.cpp +++ b/internal/core/src/index/StringIndexSort.cpp @@ -25,6 +25,7 @@ #include #include "storage/FileWriter.h" #include "common/CDataType.h" +#include "common/RegexQuery.h" #include "knowhere/log.h" #include "index/Meta.h" #include "common/Utils.h" @@ -389,6 +390,29 @@ StringIndexSort::PrefixMatch(const std::string_view prefix) { return impl_->PrefixMatch(prefix, total_num_rows_); } +const TargetBitmap +StringIndexSort::PatternMatch(const std::string& pattern, + proto::plan::OpType op) { + assert(impl_ != nullptr); + + if (op == proto::plan::OpType::PrefixMatch) { + return PrefixMatch(pattern); + } + + // Support Match, PostfixMatch, InnerMatch + // All can benefit from unique value deduplication + if (op != proto::plan::OpType::Match && + op != proto::plan::OpType::PostfixMatch && + op != proto::plan::OpType::InnerMatch) { + ThrowInfo(Unsupported, + "StringIndexSort::PatternMatch only supports Match, " + "PrefixMatch, PostfixMatch, InnerMatch, got op: {}", + static_cast(op)); + } + + return impl_->PatternMatch(pattern, op, total_num_rows_); +} + std::optional StringIndexSort::Reverse_Lookup(size_t offset) const { assert(impl_ != nullptr); @@ -822,20 +846,122 @@ StringIndexSortMemoryImpl::PrefixMatch(const std::string_view prefix, size_t total_num_rows) { TargetBitmap bitset(total_num_rows, false); - auto it = std::lower_bound( - unique_values_.begin(), unique_values_.end(), std::string(prefix)); + // Use FindPrefixRange for O(log n) lookup of both start and end + auto [start_idx, end_idx] = FindPrefixRange(std::string(prefix)); - size_t idx = std::distance(unique_values_.begin(), it); - - while (idx < unique_values_.size()) { - if (!milvus::PrefixMatch(unique_values_[idx], prefix)) { - break; - } + for (size_t idx = start_idx; idx < end_idx; ++idx) { const auto& posting_list = posting_lists_[idx]; for (uint32_t row_id : posting_list) { bitset[row_id] = true; } - ++idx; + } + + return bitset; +} + +std::pair +StringIndexSortMemoryImpl::FindPrefixRange(const std::string& prefix) const { + if (prefix.empty()) { + return {0, unique_values_.size()}; + } + + // Binary search for start: first value >= prefix + auto start_it = + std::lower_bound(unique_values_.begin(), unique_values_.end(), prefix); + size_t start_idx = std::distance(unique_values_.begin(), start_it); + + // Compute "next prefix" for end boundary: "abc" -> "abd" + // Range is [prefix, next_prefix), all strings starting with prefix + std::string next_prefix = prefix; + bool has_next = false; + // Find rightmost char that can be incremented (not 0xFF) + for (int i = next_prefix.size() - 1; i >= 0; --i) { + if (static_cast(next_prefix[i]) < 255) { + ++next_prefix[i]; + next_prefix.resize(i + 1); + has_next = true; + break; + } + } + + size_t end_idx; + if (has_next) { + // Binary search for end: first value >= next_prefix + auto end_it = std::lower_bound( + unique_values_.begin(), unique_values_.end(), next_prefix); + end_idx = std::distance(unique_values_.begin(), end_it); + } else { + // All chars are 0xFF, no upper bound + end_idx = unique_values_.size(); + } + + return {start_idx, end_idx}; +} + +bool +StringIndexSortMemoryImpl::MatchValue(const std::string& value, + const std::string& pattern, + proto::plan::OpType op) const { + switch (op) { + case proto::plan::OpType::PostfixMatch: + // Suffix match: value ends with pattern + if (pattern.size() > value.size()) { + return false; + } + return value.compare(value.size() - pattern.size(), + pattern.size(), + pattern) == 0; + case proto::plan::OpType::InnerMatch: + // Contains match: value contains pattern + return value.find(pattern) != std::string::npos; + default: + // For Match op, use regex (handled separately) + return false; + } +} + +const TargetBitmap +StringIndexSortMemoryImpl::PatternMatch(const std::string& pattern, + proto::plan::OpType op, + size_t total_num_rows) { + TargetBitmap bitset(total_num_rows, false); + + // For PostfixMatch and InnerMatch, no prefix optimization possible + // Still benefits from unique value deduplication + if (op == proto::plan::OpType::PostfixMatch || + op == proto::plan::OpType::InnerMatch) { + // Iterate over all unique values + for (size_t idx = 0; idx < unique_values_.size(); ++idx) { + if (MatchValue(unique_values_[idx], pattern, op)) { + const auto& posting_list = posting_lists_[idx]; + for (uint32_t row_id : posting_list) { + bitset[row_id] = true; + } + } + } + return bitset; + } + + // For Match op, use prefix optimization + regex + std::string prefix = extract_fixed_prefix_from_pattern(pattern); + + // Find the range of unique values to check + auto [start_idx, end_idx] = FindPrefixRange(prefix); + + // Build regex matcher + PatternMatchTranslator translator; + auto regex_pattern = translator(pattern); + RegexMatcher matcher(regex_pattern); + + // Iterate over unique values in range (each value checked only once) + for (size_t idx = start_idx; idx < end_idx; ++idx) { + if (matcher(unique_values_[idx])) { + // Match found, set all row IDs in posting list + const auto& posting_list = posting_lists_[idx]; + for (uint32_t row_id : posting_list) { + bitset[row_id] = true; + } + } } return bitset; @@ -1149,23 +1275,113 @@ StringIndexSortMmapImpl::PrefixMatch(const std::string_view prefix, size_t total_num_rows) { TargetBitmap bitset(total_num_rows, false); - // Find the first string that is >= prefix - size_t idx = LowerBound(prefix); + // Use FindPrefixRange for O(log n) lookup of both start and end + auto [start_idx, end_idx] = FindPrefixRange(std::string(prefix)); - while (idx < unique_count_) { + for (size_t idx = start_idx; idx < end_idx; ++idx) { MmapEntry entry = GetEntry(idx); - std::string_view entry_sv = entry.get_string_view(); - - if (entry_sv.size() < prefix.size() || - entry_sv.substr(0, prefix.size()) != prefix) { - break; - } - - // Add all row_ids for this matching string entry.for_each_row_id( [&bitset](uint32_t row_id) { bitset.set(row_id); }); + } - ++idx; + return bitset; +} + +std::pair +StringIndexSortMmapImpl::FindPrefixRange(const std::string& prefix) const { + if (prefix.empty()) { + return {0, unique_count_}; + } + + // Binary search for start + size_t start_idx = LowerBound(prefix); + + // Compute "next prefix" for end boundary: "abc" -> "abd" + std::string next_prefix = prefix; + bool has_next = false; + for (int i = next_prefix.size() - 1; i >= 0; --i) { + if (static_cast(next_prefix[i]) < 255) { + ++next_prefix[i]; + next_prefix.resize(i + 1); + has_next = true; + break; + } + } + + size_t end_idx; + if (has_next) { + end_idx = LowerBound(next_prefix); + } else { + end_idx = unique_count_; + } + + return {start_idx, end_idx}; +} + +bool +StringIndexSortMmapImpl::MatchValue(const std::string& value, + const std::string& pattern, + proto::plan::OpType op) const { + switch (op) { + case proto::plan::OpType::PostfixMatch: + // Suffix match: value ends with pattern + if (pattern.size() > value.size()) { + return false; + } + return value.compare(value.size() - pattern.size(), + pattern.size(), + pattern) == 0; + case proto::plan::OpType::InnerMatch: + // Contains match: value contains pattern + return value.find(pattern) != std::string::npos; + default: + return false; + } +} + +const TargetBitmap +StringIndexSortMmapImpl::PatternMatch(const std::string& pattern, + proto::plan::OpType op, + size_t total_num_rows) { + TargetBitmap bitset(total_num_rows, false); + + // For PostfixMatch and InnerMatch, no prefix optimization possible + // Still benefits from unique value deduplication + if (op == proto::plan::OpType::PostfixMatch || + op == proto::plan::OpType::InnerMatch) { + for (size_t idx = 0; idx < unique_count_; ++idx) { + MmapEntry entry = GetEntry(idx); + std::string_view sv = entry.get_string_view(); + + if (MatchValue(std::string(sv), pattern, op)) { + entry.for_each_row_id( + [&bitset](uint32_t row_id) { bitset.set(row_id); }); + } + } + return bitset; + } + + // For Match op, use prefix optimization + regex + std::string prefix = extract_fixed_prefix_from_pattern(pattern); + + // Find the range of unique values to check + auto [start_idx, end_idx] = FindPrefixRange(prefix); + + // Build regex matcher + PatternMatchTranslator translator; + auto regex_pattern = translator(pattern); + RegexMatcher matcher(regex_pattern); + + // Iterate over unique values in range (each value checked only once) + for (size_t idx = start_idx; idx < end_idx; ++idx) { + MmapEntry entry = GetEntry(idx); + std::string_view sv = entry.get_string_view(); + + if (matcher(sv)) { + // Match found, set all row IDs in posting list + entry.for_each_row_id( + [&bitset](uint32_t row_id) { bitset.set(row_id); }); + } } return bitset; diff --git a/internal/core/src/index/StringIndexSort.h b/internal/core/src/index/StringIndexSort.h index 998aadfcd5..7c9c62a23c 100644 --- a/internal/core/src/index/StringIndexSort.h +++ b/internal/core/src/index/StringIndexSort.h @@ -117,6 +117,14 @@ class StringIndexSort : public StringIndex { const TargetBitmap PrefixMatch(const std::string_view prefix) override; + bool + SupportPatternMatch() const override { + return true; + } + + const TargetBitmap + PatternMatch(const std::string& pattern, proto::plan::OpType op) override; + std::optional Reverse_Lookup(size_t offset) const override; @@ -196,6 +204,11 @@ class StringIndexSortImpl { virtual const TargetBitmap PrefixMatch(const std::string_view prefix, size_t total_num_rows) = 0; + virtual const TargetBitmap + PatternMatch(const std::string& pattern, + proto::plan::OpType op, + size_t total_num_rows) = 0; + virtual std::optional Reverse_Lookup(size_t offset, size_t total_num_rows, @@ -273,6 +286,11 @@ class StringIndexSortMemoryImpl : public StringIndexSortImpl { const TargetBitmap PrefixMatch(const std::string_view prefix, size_t total_num_rows) override; + const TargetBitmap + PatternMatch(const std::string& pattern, + proto::plan::OpType op, + size_t total_num_rows) override; + std::optional Reverse_Lookup(size_t offset, size_t total_num_rows, @@ -290,6 +308,16 @@ class StringIndexSortMemoryImpl : public StringIndexSortImpl { size_t FindValueIndex(const std::string& value) const; + // Helper to find the range of unique values that start with a prefix + std::pair + FindPrefixRange(const std::string& prefix) const; + + // Check if value matches pattern based on op type + bool + MatchValue(const std::string& value, + const std::string& pattern, + proto::plan::OpType op) const; + void BuildFromMap(std::map&& unique_map, size_t total_num_rows, @@ -390,6 +418,11 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl { const TargetBitmap PrefixMatch(const std::string_view prefix, size_t total_num_rows) override; + const TargetBitmap + PatternMatch(const std::string& pattern, + proto::plan::OpType op, + size_t total_num_rows) override; + std::optional Reverse_Lookup(size_t offset, size_t total_num_rows, @@ -407,6 +440,12 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl { size_t FindValueIndex(const std::string& value) const; + // Check if value matches pattern based on op type + bool + MatchValue(const std::string& value, + const std::string& pattern, + proto::plan::OpType op) const; + // Binary search helpers size_t LowerBound(const std::string_view& value) const; @@ -414,6 +453,10 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl { size_t UpperBound(const std::string_view& value) const; + // Find the range [start, end) of unique values that start with a prefix + std::pair + FindPrefixRange(const std::string& prefix) const; + MmapEntry GetEntry(size_t idx) const { const uint8_t* str_ptr = string_data_start_ + string_offsets_[idx]; diff --git a/internal/core/src/index/StringIndexSortTest.cpp b/internal/core/src/index/StringIndexSortTest.cpp index d280536666..4672037063 100644 --- a/internal/core/src/index/StringIndexSortTest.cpp +++ b/internal/core/src/index/StringIndexSortTest.cpp @@ -3,6 +3,7 @@ #include "index/StringIndexSort.h" #include "index/IndexFactory.h" +#include "pb/plan.pb.h" #include "test_utils/indexbuilder_test_utils.h" constexpr int64_t nb = 100; @@ -605,3 +606,391 @@ TEST(StringIndexSortStandaloneTest, StringIndexSortSerialization) { ASSERT_GT(bitset.count(), 0); } } + +// ============== PatternMatch Tests ============== + +using milvus::proto::plan::OpType; + +TEST(StringIndexSortPatternMatchTest, PatternMatchBasicMemory) { + std::vector test_data = { + "apple", // 0 + "application", // 1 + "apply", // 2 + "banana", // 3 + "band", // 4 + "cat", // 5 + "category", // 6 + "dog", // 7 + "application", // 8 (duplicate) + "apple" // 9 (duplicate) + }; + + milvus::Config config; + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // Test pattern "app%" - should match apple, application, apply + { + auto bitset = index->PatternMatch("app%", OpType::Match); + ASSERT_EQ(bitset.count(), 5); // apple(2), application(2), apply(1) + ASSERT_TRUE(bitset[0]); // apple + ASSERT_TRUE(bitset[1]); // application + ASSERT_TRUE(bitset[2]); // apply + ASSERT_TRUE(bitset[8]); // application (dup) + ASSERT_TRUE(bitset[9]); // apple (dup) + } + + // Test pattern "app%ion" - should match application only + { + auto bitset = index->PatternMatch("app%ion", OpType::Match); + ASSERT_EQ(bitset.count(), 2); // application appears twice + ASSERT_TRUE(bitset[1]); // application + ASSERT_TRUE(bitset[8]); // application (dup) + } + + // Test pattern "%ana%" - should match banana + { + auto bitset = index->PatternMatch("%ana%", OpType::Match); + ASSERT_EQ(bitset.count(), 1); + ASSERT_TRUE(bitset[3]); // banana + } + + // Test pattern "cat%" - should match cat, category + { + auto bitset = index->PatternMatch("cat%", OpType::Match); + ASSERT_EQ(bitset.count(), 2); + ASSERT_TRUE(bitset[5]); // cat + ASSERT_TRUE(bitset[6]); // category + } +} + +TEST(StringIndexSortPatternMatchTest, PatternMatchWithUnderscoreMemory) { + std::vector test_data = { + "abc", // 0 + "aXc", // 1 + "a1c", // 2 + "abcd", // 3 + "ac", // 4 + "abbc", // 5 + }; + + milvus::Config config; + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // Test pattern "a_c" - matches any single character between a and c + { + auto bitset = index->PatternMatch("a_c", OpType::Match); + ASSERT_EQ(bitset.count(), 3); // abc, aXc, a1c + ASSERT_TRUE(bitset[0]); // abc + ASSERT_TRUE(bitset[1]); // aXc + ASSERT_TRUE(bitset[2]); // a1c + ASSERT_FALSE(bitset[3]); // abcd - too long + ASSERT_FALSE(bitset[4]); // ac - too short + ASSERT_FALSE(bitset[5]); // abbc - two chars between + } + + // Test pattern "a_c%" - prefix with underscore + { + auto bitset = index->PatternMatch("a_c%", OpType::Match); + ASSERT_EQ(bitset.count(), 4); // abc, aXc, a1c, abcd + ASSERT_TRUE(bitset[0]); // abc + ASSERT_TRUE(bitset[1]); // aXc + ASSERT_TRUE(bitset[2]); // a1c + ASSERT_TRUE(bitset[3]); // abcd + } +} + +TEST(StringIndexSortPatternMatchTest, PatternMatchEscapeMemory) { + std::vector test_data = { + "100%", // 0 - contains literal % + "100percent", // 1 + "50%off", // 2 - contains literal % + "a_b", // 3 - contains literal _ + "axb", // 4 + "a%b", // 5 - contains literal % + "ab", // 6 + }; + + milvus::Config config; + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // Test pattern "100\%" - matches literal "100%" + { + auto bitset = index->PatternMatch("100\\%", OpType::Match); + ASSERT_EQ(bitset.count(), 1); + ASSERT_TRUE(bitset[0]); // 100% + } + + // Test pattern "%\%%" - matches strings containing literal % + { + auto bitset = index->PatternMatch("%\\%%", OpType::Match); + ASSERT_EQ(bitset.count(), 3); + ASSERT_TRUE(bitset[0]); // 100% + ASSERT_TRUE(bitset[2]); // 50%off + ASSERT_TRUE(bitset[5]); // a%b + } + + // Test pattern "a\_b" - matches literal "a_b" + { + auto bitset = index->PatternMatch("a\\_b", OpType::Match); + ASSERT_EQ(bitset.count(), 1); + ASSERT_TRUE(bitset[3]); // a_b + } + + // Test pattern "a\%b" - matches literal "a%b" + { + auto bitset = index->PatternMatch("a\\%b", OpType::Match); + ASSERT_EQ(bitset.count(), 1); + ASSERT_TRUE(bitset[5]); // a%b + } +} + +TEST(StringIndexSortPatternMatchTest, PatternMatchNoPrefix) { + std::vector test_data = { + "hello world", // 0 + "world hello", // 1 + "hello", // 2 + "world", // 3 + "say hello", // 4 + }; + + milvus::Config config; + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // Test pattern "%hello" - postfix match (no fixed prefix) + { + auto bitset = index->PatternMatch("%hello", OpType::Match); + ASSERT_EQ(bitset.count(), 3); + ASSERT_TRUE(bitset[1]); // world hello + ASSERT_TRUE(bitset[2]); // hello + ASSERT_TRUE(bitset[4]); // say hello + } + + // Test pattern "%world%" - inner match (no fixed prefix) + { + auto bitset = index->PatternMatch("%world%", OpType::Match); + ASSERT_EQ(bitset.count(), 3); + ASSERT_TRUE(bitset[0]); // hello world + ASSERT_TRUE(bitset[1]); // world hello + ASSERT_TRUE(bitset[3]); // world + } +} + +TEST(StringIndexSortPatternMatchTest, PatternMatchMmap) { + std::vector test_data = { + "apple", + "application", + "apply", + "banana", + "band", + }; + + milvus::Config config; + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // Serialize and reload with mmap + auto binary_set = index->Serialize(config); + + milvus::Config mmap_config; + mmap_config[milvus::index::MMAP_FILE_PATH] = + "/tmp/test_pattern_match_mmap.idx"; + + auto mmap_index = milvus::index::CreateStringIndexSort({}); + mmap_index->Load(binary_set, mmap_config); + + // Test pattern "app%ion" + { + auto bitset = mmap_index->PatternMatch("app%ion", OpType::Match); + ASSERT_EQ(bitset.count(), 1); + ASSERT_TRUE(bitset[1]); // application + } + + // Test pattern "ban%" with underscore + { + auto bitset = mmap_index->PatternMatch("ban%", OpType::Match); + ASSERT_EQ(bitset.count(), 2); // banana, band + ASSERT_TRUE(bitset[3]); + ASSERT_TRUE(bitset[4]); + } + + std::remove("/tmp/test_pattern_match_mmap.idx"); +} + +TEST(StringIndexSortPatternMatchTest, PatternMatchComplexEscape) { + std::vector test_data = { + "10%_off", // 0 - contains both % and _ + "10%aoff", // 1 + "10%boff", // 2 + "10a_off", // 3 + "discount", // 4 + }; + + milvus::Config config; + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // Test pattern "10\%\_off" - matches literal "10%_off" + { + auto bitset = index->PatternMatch("10\\%\\_off", OpType::Match); + ASSERT_EQ(bitset.count(), 1); + ASSERT_TRUE(bitset[0]); + } + + // Test pattern "10\%_off" - matches "10%" followed by any single char and "off" + { + auto bitset = index->PatternMatch("10\\%_off", OpType::Match); + ASSERT_EQ(bitset.count(), 3); // 10%_off, 10%aoff, 10%boff + ASSERT_TRUE(bitset[0]); + ASSERT_TRUE(bitset[1]); + ASSERT_TRUE(bitset[2]); + } +} + +TEST(StringIndexSortPatternMatchTest, PatternMatchPrefixOp) { + std::vector test_data = { + "apple", + "application", + "banana", + }; + + milvus::Config config; + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // Test PrefixMatch op - should delegate to PrefixMatch + auto bitset = index->PatternMatch("app", OpType::PrefixMatch); + ASSERT_EQ(bitset.count(), 2); // apple, application + ASSERT_TRUE(bitset[0]); + ASSERT_TRUE(bitset[1]); +} + +TEST(StringIndexSortPatternMatchTest, PatternMatchDuplicateValues) { + // Test that duplicate values are handled correctly + // (each unique value should only be regex-matched once) + std::vector test_data; + for (int i = 0; i < 1000; ++i) { + test_data.push_back("repeated_value"); + } + test_data.push_back("other_value"); + + milvus::Config config; + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // Pattern that matches repeated_value + auto bitset = index->PatternMatch("repeated%", OpType::Match); + ASSERT_EQ(bitset.count(), 1000); + + // Pattern that matches other_value + auto bitset2 = index->PatternMatch("other%", OpType::Match); + ASSERT_EQ(bitset2.count(), 1); + ASSERT_TRUE(bitset2[1000]); +} + +TEST(StringIndexSortPatternMatchTest, PostfixMatch) { + std::vector test_data = { + "hello_world", // 0 - ends with "world" + "new_world", // 1 - ends with "world" + "world_peace", // 2 - ends with "peace" + "hello", // 3 + "world", // 4 - ends with "world" + }; + + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // PostfixMatch: find strings ending with "world" + auto bitset = index->PatternMatch("world", OpType::PostfixMatch); + ASSERT_EQ(bitset.count(), 3); + ASSERT_TRUE(bitset[0]); // hello_world + ASSERT_TRUE(bitset[1]); // new_world + ASSERT_FALSE(bitset[2]); // world_peace + ASSERT_FALSE(bitset[3]); // hello + ASSERT_TRUE(bitset[4]); // world + + // PostfixMatch: find strings ending with "peace" + auto bitset2 = index->PatternMatch("peace", OpType::PostfixMatch); + ASSERT_EQ(bitset2.count(), 1); + ASSERT_TRUE(bitset2[2]); // world_peace +} + +TEST(StringIndexSortPatternMatchTest, InnerMatch) { + std::vector test_data = { + "hello_world", // 0 - contains "world" + "new_world", // 1 - contains "world" + "world_peace", // 2 - contains "world" + "hello", // 3 - no "world" + "worldwide", // 4 - contains "world" + }; + + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // InnerMatch: find strings containing "world" + auto bitset = index->PatternMatch("world", OpType::InnerMatch); + ASSERT_EQ(bitset.count(), 4); + ASSERT_TRUE(bitset[0]); // hello_world + ASSERT_TRUE(bitset[1]); // new_world + ASSERT_TRUE(bitset[2]); // world_peace + ASSERT_FALSE(bitset[3]); // hello + ASSERT_TRUE(bitset[4]); // worldwide + + // InnerMatch: find strings containing "ello" + auto bitset2 = index->PatternMatch("ello", OpType::InnerMatch); + ASSERT_EQ(bitset2.count(), 2); + ASSERT_TRUE(bitset2[0]); // hello_world + ASSERT_FALSE(bitset2[1]); + ASSERT_FALSE(bitset2[2]); + ASSERT_TRUE(bitset2[3]); // hello + ASSERT_FALSE(bitset2[4]); +} + +TEST(StringIndexSortPatternMatchTest, PostfixMatchMmap) { + std::vector test_data = { + "application", // 0 - ends with "ion" + "revolution", // 1 - ends with "ion" + "apple", // 2 + }; + + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // Serialize and reload as mmap + auto binaryset = index->Serialize({}); + auto mmap_index = milvus::index::CreateStringIndexSort({}); + mmap_index->Load(binaryset, {}); + + // PostfixMatch on mmap + auto bitset = mmap_index->PatternMatch("ion", OpType::PostfixMatch); + ASSERT_EQ(bitset.count(), 2); + ASSERT_TRUE(bitset[0]); // application + ASSERT_TRUE(bitset[1]); // revolution + ASSERT_FALSE(bitset[2]); // apple +} + +TEST(StringIndexSortPatternMatchTest, InnerMatchMmap) { + std::vector test_data = { + "application", // 0 - contains "cat" + "category", // 1 - contains "cat" + "dog", // 2 - no "cat" + }; + + auto index = milvus::index::CreateStringIndexSort({}); + index->Build(test_data.size(), test_data.data()); + + // Serialize and reload as mmap + auto binaryset = index->Serialize({}); + auto mmap_index = milvus::index::CreateStringIndexSort({}); + mmap_index->Load(binaryset, {}); + + // InnerMatch on mmap + auto bitset = mmap_index->PatternMatch("cat", OpType::InnerMatch); + ASSERT_EQ(bitset.count(), 2); + ASSERT_TRUE(bitset[0]); // application + ASSERT_TRUE(bitset[1]); // category + ASSERT_FALSE(bitset[2]); // dog +}