enhance: STL_SORT to support LIKE operator (#46534)

issue: https://github.com/milvus-io/milvus/issues/44399

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

## Release Notes

* **New Features**
* Enhanced pattern matching for string indexes with support for prefix,
postfix, inner, and regex-based matching operations.
* Optimized pattern matching performance through prefix-based filtering
and range-based lookups.

* **Tests**
* Added comprehensive test coverage for pattern matching functionality
across multiple index implementations.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
This commit is contained in:
Buqian Zheng 2025-12-24 19:45:20 +08:00 committed by GitHub
parent 9ba0c4e501
commit 6ac66e38d1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 775 additions and 21 deletions

View File

@ -60,4 +60,28 @@ translate_pattern_match_to_regex(const std::string& pattern) {
}
return r;
}
// Returns the literal text a LIKE `pattern` requires at the start of every
// match: everything before the first unescaped '%' or '_'.
//
// A backslash escapes the character that follows it, so that character is
// copied into the result verbatim (this is how a literal '%' or '_' ends up
// in the prefix). A backslash with nothing after it contributes nothing.
std::string
extract_fixed_prefix_from_pattern(const std::string& pattern) {
    std::string literal_head;
    literal_head.reserve(pattern.size());

    bool escaped = false;
    for (std::size_t pos = 0; pos < pattern.size(); ++pos) {
        const char ch = pattern[pos];
        if (escaped) {
            // Previous char was a backslash: take this one literally.
            literal_head.push_back(ch);
            escaped = false;
            continue;
        }
        if (ch == '\\') {
            escaped = true;
            continue;
        }
        if (ch == '%' || ch == '_') {
            // First real wildcard ends the fixed prefix.
            break;
        }
        literal_head.push_back(ch);
    }
    return literal_head;
}
} // namespace milvus

View File

@ -73,4 +73,10 @@ inline bool
RegexMatcher::operator()(const std::string_view& operand) {
return boost::regex_match(operand.begin(), operand.end(), r_);
}
// Extract fixed prefix from LIKE pattern: the text before the first
// unescaped % or _. A backslash escapes the character after it, so an escaped
// wildcard is copied into the prefix as a literal character.
// Examples: "abc%def" -> "abc", "ab_cd%" -> "ab", "%abc" -> "",
//           "a\%b%" -> "a%b", "a\_b_c" -> "a_b"
std::string
extract_fixed_prefix_from_pattern(const std::string& pattern);
} // namespace milvus

View File

@ -151,3 +151,79 @@ TEST(RegexMatcherTest, PatternMatchWithNewLine) {
EXPECT_TRUE(matcher(std::string("Hello\n")));
}
// ============== extract_fixed_prefix_from_pattern Tests ==============
TEST(ExtractFixedPrefixTest, SimplePrefix) {
    // Literal text ahead of the first '%' is the fixed prefix; anything
    // after that wildcard is ignored.
    struct Case {
        const char* pattern;
        const char* expected;
    };
    const Case cases[] = {
        {"abc%", "abc"},
        {"abc%def", "abc"},
        {"hello%world%", "hello"},
    };
    for (const Case& c : cases) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(c.pattern),
                  c.expected);
    }
}
TEST(ExtractFixedPrefixTest, UnderscoreWildcard) {
    // An unescaped '_' is a wildcard too, so the prefix stops in front of it.
    struct Case {
        const char* pattern;
        const char* expected;
    };
    const Case cases[] = {
        {"a_c", "a"},
        {"ab_cd%", "ab"},
        {"_abc", ""},  // leading '_' leaves no fixed prefix
    };
    for (const Case& c : cases) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(c.pattern),
                  c.expected);
    }
}
TEST(ExtractFixedPrefixTest, NoPrefix) {
    // Every pattern here opens with a wildcard, so the fixed prefix is empty.
    const char* const wildcard_first[] = {"%abc", "%abc%", "%", "_"};
    for (const char* pattern : wildcard_first) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(pattern), "");
    }
}
TEST(ExtractFixedPrefixTest, EscapedPercent) {
    // "\%" is a literal percent sign and therefore part of the prefix.
    const auto prefix_of = [](const char* pattern) {
        return milvus::extract_fixed_prefix_from_pattern(pattern);
    };
    EXPECT_EQ(prefix_of("100\\%"), "100%");
    // The trailing unescaped '%' still terminates the prefix.
    EXPECT_EQ(prefix_of("a\\%b%"), "a%b");
    EXPECT_EQ(prefix_of("100\\%\\%"), "100%%");
}
TEST(ExtractFixedPrefixTest, EscapedUnderscore) {
    // "\_" is a literal underscore and therefore part of the prefix.
    const auto prefix_of = [](const char* pattern) {
        return milvus::extract_fixed_prefix_from_pattern(pattern);
    };
    EXPECT_EQ(prefix_of("a\\_b"), "a_b");
    EXPECT_EQ(prefix_of("a\\_b%"), "a_b");
    // The second, unescaped '_' is a wildcard and ends the prefix.
    EXPECT_EQ(prefix_of("a\\_b_c"), "a_b");
}
TEST(ExtractFixedPrefixTest, MixedEscape) {
    // Escaped '%' and '_' may be mixed freely; only the first unescaped
    // wildcard ends the prefix.
    struct Case {
        const char* pattern;
        const char* expected;
    };
    const Case cases[] = {
        {"10\\%\\_off%", "10%_off"},
        {"a\\%b\\_c%d", "a%b_c"},
    };
    for (const Case& c : cases) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(c.pattern),
                  c.expected);
    }
}
TEST(ExtractFixedPrefixTest, NoWildcard) {
    // Without any wildcard the whole pattern is its own fixed prefix.
    const char* const literals[] = {"abc", "hello world"};
    for (const char* literal : literals) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(literal), literal);
    }
}
TEST(ExtractFixedPrefixTest, EmptyPattern) {
    // An empty pattern has an empty fixed prefix.
    const std::string no_pattern;
    EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(no_pattern), "");
}

View File

@ -25,6 +25,7 @@
#include <filesystem>
#include "storage/FileWriter.h"
#include "common/CDataType.h"
#include "common/RegexQuery.h"
#include "knowhere/log.h"
#include "index/Meta.h"
#include "common/Utils.h"
@ -389,6 +390,29 @@ StringIndexSort::PrefixMatch(const std::string_view prefix) {
return impl_->PrefixMatch(prefix, total_num_rows_);
}
// Evaluates `pattern` under `op` against the index.
//
// PrefixMatch is answered by PrefixMatch(); Match, PostfixMatch and
// InnerMatch are forwarded to the implementation together with the total row
// count. Any other op raises an Unsupported error.
const TargetBitmap
StringIndexSort::PatternMatch(const std::string& pattern,
                              proto::plan::OpType op) {
    assert(impl_ != nullptr);

    if (op == proto::plan::OpType::PrefixMatch) {
        return PrefixMatch(pattern);
    }

    // These ops are evaluated per unique value by the implementation.
    const bool handled_by_impl = op == proto::plan::OpType::Match ||
                                 op == proto::plan::OpType::PostfixMatch ||
                                 op == proto::plan::OpType::InnerMatch;
    if (!handled_by_impl) {
        ThrowInfo(Unsupported,
                  "StringIndexSort::PatternMatch only supports Match, "
                  "PrefixMatch, PostfixMatch, InnerMatch, got op: {}",
                  static_cast<int>(op));
    }

    return impl_->PatternMatch(pattern, op, total_num_rows_);
}
std::optional<std::string>
StringIndexSort::Reverse_Lookup(size_t offset) const {
assert(impl_ != nullptr);
@ -822,20 +846,122 @@ StringIndexSortMemoryImpl::PrefixMatch(const std::string_view prefix,
size_t total_num_rows) {
TargetBitmap bitset(total_num_rows, false);
auto it = std::lower_bound(
unique_values_.begin(), unique_values_.end(), std::string(prefix));
// Use FindPrefixRange for O(log n) lookup of both start and end
auto [start_idx, end_idx] = FindPrefixRange(std::string(prefix));
size_t idx = std::distance(unique_values_.begin(), it);
while (idx < unique_values_.size()) {
if (!milvus::PrefixMatch(unique_values_[idx], prefix)) {
break;
}
for (size_t idx = start_idx; idx < end_idx; ++idx) {
const auto& posting_list = posting_lists_[idx];
for (uint32_t row_id : posting_list) {
bitset[row_id] = true;
}
++idx;
}
return bitset;
}
std::pair<size_t, size_t>
StringIndexSortMemoryImpl::FindPrefixRange(const std::string& prefix) const {
if (prefix.empty()) {
return {0, unique_values_.size()};
}
// Binary search for start: first value >= prefix
auto start_it =
std::lower_bound(unique_values_.begin(), unique_values_.end(), prefix);
size_t start_idx = std::distance(unique_values_.begin(), start_it);
// Compute "next prefix" for end boundary: "abc" -> "abd"
// Range is [prefix, next_prefix), all strings starting with prefix
std::string next_prefix = prefix;
bool has_next = false;
// Find rightmost char that can be incremented (not 0xFF)
for (int i = next_prefix.size() - 1; i >= 0; --i) {
if (static_cast<unsigned char>(next_prefix[i]) < 255) {
++next_prefix[i];
next_prefix.resize(i + 1);
has_next = true;
break;
}
}
size_t end_idx;
if (has_next) {
// Binary search for end: first value >= next_prefix
auto end_it = std::lower_bound(
unique_values_.begin(), unique_values_.end(), next_prefix);
end_idx = std::distance(unique_values_.begin(), end_it);
} else {
// All chars are 0xFF, no upper bound
end_idx = unique_values_.size();
}
return {start_idx, end_idx};
}
// Literal (non-regex) match of `value` against `pattern`.
//   PostfixMatch: true when `value` ends with `pattern`.
//   InnerMatch:   true when `value` contains `pattern`.
// Every other op, including Match, yields false; PatternMatch() handles the
// Match op with a regex instead of calling this.
bool
StringIndexSortMemoryImpl::MatchValue(const std::string& value,
                                      const std::string& pattern,
                                      proto::plan::OpType op) const {
    if (op == proto::plan::OpType::PostfixMatch) {
        const auto needle_len = pattern.size();
        // A pattern longer than the value can never be its suffix.
        return needle_len <= value.size() &&
               value.compare(
                   value.size() - needle_len, needle_len, pattern) == 0;
    }
    if (op == proto::plan::OpType::InnerMatch) {
        return value.find(pattern) != std::string::npos;
    }
    return false;
}
const TargetBitmap
StringIndexSortMemoryImpl::PatternMatch(const std::string& pattern,
proto::plan::OpType op,
size_t total_num_rows) {
TargetBitmap bitset(total_num_rows, false);
// For PostfixMatch and InnerMatch, no prefix optimization possible
// Still benefits from unique value deduplication
if (op == proto::plan::OpType::PostfixMatch ||
op == proto::plan::OpType::InnerMatch) {
// Iterate over all unique values
for (size_t idx = 0; idx < unique_values_.size(); ++idx) {
if (MatchValue(unique_values_[idx], pattern, op)) {
const auto& posting_list = posting_lists_[idx];
for (uint32_t row_id : posting_list) {
bitset[row_id] = true;
}
}
}
return bitset;
}
// For Match op, use prefix optimization + regex
std::string prefix = extract_fixed_prefix_from_pattern(pattern);
// Find the range of unique values to check
auto [start_idx, end_idx] = FindPrefixRange(prefix);
// Build regex matcher
PatternMatchTranslator translator;
auto regex_pattern = translator(pattern);
RegexMatcher matcher(regex_pattern);
// Iterate over unique values in range (each value checked only once)
for (size_t idx = start_idx; idx < end_idx; ++idx) {
if (matcher(unique_values_[idx])) {
// Match found, set all row IDs in posting list
const auto& posting_list = posting_lists_[idx];
for (uint32_t row_id : posting_list) {
bitset[row_id] = true;
}
}
}
return bitset;
@ -1149,23 +1275,113 @@ StringIndexSortMmapImpl::PrefixMatch(const std::string_view prefix,
size_t total_num_rows) {
TargetBitmap bitset(total_num_rows, false);
// Find the first string that is >= prefix
size_t idx = LowerBound(prefix);
// Use FindPrefixRange for O(log n) lookup of both start and end
auto [start_idx, end_idx] = FindPrefixRange(std::string(prefix));
while (idx < unique_count_) {
for (size_t idx = start_idx; idx < end_idx; ++idx) {
MmapEntry entry = GetEntry(idx);
std::string_view entry_sv = entry.get_string_view();
if (entry_sv.size() < prefix.size() ||
entry_sv.substr(0, prefix.size()) != prefix) {
break;
}
// Add all row_ids for this matching string
entry.for_each_row_id(
[&bitset](uint32_t row_id) { bitset.set(row_id); });
}
++idx;
return bitset;
}
std::pair<size_t, size_t>
StringIndexSortMmapImpl::FindPrefixRange(const std::string& prefix) const {
if (prefix.empty()) {
return {0, unique_count_};
}
// Binary search for start
size_t start_idx = LowerBound(prefix);
// Compute "next prefix" for end boundary: "abc" -> "abd"
std::string next_prefix = prefix;
bool has_next = false;
for (int i = next_prefix.size() - 1; i >= 0; --i) {
if (static_cast<unsigned char>(next_prefix[i]) < 255) {
++next_prefix[i];
next_prefix.resize(i + 1);
has_next = true;
break;
}
}
size_t end_idx;
if (has_next) {
end_idx = LowerBound(next_prefix);
} else {
end_idx = unique_count_;
}
return {start_idx, end_idx};
}
// Literal (non-regex) match of `value` against `pattern`.
//   PostfixMatch: true when `value` ends with `pattern`.
//   InnerMatch:   true when `value` contains `pattern`.
// Every other op yields false.
bool
StringIndexSortMmapImpl::MatchValue(const std::string& value,
                                    const std::string& pattern,
                                    proto::plan::OpType op) const {
    const bool wants_suffix = op == proto::plan::OpType::PostfixMatch;
    const bool wants_substring = op == proto::plan::OpType::InnerMatch;

    if (wants_suffix) {
        if (value.size() < pattern.size()) {
            return false;
        }
        // Compare the tail of `value` with the whole of `pattern`.
        const auto tail_start = value.size() - pattern.size();
        return 0 == value.compare(tail_start, pattern.size(), pattern);
    }
    if (wants_substring) {
        return std::string::npos != value.find(pattern);
    }
    return false;
}
const TargetBitmap
StringIndexSortMmapImpl::PatternMatch(const std::string& pattern,
proto::plan::OpType op,
size_t total_num_rows) {
TargetBitmap bitset(total_num_rows, false);
// For PostfixMatch and InnerMatch, no prefix optimization possible
// Still benefits from unique value deduplication
if (op == proto::plan::OpType::PostfixMatch ||
op == proto::plan::OpType::InnerMatch) {
for (size_t idx = 0; idx < unique_count_; ++idx) {
MmapEntry entry = GetEntry(idx);
std::string_view sv = entry.get_string_view();
if (MatchValue(std::string(sv), pattern, op)) {
entry.for_each_row_id(
[&bitset](uint32_t row_id) { bitset.set(row_id); });
}
}
return bitset;
}
// For Match op, use prefix optimization + regex
std::string prefix = extract_fixed_prefix_from_pattern(pattern);
// Find the range of unique values to check
auto [start_idx, end_idx] = FindPrefixRange(prefix);
// Build regex matcher
PatternMatchTranslator translator;
auto regex_pattern = translator(pattern);
RegexMatcher matcher(regex_pattern);
// Iterate over unique values in range (each value checked only once)
for (size_t idx = start_idx; idx < end_idx; ++idx) {
MmapEntry entry = GetEntry(idx);
std::string_view sv = entry.get_string_view();
if (matcher(sv)) {
// Match found, set all row IDs in posting list
entry.for_each_row_id(
[&bitset](uint32_t row_id) { bitset.set(row_id); });
}
}
return bitset;

View File

@ -117,6 +117,14 @@ class StringIndexSort : public StringIndex {
const TargetBitmap
PrefixMatch(const std::string_view prefix) override;
// Advertises pattern-match support for this index type: always returns true.
bool
SupportPatternMatch() const override {
return true;
}
// Evaluates `pattern` under `op`. PrefixMatch is forwarded to PrefixMatch();
// Match, PostfixMatch and InnerMatch are forwarded to the underlying
// implementation; any other op raises an Unsupported error.
const TargetBitmap
PatternMatch(const std::string& pattern, proto::plan::OpType op) override;
std::optional<std::string>
Reverse_Lookup(size_t offset) const override;
@ -196,6 +204,11 @@ class StringIndexSortImpl {
virtual const TargetBitmap
PrefixMatch(const std::string_view prefix, size_t total_num_rows) = 0;
// Implementation hook behind StringIndexSort::PatternMatch. The memory and
// mmap implementations return a bitmap of `total_num_rows` bits in which the
// rows whose value satisfies `pattern` under `op` are set.
virtual const TargetBitmap
PatternMatch(const std::string& pattern,
proto::plan::OpType op,
size_t total_num_rows) = 0;
virtual std::optional<std::string>
Reverse_Lookup(size_t offset,
size_t total_num_rows,
@ -273,6 +286,11 @@ class StringIndexSortMemoryImpl : public StringIndexSortImpl {
const TargetBitmap
PrefixMatch(const std::string_view prefix, size_t total_num_rows) override;
// PostfixMatch / InnerMatch: tests every unique value with MatchValue().
// Otherwise the pattern is treated as a LIKE pattern: its fixed prefix
// narrows the candidates via FindPrefixRange(), and each candidate is then
// checked with a regex built from the pattern. Every matching unique value
// sets all row ids of its posting list in the returned bitmap.
const TargetBitmap
PatternMatch(const std::string& pattern,
proto::plan::OpType op,
size_t total_num_rows) override;
std::optional<std::string>
Reverse_Lookup(size_t offset,
size_t total_num_rows,
@ -290,6 +308,16 @@ class StringIndexSortMemoryImpl : public StringIndexSortImpl {
size_t
FindValueIndex(const std::string& value) const;
// Helper to find the range of unique values that start with a prefix.
// Returns a half-open index range [start, end) into unique_values_, located
// by binary search. An empty prefix yields the whole range.
std::pair<size_t, size_t>
FindPrefixRange(const std::string& prefix) const;
// Check if value matches pattern based on op type.
// PostfixMatch: value ends with pattern. InnerMatch: value contains pattern.
// Every other op (including Match) returns false here; regex matching for
// Match is done by PatternMatch() itself.
bool
MatchValue(const std::string& value,
const std::string& pattern,
proto::plan::OpType op) const;
void
BuildFromMap(std::map<std::string, PostingList>&& unique_map,
size_t total_num_rows,
@ -390,6 +418,11 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl {
const TargetBitmap
PrefixMatch(const std::string_view prefix, size_t total_num_rows) override;
// PostfixMatch / InnerMatch: walks every unique entry and tests it with
// MatchValue(). Otherwise the pattern is treated as a LIKE pattern: its fixed
// prefix narrows the candidates via FindPrefixRange(), and each candidate is
// then checked with a regex built from the pattern. Every matching entry sets
// all of its row ids in the returned bitmap.
const TargetBitmap
PatternMatch(const std::string& pattern,
proto::plan::OpType op,
size_t total_num_rows) override;
std::optional<std::string>
Reverse_Lookup(size_t offset,
size_t total_num_rows,
@ -407,6 +440,12 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl {
size_t
FindValueIndex(const std::string& value) const;
// Check if value matches pattern based on op type.
// PostfixMatch: value ends with pattern. InnerMatch: value contains pattern.
// Every other op returns false.
bool
MatchValue(const std::string& value,
const std::string& pattern,
proto::plan::OpType op) const;
// Binary search helpers
size_t
LowerBound(const std::string_view& value) const;
@ -414,6 +453,10 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl {
size_t
UpperBound(const std::string_view& value) const;
// Find the range [start, end) of unique values that start with a prefix.
// Both bounds come from LowerBound(). An empty prefix yields
// [0, unique_count_); a prefix made only of 0xFF bytes has no upper bound,
// so end is unique_count_.
std::pair<size_t, size_t>
FindPrefixRange(const std::string& prefix) const;
MmapEntry
GetEntry(size_t idx) const {
const uint8_t* str_ptr = string_data_start_ + string_offsets_[idx];

View File

@ -3,6 +3,7 @@
#include "index/StringIndexSort.h"
#include "index/IndexFactory.h"
#include "pb/plan.pb.h"
#include "test_utils/indexbuilder_test_utils.h"
constexpr int64_t nb = 100;
@ -605,3 +606,391 @@ TEST(StringIndexSortStandaloneTest, StringIndexSortSerialization) {
ASSERT_GT(bitset.count(), 0);
}
}
// ============== PatternMatch Tests ==============
using milvus::proto::plan::OpType;
TEST(StringIndexSortPatternMatchTest, PatternMatchBasicMemory) {
    // Rows 8 and 9 repeat earlier values, so a matching unique value has to
    // set every row that holds it.
    std::vector<std::string> values = {
        "apple",        // 0
        "application",  // 1
        "apply",        // 2
        "banana",       // 3
        "band",         // 4
        "cat",          // 5
        "category",     // 6
        "dog",          // 7
        "application",  // 8 (duplicate)
        "apple"         // 9 (duplicate)
    };
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // "app%": apple (x2), application (x2), apply (x1)
    {
        auto hits = index->PatternMatch("app%", OpType::Match);
        ASSERT_EQ(hits.count(), 5);
        for (int row : {0, 1, 2, 8, 9}) {
            ASSERT_TRUE(hits[row]) << "row " << row;
        }
    }

    // "app%ion": only application, which is stored twice
    {
        auto hits = index->PatternMatch("app%ion", OpType::Match);
        ASSERT_EQ(hits.count(), 2);
        for (int row : {1, 8}) {
            ASSERT_TRUE(hits[row]) << "row " << row;
        }
    }

    // "%ana%": only banana
    {
        auto hits = index->PatternMatch("%ana%", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[3]);
    }

    // "cat%": cat and category
    {
        auto hits = index->PatternMatch("cat%", OpType::Match);
        ASSERT_EQ(hits.count(), 2);
        for (int row : {5, 6}) {
            ASSERT_TRUE(hits[row]) << "row " << row;
        }
    }
}
TEST(StringIndexSortPatternMatchTest, PatternMatchWithUnderscoreMemory) {
    std::vector<std::string> values = {
        "abc",   // 0
        "aXc",   // 1
        "a1c",   // 2
        "abcd",  // 3
        "ac",    // 4
        "abbc",  // 5
    };
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // "a_c": exactly one arbitrary character between 'a' and 'c'
    {
        auto hits = index->PatternMatch("a_c", OpType::Match);
        ASSERT_EQ(hits.count(), 3);
        for (int row : {0, 1, 2}) {  // abc, aXc, a1c
            ASSERT_TRUE(hits[row]) << "row " << row;
        }
        // abcd is too long, ac too short, abbc has two characters in between
        for (int row : {3, 4, 5}) {
            ASSERT_FALSE(hits[row]) << "row " << row;
        }
    }

    // "a_c%": same as above, followed by anything
    {
        auto hits = index->PatternMatch("a_c%", OpType::Match);
        ASSERT_EQ(hits.count(), 4);
        for (int row : {0, 1, 2, 3}) {  // abc, aXc, a1c, abcd
            ASSERT_TRUE(hits[row]) << "row " << row;
        }
    }
}
TEST(StringIndexSortPatternMatchTest, PatternMatchEscapeMemory) {
    std::vector<std::string> values = {
        "100%",        // 0 - contains literal %
        "100percent",  // 1
        "50%off",      // 2 - contains literal %
        "a_b",         // 3 - contains literal _
        "axb",         // 4
        "a%b",         // 5 - contains literal %
        "ab",          // 6
    };
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // "100\%": only the literal string "100%"
    {
        auto hits = index->PatternMatch("100\\%", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[0]);
    }

    // "%\%%": any string containing a literal '%'
    {
        auto hits = index->PatternMatch("%\\%%", OpType::Match);
        ASSERT_EQ(hits.count(), 3);
        for (int row : {0, 2, 5}) {  // 100%, 50%off, a%b
            ASSERT_TRUE(hits[row]) << "row " << row;
        }
    }

    // "a\_b": only the literal string "a_b"
    {
        auto hits = index->PatternMatch("a\\_b", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[3]);
    }

    // "a\%b": only the literal string "a%b"
    {
        auto hits = index->PatternMatch("a\\%b", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[5]);
    }
}
TEST(StringIndexSortPatternMatchTest, PatternMatchNoPrefix) {
    std::vector<std::string> values = {
        "hello world",  // 0
        "world hello",  // 1
        "hello",        // 2
        "world",        // 3
        "say hello",    // 4
    };
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // "%hello": no fixed prefix, behaves like a suffix match
    {
        auto hits = index->PatternMatch("%hello", OpType::Match);
        ASSERT_EQ(hits.count(), 3);
        for (int row : {1, 2, 4}) {  // world hello, hello, say hello
            ASSERT_TRUE(hits[row]) << "row " << row;
        }
    }

    // "%world%": no fixed prefix, behaves like a substring match
    {
        auto hits = index->PatternMatch("%world%", OpType::Match);
        ASSERT_EQ(hits.count(), 3);
        for (int row : {0, 1, 3}) {  // hello world, world hello, world
            ASSERT_TRUE(hits[row]) << "row " << row;
        }
    }
}
TEST(StringIndexSortPatternMatchTest, PatternMatchMmap) {
    std::vector<std::string> values = {
        "apple",        // 0
        "application",  // 1
        "apply",        // 2
        "banana",       // 3
        "band",         // 4
    };

    // Removes the mmap backing file when the test body exits, including the
    // early return taken by a failing ASSERT_*. Declared ahead of the indexes
    // so that it is destroyed after them.
    const std::string mmap_path = "/tmp/test_pattern_match_mmap.idx";
    struct ScopedFileRemoval {
        const std::string& path;
        ~ScopedFileRemoval() {
            std::remove(path.c_str());
        }
    } cleanup{mmap_path};

    milvus::Config config;
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // Serialize and reload with an mmap file path configured
    auto binary_set = index->Serialize(config);
    milvus::Config mmap_config;
    mmap_config[milvus::index::MMAP_FILE_PATH] = mmap_path;
    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    // "app%ion": only application
    {
        auto hits = mmap_index->PatternMatch("app%ion", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[1]);
    }

    // "ban%": banana and band
    {
        auto hits = mmap_index->PatternMatch("ban%", OpType::Match);
        ASSERT_EQ(hits.count(), 2);
        for (int row : {3, 4}) {
            ASSERT_TRUE(hits[row]) << "row " << row;
        }
    }
}
TEST(StringIndexSortPatternMatchTest, PatternMatchComplexEscape) {
    std::vector<std::string> values = {
        "10%_off",   // 0 - contains both % and _
        "10%aoff",   // 1
        "10%boff",   // 2
        "10a_off",   // 3
        "discount",  // 4
    };
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // "10\%\_off": both wildcards escaped, so only the literal "10%_off"
    {
        auto hits = index->PatternMatch("10\\%\\_off", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[0]);
    }

    // "10\%_off": literal "10%", then any single character, then "off"
    {
        auto hits = index->PatternMatch("10\\%_off", OpType::Match);
        ASSERT_EQ(hits.count(), 3);
        for (int row : {0, 1, 2}) {  // 10%_off, 10%aoff, 10%boff
            ASSERT_TRUE(hits[row]) << "row " << row;
        }
    }
}
TEST(StringIndexSortPatternMatchTest, PatternMatchPrefixOp) {
    std::vector<std::string> values = {
        "apple",        // 0
        "application",  // 1
        "banana",       // 2
    };
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // With the PrefixMatch op the pattern is a plain prefix, not a LIKE
    // pattern; PatternMatch is expected to delegate to PrefixMatch.
    auto hits = index->PatternMatch("app", OpType::PrefixMatch);
    ASSERT_EQ(hits.count(), 2);
    for (int row : {0, 1}) {  // apple, application
        ASSERT_TRUE(hits[row]) << "row " << row;
    }
}
TEST(StringIndexSortPatternMatchTest, PatternMatchDuplicateValues) {
    // One value repeated across rows 0..999 plus a single distinct value at
    // row 1000: a match on the repeated value must set all of its rows.
    std::vector<std::string> values(1000, "repeated_value");
    values.push_back("other_value");

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // Pattern that matches repeated_value
    auto repeated_hits = index->PatternMatch("repeated%", OpType::Match);
    ASSERT_EQ(repeated_hits.count(), 1000);

    // Pattern that matches other_value
    auto other_hits = index->PatternMatch("other%", OpType::Match);
    ASSERT_EQ(other_hits.count(), 1);
    ASSERT_TRUE(other_hits[1000]);
}
TEST(StringIndexSortPatternMatchTest, PostfixMatch) {
    std::vector<std::string> values = {
        "hello_world",  // 0 - ends with "world"
        "new_world",    // 1 - ends with "world"
        "world_peace",  // 2 - ends with "peace"
        "hello",        // 3
        "world",        // 4 - ends with "world"
    };
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // Suffix "world": rows 0, 1 and 4, but not world_peace or hello
    auto world_hits = index->PatternMatch("world", OpType::PostfixMatch);
    ASSERT_EQ(world_hits.count(), 3);
    for (int row : {0, 1}) {
        ASSERT_TRUE(world_hits[row]);
    }
    for (int row : {2, 3}) {
        ASSERT_FALSE(world_hits[row]);
    }
    ASSERT_TRUE(world_hits[4]);

    // Suffix "peace": only world_peace
    auto peace_hits = index->PatternMatch("peace", OpType::PostfixMatch);
    ASSERT_EQ(peace_hits.count(), 1);
    ASSERT_TRUE(peace_hits[2]);
}
TEST(StringIndexSortPatternMatchTest, InnerMatch) {
    std::vector<std::string> values = {
        "hello_world",  // 0 - contains "world"
        "new_world",    // 1 - contains "world"
        "world_peace",  // 2 - contains "world"
        "hello",        // 3 - no "world"
        "worldwide",    // 4 - contains "world"
    };
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // Substring "world": every row except hello
    auto world_hits = index->PatternMatch("world", OpType::InnerMatch);
    ASSERT_EQ(world_hits.count(), 4);
    for (int row : {0, 1, 2}) {
        ASSERT_TRUE(world_hits[row]);
    }
    ASSERT_FALSE(world_hits[3]);
    ASSERT_TRUE(world_hits[4]);

    // Substring "ello": hello_world and hello only
    auto ello_hits = index->PatternMatch("ello", OpType::InnerMatch);
    ASSERT_EQ(ello_hits.count(), 2);
    ASSERT_TRUE(ello_hits[0]);
    for (int row : {1, 2}) {
        ASSERT_FALSE(ello_hits[row]);
    }
    ASSERT_TRUE(ello_hits[3]);
    ASSERT_FALSE(ello_hits[4]);
}
TEST(StringIndexSortPatternMatchTest, PostfixMatchMmap) {
    std::vector<std::string> values = {
        "application",  // 0 - ends with "ion"
        "revolution",   // 1 - ends with "ion"
        "apple",        // 2
    };
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // Serialize and reload with an mmap file path configured, the same way
    // PatternMatchMmap does. Loading with an empty config gives the index no
    // mmap file to work with.
    const std::string mmap_path = "/tmp/test_postfix_match_mmap.idx";
    auto binaryset = index->Serialize({});
    milvus::Config mmap_config;
    mmap_config[milvus::index::MMAP_FILE_PATH] = mmap_path;
    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binaryset, mmap_config);

    // PostfixMatch on the reloaded index
    auto hits = mmap_index->PatternMatch("ion", OpType::PostfixMatch);
    ASSERT_EQ(hits.count(), 2);
    ASSERT_TRUE(hits[0]);   // application
    ASSERT_TRUE(hits[1]);   // revolution
    ASSERT_FALSE(hits[2]);  // apple

    std::remove(mmap_path.c_str());
}
TEST(StringIndexSortPatternMatchTest, InnerMatchMmap) {
    std::vector<std::string> values = {
        "application",  // 0 - contains "cat"
        "category",     // 1 - contains "cat"
        "dog",          // 2 - no "cat"
    };
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(values.size(), values.data());

    // Serialize and reload with an mmap file path configured, the same way
    // PatternMatchMmap does. Loading with an empty config gives the index no
    // mmap file to work with.
    const std::string mmap_path = "/tmp/test_inner_match_mmap.idx";
    auto binaryset = index->Serialize({});
    milvus::Config mmap_config;
    mmap_config[milvus::index::MMAP_FILE_PATH] = mmap_path;
    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binaryset, mmap_config);

    // InnerMatch on the reloaded index
    auto hits = mmap_index->PatternMatch("cat", OpType::InnerMatch);
    ASSERT_EQ(hits.count(), 2);
    ASSERT_TRUE(hits[0]);   // application
    ASSERT_TRUE(hits[1]);   // category
    ASSERT_FALSE(hits[2]);  // dog

    std::remove(mmap_path.c_str());
}