mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-28 14:35:27 +08:00
enhance: STL_SORT to support LIKE operator (#46534)
issue: https://github.com/milvus-io/milvus/issues/44399 <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit ## Release Notes * **New Features** * Enhanced pattern matching for string indexes with support for prefix, postfix, inner, and regex-based matching operations. * Optimized pattern matching performance through prefix-based filtering and range-based lookups. * **Tests** * Added comprehensive test coverage for pattern matching functionality across multiple index implementations. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai --> Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
This commit is contained in:
parent
9ba0c4e501
commit
6ac66e38d1
@ -60,4 +60,28 @@ translate_pattern_match_to_regex(const std::string& pattern) {
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
// Returns the literal text that every match of the LIKE `pattern` must start
// with: the characters preceding the first unescaped '%' or '_'.
//
// A backslash escapes the next character, which is then copied verbatim (so
// "\%" and "\_" contribute a literal '%' / '_'). A backslash that is the last
// character of the pattern contributes nothing.
std::string
extract_fixed_prefix_from_pattern(const std::string& pattern) {
    std::string fixed;
    fixed.reserve(pattern.size());

    bool pending_escape = false;
    for (const char ch : pattern) {
        if (pending_escape) {
            // Previous character was a backslash: take this one literally.
            fixed.push_back(ch);
            pending_escape = false;
            continue;
        }
        if (ch == '\\') {
            pending_escape = true;
            continue;
        }
        if (ch == '%' || ch == '_') {
            // First real wildcard ends the fixed part.
            break;
        }
        fixed.push_back(ch);
    }
    return fixed;
}
|
||||
|
||||
} // namespace milvus
|
||||
|
||||
@ -73,4 +73,10 @@ inline bool
|
||||
RegexMatcher::operator()(const std::string_view& operand) {
|
||||
return boost::regex_match(operand.begin(), operand.end(), r_);
|
||||
}
|
||||
|
||||
// Extract the fixed prefix of a LIKE pattern: the literal text before the
// first unescaped % or _ wildcard. A backslash escapes the character after
// it, so an escaped wildcard stays in the prefix as a literal character.
|
||||
// Examples: "abc%def" -> "abc", "ab_cd%" -> "ab", "%abc" -> "",
//           "a\%b%" -> "a%b"
|
||||
std::string
|
||||
extract_fixed_prefix_from_pattern(const std::string& pattern);
|
||||
|
||||
} // namespace milvus
|
||||
|
||||
@ -151,3 +151,79 @@ TEST(RegexMatcherTest, PatternMatchWithNewLine) {
|
||||
|
||||
EXPECT_TRUE(matcher(std::string("Hello\n")));
|
||||
}
|
||||
|
||||
// ============== extract_fixed_prefix_from_pattern Tests ==============
|
||||
|
||||
TEST(ExtractFixedPrefixTest, SimplePrefix) {
    // The prefix is whatever precedes the first '%'; later text is ignored.
    struct Case {
        const char* pattern;
        const char* expected;
    };
    const Case cases[] = {
        {"abc%", "abc"},
        {"abc%def", "abc"},
        {"hello%world%", "hello"},
    };
    for (const auto& c : cases) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(c.pattern),
                  c.expected)
            << "pattern: " << c.pattern;
    }
}
|
||||
|
||||
TEST(ExtractFixedPrefixTest, UnderscoreWildcard) {
    // An unescaped '_' ends the prefix just like '%' does.
    struct Case {
        const char* pattern;
        const char* expected;
    };
    const Case cases[] = {
        {"a_c", "a"},
        {"ab_cd%", "ab"},
        {"_abc", ""},  // leading wildcard: nothing is fixed
    };
    for (const auto& c : cases) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(c.pattern),
                  c.expected)
            << "pattern: " << c.pattern;
    }
}
|
||||
|
||||
TEST(ExtractFixedPrefixTest, NoPrefix) {
    // Patterns that open with a wildcard have an empty fixed prefix.
    const char* const patterns[] = {"%abc", "%abc%", "%", "_"};
    for (const char* pattern : patterns) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(pattern), "")
            << "pattern: " << pattern;
    }
}
|
||||
|
||||
TEST(ExtractFixedPrefixTest, EscapedPercent) {
    // "\%" is a literal percent sign and therefore part of the prefix.
    struct Case {
        const char* pattern;
        const char* expected;
    };
    const Case cases[] = {
        {"100\\%", "100%"},
        {"a\\%b%", "a%b"},
        {"100\\%\\%", "100%%"},
    };
    for (const auto& c : cases) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(c.pattern),
                  c.expected)
            << "pattern: " << c.pattern;
    }
}
|
||||
|
||||
TEST(ExtractFixedPrefixTest, EscapedUnderscore) {
    // "\_" is a literal underscore; only an unescaped '_' ends the prefix.
    struct Case {
        const char* pattern;
        const char* expected;
    };
    const Case cases[] = {
        {"a\\_b", "a_b"},
        {"a\\_b%", "a_b"},
        {"a\\_b_c", "a_b"},  // stops at the second, unescaped '_'
    };
    for (const auto& c : cases) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(c.pattern),
                  c.expected)
            << "pattern: " << c.pattern;
    }
}
|
||||
|
||||
TEST(ExtractFixedPrefixTest, MixedEscape) {
    // Escaped '%' and '_' may be combined freely inside the fixed prefix.
    struct Case {
        const char* pattern;
        const char* expected;
    };
    const Case cases[] = {
        {"10\\%\\_off%", "10%_off"},
        {"a\\%b\\_c%d", "a%b_c"},
    };
    for (const auto& c : cases) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(c.pattern),
                  c.expected)
            << "pattern: " << c.pattern;
    }
}
|
||||
|
||||
TEST(ExtractFixedPrefixTest, NoWildcard) {
    // Without any wildcard the whole pattern is the fixed prefix.
    const char* const patterns[] = {"abc", "hello world"};
    for (const char* pattern : patterns) {
        EXPECT_EQ(milvus::extract_fixed_prefix_from_pattern(pattern), pattern)
            << "pattern: " << pattern;
    }
}
|
||||
|
||||
TEST(ExtractFixedPrefixTest, EmptyPattern) {
    // Degenerate input: an empty pattern has an empty prefix.
    const std::string prefix = milvus::extract_fixed_prefix_from_pattern("");
    EXPECT_EQ(prefix, "");
}
|
||||
|
||||
@ -25,6 +25,7 @@
|
||||
#include <filesystem>
|
||||
#include "storage/FileWriter.h"
|
||||
#include "common/CDataType.h"
|
||||
#include "common/RegexQuery.h"
|
||||
#include "knowhere/log.h"
|
||||
#include "index/Meta.h"
|
||||
#include "common/Utils.h"
|
||||
@ -389,6 +390,29 @@ StringIndexSort::PrefixMatch(const std::string_view prefix) {
|
||||
return impl_->PrefixMatch(prefix, total_num_rows_);
|
||||
}
|
||||
|
||||
// Evaluates a pattern-style predicate against the index and returns the
// bitmap of matching rows.
//
// PrefixMatch is routed to the dedicated PrefixMatch() path. Match,
// PostfixMatch and InnerMatch are forwarded to the active implementation
// together with the total row count. Any other op throws Unsupported.
const TargetBitmap
StringIndexSort::PatternMatch(const std::string& pattern,
                              proto::plan::OpType op) {
    using proto::plan::OpType;

    assert(impl_ != nullptr);

    if (op == OpType::PrefixMatch) {
        return PrefixMatch(pattern);
    }

    // Ops the implementation evaluates once per unique value.
    const bool handled_by_impl = op == OpType::Match ||
                                 op == OpType::PostfixMatch ||
                                 op == OpType::InnerMatch;
    if (!handled_by_impl) {
        ThrowInfo(Unsupported,
                  "StringIndexSort::PatternMatch only supports Match, "
                  "PrefixMatch, PostfixMatch, InnerMatch, got op: {}",
                  static_cast<int>(op));
    }

    return impl_->PatternMatch(pattern, op, total_num_rows_);
}
|
||||
|
||||
std::optional<std::string>
|
||||
StringIndexSort::Reverse_Lookup(size_t offset) const {
|
||||
assert(impl_ != nullptr);
|
||||
@ -822,20 +846,122 @@ StringIndexSortMemoryImpl::PrefixMatch(const std::string_view prefix,
|
||||
size_t total_num_rows) {
|
||||
TargetBitmap bitset(total_num_rows, false);
|
||||
|
||||
auto it = std::lower_bound(
|
||||
unique_values_.begin(), unique_values_.end(), std::string(prefix));
|
||||
// Use FindPrefixRange for O(log n) lookup of both start and end
|
||||
auto [start_idx, end_idx] = FindPrefixRange(std::string(prefix));
|
||||
|
||||
size_t idx = std::distance(unique_values_.begin(), it);
|
||||
|
||||
while (idx < unique_values_.size()) {
|
||||
if (!milvus::PrefixMatch(unique_values_[idx], prefix)) {
|
||||
break;
|
||||
}
|
||||
for (size_t idx = start_idx; idx < end_idx; ++idx) {
|
||||
const auto& posting_list = posting_lists_[idx];
|
||||
for (uint32_t row_id : posting_list) {
|
||||
bitset[row_id] = true;
|
||||
}
|
||||
++idx;
|
||||
}
|
||||
|
||||
return bitset;
|
||||
}
|
||||
|
||||
std::pair<size_t, size_t>
|
||||
StringIndexSortMemoryImpl::FindPrefixRange(const std::string& prefix) const {
|
||||
if (prefix.empty()) {
|
||||
return {0, unique_values_.size()};
|
||||
}
|
||||
|
||||
// Binary search for start: first value >= prefix
|
||||
auto start_it =
|
||||
std::lower_bound(unique_values_.begin(), unique_values_.end(), prefix);
|
||||
size_t start_idx = std::distance(unique_values_.begin(), start_it);
|
||||
|
||||
// Compute "next prefix" for end boundary: "abc" -> "abd"
|
||||
// Range is [prefix, next_prefix), all strings starting with prefix
|
||||
std::string next_prefix = prefix;
|
||||
bool has_next = false;
|
||||
// Find rightmost char that can be incremented (not 0xFF)
|
||||
for (int i = next_prefix.size() - 1; i >= 0; --i) {
|
||||
if (static_cast<unsigned char>(next_prefix[i]) < 255) {
|
||||
++next_prefix[i];
|
||||
next_prefix.resize(i + 1);
|
||||
has_next = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
size_t end_idx;
|
||||
if (has_next) {
|
||||
// Binary search for end: first value >= next_prefix
|
||||
auto end_it = std::lower_bound(
|
||||
unique_values_.begin(), unique_values_.end(), next_prefix);
|
||||
end_idx = std::distance(unique_values_.begin(), end_it);
|
||||
} else {
|
||||
// All chars are 0xFF, no upper bound
|
||||
end_idx = unique_values_.size();
|
||||
}
|
||||
|
||||
return {start_idx, end_idx};
|
||||
}
|
||||
|
||||
bool
|
||||
StringIndexSortMemoryImpl::MatchValue(const std::string& value,
|
||||
const std::string& pattern,
|
||||
proto::plan::OpType op) const {
|
||||
switch (op) {
|
||||
case proto::plan::OpType::PostfixMatch:
|
||||
// Suffix match: value ends with pattern
|
||||
if (pattern.size() > value.size()) {
|
||||
return false;
|
||||
}
|
||||
return value.compare(value.size() - pattern.size(),
|
||||
pattern.size(),
|
||||
pattern) == 0;
|
||||
case proto::plan::OpType::InnerMatch:
|
||||
// Contains match: value contains pattern
|
||||
return value.find(pattern) != std::string::npos;
|
||||
default:
|
||||
// For Match op, use regex (handled separately)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
const TargetBitmap
|
||||
StringIndexSortMemoryImpl::PatternMatch(const std::string& pattern,
|
||||
proto::plan::OpType op,
|
||||
size_t total_num_rows) {
|
||||
TargetBitmap bitset(total_num_rows, false);
|
||||
|
||||
// For PostfixMatch and InnerMatch, no prefix optimization possible
|
||||
// Still benefits from unique value deduplication
|
||||
if (op == proto::plan::OpType::PostfixMatch ||
|
||||
op == proto::plan::OpType::InnerMatch) {
|
||||
// Iterate over all unique values
|
||||
for (size_t idx = 0; idx < unique_values_.size(); ++idx) {
|
||||
if (MatchValue(unique_values_[idx], pattern, op)) {
|
||||
const auto& posting_list = posting_lists_[idx];
|
||||
for (uint32_t row_id : posting_list) {
|
||||
bitset[row_id] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return bitset;
|
||||
}
|
||||
|
||||
// For Match op, use prefix optimization + regex
|
||||
std::string prefix = extract_fixed_prefix_from_pattern(pattern);
|
||||
|
||||
// Find the range of unique values to check
|
||||
auto [start_idx, end_idx] = FindPrefixRange(prefix);
|
||||
|
||||
// Build regex matcher
|
||||
PatternMatchTranslator translator;
|
||||
auto regex_pattern = translator(pattern);
|
||||
RegexMatcher matcher(regex_pattern);
|
||||
|
||||
// Iterate over unique values in range (each value checked only once)
|
||||
for (size_t idx = start_idx; idx < end_idx; ++idx) {
|
||||
if (matcher(unique_values_[idx])) {
|
||||
// Match found, set all row IDs in posting list
|
||||
const auto& posting_list = posting_lists_[idx];
|
||||
for (uint32_t row_id : posting_list) {
|
||||
bitset[row_id] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return bitset;
|
||||
@ -1149,23 +1275,113 @@ StringIndexSortMmapImpl::PrefixMatch(const std::string_view prefix,
|
||||
size_t total_num_rows) {
|
||||
TargetBitmap bitset(total_num_rows, false);
|
||||
|
||||
// Find the first string that is >= prefix
|
||||
size_t idx = LowerBound(prefix);
|
||||
// Use FindPrefixRange for O(log n) lookup of both start and end
|
||||
auto [start_idx, end_idx] = FindPrefixRange(std::string(prefix));
|
||||
|
||||
while (idx < unique_count_) {
|
||||
for (size_t idx = start_idx; idx < end_idx; ++idx) {
|
||||
MmapEntry entry = GetEntry(idx);
|
||||
std::string_view entry_sv = entry.get_string_view();
|
||||
|
||||
if (entry_sv.size() < prefix.size() ||
|
||||
entry_sv.substr(0, prefix.size()) != prefix) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Add all row_ids for this matching string
|
||||
entry.for_each_row_id(
|
||||
[&bitset](uint32_t row_id) { bitset.set(row_id); });
|
||||
}
|
||||
|
||||
++idx;
|
||||
return bitset;
|
||||
}
|
||||
|
||||
std::pair<size_t, size_t>
|
||||
StringIndexSortMmapImpl::FindPrefixRange(const std::string& prefix) const {
|
||||
if (prefix.empty()) {
|
||||
return {0, unique_count_};
|
||||
}
|
||||
|
||||
// Binary search for start
|
||||
size_t start_idx = LowerBound(prefix);
|
||||
|
||||
// Compute "next prefix" for end boundary: "abc" -> "abd"
|
||||
std::string next_prefix = prefix;
|
||||
bool has_next = false;
|
||||
for (int i = next_prefix.size() - 1; i >= 0; --i) {
|
||||
if (static_cast<unsigned char>(next_prefix[i]) < 255) {
|
||||
++next_prefix[i];
|
||||
next_prefix.resize(i + 1);
|
||||
has_next = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
size_t end_idx;
|
||||
if (has_next) {
|
||||
end_idx = LowerBound(next_prefix);
|
||||
} else {
|
||||
end_idx = unique_count_;
|
||||
}
|
||||
|
||||
return {start_idx, end_idx};
|
||||
}
|
||||
|
||||
bool
|
||||
StringIndexSortMmapImpl::MatchValue(const std::string& value,
|
||||
const std::string& pattern,
|
||||
proto::plan::OpType op) const {
|
||||
switch (op) {
|
||||
case proto::plan::OpType::PostfixMatch:
|
||||
// Suffix match: value ends with pattern
|
||||
if (pattern.size() > value.size()) {
|
||||
return false;
|
||||
}
|
||||
return value.compare(value.size() - pattern.size(),
|
||||
pattern.size(),
|
||||
pattern) == 0;
|
||||
case proto::plan::OpType::InnerMatch:
|
||||
// Contains match: value contains pattern
|
||||
return value.find(pattern) != std::string::npos;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
const TargetBitmap
|
||||
StringIndexSortMmapImpl::PatternMatch(const std::string& pattern,
|
||||
proto::plan::OpType op,
|
||||
size_t total_num_rows) {
|
||||
TargetBitmap bitset(total_num_rows, false);
|
||||
|
||||
// For PostfixMatch and InnerMatch, no prefix optimization possible
|
||||
// Still benefits from unique value deduplication
|
||||
if (op == proto::plan::OpType::PostfixMatch ||
|
||||
op == proto::plan::OpType::InnerMatch) {
|
||||
for (size_t idx = 0; idx < unique_count_; ++idx) {
|
||||
MmapEntry entry = GetEntry(idx);
|
||||
std::string_view sv = entry.get_string_view();
|
||||
|
||||
if (MatchValue(std::string(sv), pattern, op)) {
|
||||
entry.for_each_row_id(
|
||||
[&bitset](uint32_t row_id) { bitset.set(row_id); });
|
||||
}
|
||||
}
|
||||
return bitset;
|
||||
}
|
||||
|
||||
// For Match op, use prefix optimization + regex
|
||||
std::string prefix = extract_fixed_prefix_from_pattern(pattern);
|
||||
|
||||
// Find the range of unique values to check
|
||||
auto [start_idx, end_idx] = FindPrefixRange(prefix);
|
||||
|
||||
// Build regex matcher
|
||||
PatternMatchTranslator translator;
|
||||
auto regex_pattern = translator(pattern);
|
||||
RegexMatcher matcher(regex_pattern);
|
||||
|
||||
// Iterate over unique values in range (each value checked only once)
|
||||
for (size_t idx = start_idx; idx < end_idx; ++idx) {
|
||||
MmapEntry entry = GetEntry(idx);
|
||||
std::string_view sv = entry.get_string_view();
|
||||
|
||||
if (matcher(sv)) {
|
||||
// Match found, set all row IDs in posting list
|
||||
entry.for_each_row_id(
|
||||
[&bitset](uint32_t row_id) { bitset.set(row_id); });
|
||||
}
|
||||
}
|
||||
|
||||
return bitset;
|
||||
|
||||
@ -117,6 +117,14 @@ class StringIndexSort : public StringIndex {
|
||||
const TargetBitmap
|
||||
PrefixMatch(const std::string_view prefix) override;
|
||||
|
||||
// Reports whether this index implements PatternMatch(); always true here.
bool
|
||||
SupportPatternMatch() const override {
|
||||
return true;
|
||||
}
|
||||
|
||||
const TargetBitmap
|
||||
PatternMatch(const std::string& pattern, proto::plan::OpType op) override;
|
||||
|
||||
std::optional<std::string>
|
||||
Reverse_Lookup(size_t offset) const override;
|
||||
|
||||
@ -196,6 +204,11 @@ class StringIndexSortImpl {
|
||||
virtual const TargetBitmap
|
||||
PrefixMatch(const std::string_view prefix, size_t total_num_rows) = 0;
|
||||
|
||||
virtual const TargetBitmap
|
||||
PatternMatch(const std::string& pattern,
|
||||
proto::plan::OpType op,
|
||||
size_t total_num_rows) = 0;
|
||||
|
||||
virtual std::optional<std::string>
|
||||
Reverse_Lookup(size_t offset,
|
||||
size_t total_num_rows,
|
||||
@ -273,6 +286,11 @@ class StringIndexSortMemoryImpl : public StringIndexSortImpl {
|
||||
const TargetBitmap
|
||||
PrefixMatch(const std::string_view prefix, size_t total_num_rows) override;
|
||||
|
||||
const TargetBitmap
|
||||
PatternMatch(const std::string& pattern,
|
||||
proto::plan::OpType op,
|
||||
size_t total_num_rows) override;
|
||||
|
||||
std::optional<std::string>
|
||||
Reverse_Lookup(size_t offset,
|
||||
size_t total_num_rows,
|
||||
@ -290,6 +308,16 @@ class StringIndexSortMemoryImpl : public StringIndexSortImpl {
|
||||
size_t
|
||||
FindValueIndex(const std::string& value) const;
|
||||
|
||||
// Helper to find the range of unique values that start with a prefix
|
||||
std::pair<size_t, size_t>
|
||||
FindPrefixRange(const std::string& prefix) const;
|
||||
|
||||
// Check if value matches pattern based on op type
|
||||
bool
|
||||
MatchValue(const std::string& value,
|
||||
const std::string& pattern,
|
||||
proto::plan::OpType op) const;
|
||||
|
||||
void
|
||||
BuildFromMap(std::map<std::string, PostingList>&& unique_map,
|
||||
size_t total_num_rows,
|
||||
@ -390,6 +418,11 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl {
|
||||
const TargetBitmap
|
||||
PrefixMatch(const std::string_view prefix, size_t total_num_rows) override;
|
||||
|
||||
const TargetBitmap
|
||||
PatternMatch(const std::string& pattern,
|
||||
proto::plan::OpType op,
|
||||
size_t total_num_rows) override;
|
||||
|
||||
std::optional<std::string>
|
||||
Reverse_Lookup(size_t offset,
|
||||
size_t total_num_rows,
|
||||
@ -407,6 +440,12 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl {
|
||||
size_t
|
||||
FindValueIndex(const std::string& value) const;
|
||||
|
||||
// Check if value matches pattern based on op type
|
||||
bool
|
||||
MatchValue(const std::string& value,
|
||||
const std::string& pattern,
|
||||
proto::plan::OpType op) const;
|
||||
|
||||
// Binary search helpers
|
||||
size_t
|
||||
LowerBound(const std::string_view& value) const;
|
||||
@ -414,6 +453,10 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl {
|
||||
size_t
|
||||
UpperBound(const std::string_view& value) const;
|
||||
|
||||
// Find the range [start, end) of unique values that start with a prefix
|
||||
std::pair<size_t, size_t>
|
||||
FindPrefixRange(const std::string& prefix) const;
|
||||
|
||||
MmapEntry
|
||||
GetEntry(size_t idx) const {
|
||||
const uint8_t* str_ptr = string_data_start_ + string_offsets_[idx];
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
#include "index/StringIndexSort.h"
|
||||
#include "index/IndexFactory.h"
|
||||
#include "pb/plan.pb.h"
|
||||
#include "test_utils/indexbuilder_test_utils.h"
|
||||
|
||||
constexpr int64_t nb = 100;
|
||||
@ -605,3 +606,391 @@ TEST(StringIndexSortStandaloneTest, StringIndexSortSerialization) {
|
||||
ASSERT_GT(bitset.count(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
// ============== PatternMatch Tests ==============
|
||||
|
||||
using milvus::proto::plan::OpType;
|
||||
|
||||
// LIKE patterns with '%' against a freshly built index, including values that
// occur in more than one row.
TEST(StringIndexSortPatternMatchTest, PatternMatchBasicMemory) {
    std::vector<std::string> rows = {
        "apple",        // 0
        "application",  // 1
        "apply",        // 2
        "banana",       // 3
        "band",         // 4
        "cat",          // 5
        "category",     // 6
        "dog",          // 7
        "application",  // 8 (duplicate of 1)
        "apple"         // 9 (duplicate of 0)
    };

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    // "app%": apple (x2), application (x2), apply (x1).
    {
        auto hits = index->PatternMatch("app%", OpType::Match);
        ASSERT_EQ(hits.count(), 5);
        ASSERT_TRUE(hits[0]);  // apple
        ASSERT_TRUE(hits[1]);  // application
        ASSERT_TRUE(hits[2]);  // apply
        ASSERT_TRUE(hits[8]);  // application (dup)
        ASSERT_TRUE(hits[9]);  // apple (dup)
    }

    // "app%ion": fixed prefix plus fixed suffix, only "application".
    {
        auto hits = index->PatternMatch("app%ion", OpType::Match);
        ASSERT_EQ(hits.count(), 2);
        ASSERT_TRUE(hits[1]);  // application
        ASSERT_TRUE(hits[8]);  // application (dup)
    }

    // "%ana%": no fixed prefix, only "banana" contains "ana".
    {
        auto hits = index->PatternMatch("%ana%", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[3]);  // banana
    }

    // "cat%": cat and category.
    {
        auto hits = index->PatternMatch("cat%", OpType::Match);
        ASSERT_EQ(hits.count(), 2);
        ASSERT_TRUE(hits[5]);  // cat
        ASSERT_TRUE(hits[6]);  // category
    }
}
|
||||
|
||||
// '_' must match exactly one character, alone and combined with '%'.
TEST(StringIndexSortPatternMatchTest, PatternMatchWithUnderscoreMemory) {
    std::vector<std::string> rows = {
        "abc",   // 0
        "aXc",   // 1
        "a1c",   // 2
        "abcd",  // 3
        "ac",    // 4
        "abbc",  // 5
    };

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    // "a_c": exactly one character between 'a' and 'c'.
    {
        auto hits = index->PatternMatch("a_c", OpType::Match);
        ASSERT_EQ(hits.count(), 3);
        ASSERT_TRUE(hits[0]);   // abc
        ASSERT_TRUE(hits[1]);   // aXc
        ASSERT_TRUE(hits[2]);   // a1c
        ASSERT_FALSE(hits[3]);  // abcd - too long
        ASSERT_FALSE(hits[4]);  // ac - too short
        ASSERT_FALSE(hits[5]);  // abbc - two chars between
    }

    // "a_c%": same shape followed by any tail.
    {
        auto hits = index->PatternMatch("a_c%", OpType::Match);
        ASSERT_EQ(hits.count(), 4);
        ASSERT_TRUE(hits[0]);  // abc
        ASSERT_TRUE(hits[1]);  // aXc
        ASSERT_TRUE(hits[2]);  // a1c
        ASSERT_TRUE(hits[3]);  // abcd
    }
}
|
||||
|
||||
// Backslash-escaped '%' and '_' must be matched as literal characters.
TEST(StringIndexSortPatternMatchTest, PatternMatchEscapeMemory) {
    std::vector<std::string> rows = {
        "100%",        // 0 - contains literal %
        "100percent",  // 1
        "50%off",      // 2 - contains literal %
        "a_b",         // 3 - contains literal _
        "axb",         // 4
        "a%b",         // 5 - contains literal %
        "ab",          // 6
    };

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    // "100\%": only the literal string "100%".
    {
        auto hits = index->PatternMatch("100\\%", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[0]);  // 100%
    }

    // "%\%%": any string containing a literal '%'.
    {
        auto hits = index->PatternMatch("%\\%%", OpType::Match);
        ASSERT_EQ(hits.count(), 3);
        ASSERT_TRUE(hits[0]);  // 100%
        ASSERT_TRUE(hits[2]);  // 50%off
        ASSERT_TRUE(hits[5]);  // a%b
    }

    // "a\_b": only the literal string "a_b".
    {
        auto hits = index->PatternMatch("a\\_b", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[3]);  // a_b
    }

    // "a\%b": only the literal string "a%b".
    {
        auto hits = index->PatternMatch("a\\%b", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[5]);  // a%b
    }
}
|
||||
|
||||
// Patterns that start with '%' have no fixed prefix to narrow the search.
TEST(StringIndexSortPatternMatchTest, PatternMatchNoPrefix) {
    std::vector<std::string> rows = {
        "hello world",  // 0
        "world hello",  // 1
        "hello",        // 2
        "world",        // 3
        "say hello",    // 4
    };

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    // "%hello": values ending with "hello".
    {
        auto hits = index->PatternMatch("%hello", OpType::Match);
        ASSERT_EQ(hits.count(), 3);
        ASSERT_TRUE(hits[1]);  // world hello
        ASSERT_TRUE(hits[2]);  // hello
        ASSERT_TRUE(hits[4]);  // say hello
    }

    // "%world%": values containing "world".
    {
        auto hits = index->PatternMatch("%world%", OpType::Match);
        ASSERT_EQ(hits.count(), 3);
        ASSERT_TRUE(hits[0]);  // hello world
        ASSERT_TRUE(hits[1]);  // world hello
        ASSERT_TRUE(hits[3]);  // world
    }
}
|
||||
|
||||
// Match patterns against an index that was serialized and then reloaded with
// an mmap file path configured.
TEST(StringIndexSortPatternMatchTest, PatternMatchMmap) {
    std::vector<std::string> rows = {
        "apple",        // 0
        "application",  // 1
        "apply",        // 2
        "banana",       // 3
        "band",         // 4
    };

    milvus::Config config;
    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    auto binary_set = index->Serialize(config);

    // Single definition of the backing file so load and cleanup agree.
    const std::string mmap_path = "/tmp/test_pattern_match_mmap.idx";
    milvus::Config mmap_config;
    mmap_config[milvus::index::MMAP_FILE_PATH] = mmap_path;

    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binary_set, mmap_config);

    // "app%ion": fixed prefix plus fixed suffix.
    {
        auto hits = mmap_index->PatternMatch("app%ion", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[1]);  // application
    }

    // "ban%": plain prefix pattern.
    {
        auto hits = mmap_index->PatternMatch("ban%", OpType::Match);
        ASSERT_EQ(hits.count(), 2);
        ASSERT_TRUE(hits[3]);  // banana
        ASSERT_TRUE(hits[4]);  // band
    }

    std::remove(mmap_path.c_str());
}
|
||||
|
||||
// Escaped and unescaped wildcards mixed in the same pattern.
TEST(StringIndexSortPatternMatchTest, PatternMatchComplexEscape) {
    std::vector<std::string> rows = {
        "10%_off",   // 0 - contains both % and _
        "10%aoff",   // 1
        "10%boff",   // 2
        "10a_off",   // 3
        "discount",  // 4
    };

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    // "10\%\_off": both wildcards escaped, only the literal "10%_off".
    {
        auto hits = index->PatternMatch("10\\%\\_off", OpType::Match);
        ASSERT_EQ(hits.count(), 1);
        ASSERT_TRUE(hits[0]);
    }

    // "10\%_off": literal "10%", then any single character, then "off".
    {
        auto hits = index->PatternMatch("10\\%_off", OpType::Match);
        ASSERT_EQ(hits.count(), 3);
        ASSERT_TRUE(hits[0]);  // 10%_off
        ASSERT_TRUE(hits[1]);  // 10%aoff
        ASSERT_TRUE(hits[2]);  // 10%boff
    }
}
|
||||
|
||||
// PatternMatch called with OpType::PrefixMatch treats the pattern as a plain
// prefix.
TEST(StringIndexSortPatternMatchTest, PatternMatchPrefixOp) {
    std::vector<std::string> rows = {
        "apple",        // 0
        "application",  // 1
        "banana",       // 2
    };

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    auto hits = index->PatternMatch("app", OpType::PrefixMatch);
    ASSERT_EQ(hits.count(), 2);
    ASSERT_TRUE(hits[0]);  // apple
    ASSERT_TRUE(hits[1]);  // application
}
|
||||
|
||||
// A value shared by many rows must set every one of those rows in the result.
TEST(StringIndexSortPatternMatchTest, PatternMatchDuplicateValues) {
    constexpr int kRepeats = 1000;

    std::vector<std::string> rows;
    rows.reserve(kRepeats + 1);
    for (int i = 0; i < kRepeats; ++i) {
        rows.push_back("repeated_value");
    }
    rows.push_back("other_value");  // row index kRepeats

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    // Matches the repeated value: all of its rows are reported.
    auto repeated_hits = index->PatternMatch("repeated%", OpType::Match);
    ASSERT_EQ(repeated_hits.count(), 1000);

    // Matches the single distinct value.
    auto other_hits = index->PatternMatch("other%", OpType::Match);
    ASSERT_EQ(other_hits.count(), 1);
    ASSERT_TRUE(other_hits[1000]);
}
|
||||
|
||||
// OpType::PostfixMatch selects values that end with the given literal.
TEST(StringIndexSortPatternMatchTest, PostfixMatch) {
    std::vector<std::string> rows = {
        "hello_world",  // 0 - ends with "world"
        "new_world",    // 1 - ends with "world"
        "world_peace",  // 2 - ends with "peace"
        "hello",        // 3
        "world",        // 4 - ends with "world"
    };

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    // Suffix "world".
    auto world_hits = index->PatternMatch("world", OpType::PostfixMatch);
    ASSERT_EQ(world_hits.count(), 3);
    ASSERT_TRUE(world_hits[0]);   // hello_world
    ASSERT_TRUE(world_hits[1]);   // new_world
    ASSERT_FALSE(world_hits[2]);  // world_peace
    ASSERT_FALSE(world_hits[3]);  // hello
    ASSERT_TRUE(world_hits[4]);   // world

    // Suffix "peace".
    auto peace_hits = index->PatternMatch("peace", OpType::PostfixMatch);
    ASSERT_EQ(peace_hits.count(), 1);
    ASSERT_TRUE(peace_hits[2]);  // world_peace
}
|
||||
|
||||
// OpType::InnerMatch selects values that contain the given literal anywhere.
TEST(StringIndexSortPatternMatchTest, InnerMatch) {
    std::vector<std::string> rows = {
        "hello_world",  // 0 - contains "world"
        "new_world",    // 1 - contains "world"
        "world_peace",  // 2 - contains "world"
        "hello",        // 3 - no "world"
        "worldwide",    // 4 - contains "world"
    };

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    // Substring "world".
    auto world_hits = index->PatternMatch("world", OpType::InnerMatch);
    ASSERT_EQ(world_hits.count(), 4);
    ASSERT_TRUE(world_hits[0]);   // hello_world
    ASSERT_TRUE(world_hits[1]);   // new_world
    ASSERT_TRUE(world_hits[2]);   // world_peace
    ASSERT_FALSE(world_hits[3]);  // hello
    ASSERT_TRUE(world_hits[4]);   // worldwide

    // Substring "ello".
    auto ello_hits = index->PatternMatch("ello", OpType::InnerMatch);
    ASSERT_EQ(ello_hits.count(), 2);
    ASSERT_TRUE(ello_hits[0]);  // hello_world
    ASSERT_FALSE(ello_hits[1]);
    ASSERT_FALSE(ello_hits[2]);
    ASSERT_TRUE(ello_hits[3]);  // hello
    ASSERT_FALSE(ello_hits[4]);
}
|
||||
|
||||
// PostfixMatch against an index reloaded with an mmap file path configured,
// mirroring the load setup used by PatternMatchMmap.
TEST(StringIndexSortPatternMatchTest, PostfixMatchMmap) {
    std::vector<std::string> rows = {
        "application",  // 0 - ends with "ion"
        "revolution",   // 1 - ends with "ion"
        "apple",        // 2
    };

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    auto binaryset = index->Serialize({});

    // Loading with an empty config would not request mmap, so the file path
    // is set explicitly here.
    const std::string mmap_path = "/tmp/test_postfix_match_mmap.idx";
    milvus::Config mmap_config;
    mmap_config[milvus::index::MMAP_FILE_PATH] = mmap_path;

    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binaryset, mmap_config);

    auto hits = mmap_index->PatternMatch("ion", OpType::PostfixMatch);
    ASSERT_EQ(hits.count(), 2);
    ASSERT_TRUE(hits[0]);   // application
    ASSERT_TRUE(hits[1]);   // revolution
    ASSERT_FALSE(hits[2]);  // apple

    std::remove(mmap_path.c_str());
}
|
||||
|
||||
// InnerMatch against an index reloaded with an mmap file path configured,
// mirroring the load setup used by PatternMatchMmap.
TEST(StringIndexSortPatternMatchTest, InnerMatchMmap) {
    std::vector<std::string> rows = {
        "application",  // 0 - contains "cat"
        "category",     // 1 - contains "cat"
        "dog",          // 2 - no "cat"
    };

    auto index = milvus::index::CreateStringIndexSort({});
    index->Build(rows.size(), rows.data());

    auto binaryset = index->Serialize({});

    // Loading with an empty config would not request mmap, so the file path
    // is set explicitly here.
    const std::string mmap_path = "/tmp/test_inner_match_mmap.idx";
    milvus::Config mmap_config;
    mmap_config[milvus::index::MMAP_FILE_PATH] = mmap_path;

    auto mmap_index = milvus::index::CreateStringIndexSort({});
    mmap_index->Load(binaryset, mmap_config);

    auto hits = mmap_index->PatternMatch("cat", OpType::InnerMatch);
    ASSERT_EQ(hits.count(), 2);
    ASSERT_TRUE(hits[0]);   // application
    ASSERT_TRUE(hits[1]);   // category
    ASSERT_FALSE(hits[2]);  // dog

    std::remove(mmap_path.c_str());
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user