milvus/internal/core/unittest/test_text_match.cpp
Spade A 52c7d7dd80
fix: offset combined with term should be based on Token positions in phrase match (#39931)
fix: #39711

Unlike English sentence where each words are parsed exactly once and one
after one with position length 1, one Chinese word may be parsed to
multiple words with position length larger than 1.

For example, "badminton and skiing" will be parsed to Token{ start: 0,
length: 1, text: "badminton" }, Token{ start: 1, length: 1, text: "and"
}, and Token{ start: 2, length: 1, text: "tennis" }.

While for exmaple for Chinsese: "羽毛球和滑雪" may be parsed to Token{ start:
0, length: 2, text: "羽毛" }, Token{ start: 0, length: 3, text: "羽毛球" },
Token{ start: 3, length: 1, text: "和" }, and Token{ start: 4, length: 2,
text: "滑雪" }.

This PR fix that the code not recognizes this situation.

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
2025-02-18 20:38:51 +08:00

1001 lines
35 KiB
C++

// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <string>
#include "common/Schema.h"
#include "segcore/SegmentGrowing.h"
#include "segcore/SegmentGrowingImpl.h"
#include "test_utils/DataGen.h"
#include "test_utils/GenExprProto.h"
#include "query/PlanProto.h"
#include "query/ExecPlanNodeVisitor.h"
#include "expr/ITypeExpr.h"
#include "segcore/segment_c.h"
#include "test_utils/storage_test_utils.h"
using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;
namespace {
SchemaPtr
GenTestSchema(std::map<std::string, std::string> params = {},
bool nullable = false) {
auto schema = std::make_shared<Schema>();
{
FieldMeta f(FieldName("pk"), FieldId(100), DataType::INT64, false);
schema->AddField(std::move(f));
schema->set_primary_field_id(FieldId(100));
}
{
FieldMeta f(FieldName("str"),
FieldId(101),
DataType::VARCHAR,
65536,
nullable,
true,
true,
params);
schema->AddField(std::move(f));
}
{
FieldMeta f(FieldName("fvec"),
FieldId(102),
DataType::VECTOR_FLOAT,
16,
knowhere::metric::L2,
false);
schema->AddField(std::move(f));
}
return schema;
}
std::shared_ptr<milvus::plan::FilterBitsNode>
GetMatchExpr(SchemaPtr schema,
const std::string& query,
proto::plan::OpType op,
int64_t slop = 0) {
const auto& str_meta = schema->operator[](FieldName("str"));
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto unary_range_expr = test::GenUnaryRangeExpr(op, query);
unary_range_expr->set_allocated_column_info(column_info);
auto generic_for_slop = milvus::test::GenGenericValue(slop);
unary_range_expr->add_extra_values()->CopyFrom(*generic_for_slop);
delete generic_for_slop;
auto expr = test::GenExpr();
expr->set_allocated_unary_range_expr(unary_range_expr);
auto parser = ProtoParser(*schema);
auto typed_expr = parser.ParseExprs(*expr);
auto parsed =
std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);
return parsed;
};
std::shared_ptr<milvus::plan::FilterBitsNode>
GetNotMatchExpr(SchemaPtr schema,
const std::string& query,
proto::plan::OpType op,
int64_t slop = 0) {
const auto& str_meta = schema->operator[](FieldName("str"));
proto::plan::GenericValue val;
val.set_string_val(query);
std::vector<proto::plan::GenericValue> extra_values;
auto generic_for_slop = milvus::test::GenGenericValue(slop);
extra_values.push_back(*generic_for_slop);
delete generic_for_slop;
auto child_expr = std::make_shared<expr::UnaryRangeFilterExpr>(
milvus::expr::ColumnInfo(str_meta.get_id(), DataType::VARCHAR),
op,
val,
extra_values);
auto expr = std::make_shared<expr::LogicalUnaryExpr>(
expr::LogicalUnaryExpr::OpType::LogicalNot, child_expr);
auto parsed =
std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, expr);
return parsed;
};
} // namespace
TEST(ParseJson, Naive) {
{
std::string s(R"({"tokenizer": "jieba"})");
nlohmann::json j = nlohmann::json::parse(s);
auto m = j.get<std::map<std::string, std::string>>();
for (const auto& [k, v] : m) {
std::cout << k << ": " << v << std::endl;
}
}
{
std::string s(
R"({"analyzer":"stop","stop_words":["an","the"],"case_insensitive":false})");
nlohmann::json j = nlohmann::json::parse(s);
for (const auto& [key, value] : j.items()) {
std::cout << key << ": " << value.dump() << std::endl;
}
}
}
TEST(ParseTokenizerParams, NoTokenizerParams) {
TypeParams params{{"k", "v"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ("{}", std::string(p));
}
TEST(ParseTokenizerParams, Default) {
TypeParams params{{"analyzer_params", R"({"tokenizer": "standard"})"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(params.at("analyzer_params"), p);
}
TEST(TextMatch, Index) {
using Index = index::TextMatchIndex;
auto index = std::make_unique<Index>(std::numeric_limits<int64_t>::max(),
"unique_id",
"milvus_tokenizer",
"{}");
index->CreateReader();
index->AddText("football, basketball, pingpang", true, 0);
index->AddText("", false, 1);
index->AddText("swimming, football", true, 2);
index->Commit();
index->Reload();
{
auto res = index->MatchQuery("football");
ASSERT_EQ(res.size(), 3);
ASSERT_TRUE(res[0]);
ASSERT_FALSE(res[1]);
ASSERT_TRUE(res[2]);
auto res1 = index->IsNull();
ASSERT_FALSE(res1[0]);
ASSERT_TRUE(res1[1]);
ASSERT_FALSE(res1[2]);
auto res2 = index->IsNotNull();
ASSERT_TRUE(res2[0]);
ASSERT_FALSE(res2[1]);
ASSERT_TRUE(res2[2]);
res = index->MatchQuery("nothing");
ASSERT_EQ(res.size(), 0);
}
{
auto res = index->PhraseMatchQuery("football", 0);
ASSERT_EQ(res.size(), 3);
ASSERT_TRUE(res[0]);
ASSERT_FALSE(res[1]);
ASSERT_TRUE(res[2]);
auto res1 = index->PhraseMatchQuery("swimming football", 0);
ASSERT_EQ(res1.size(), 3);
ASSERT_FALSE(res1[0]);
ASSERT_FALSE(res1[1]);
ASSERT_TRUE(res1[2]);
auto res2 = index->PhraseMatchQuery("football swimming", 0);
ASSERT_EQ(res2.size(), 0);
auto res3 = index->PhraseMatchQuery("football swimming", 1);
ASSERT_EQ(res3.size(), 0);
auto res4 = index->PhraseMatchQuery("football swimming", 2);
ASSERT_EQ(res4.size(), 3);
ASSERT_FALSE(res4[0]);
ASSERT_FALSE(res4[1]);
ASSERT_TRUE(res4[2]);
}
}
TEST(TextMatch, GrowingNaive) {
auto schema = GenTestSchema();
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"football, basketball, pingpang",
"swimming, football"};
int64_t N = 2;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
seg->PreInsert(N);
seg->Insert(0,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "football", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotMatchExpr(schema, "football", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "swimming", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotMatchExpr(schema, "swimming", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
}
{
auto expr =
GetMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 =
GetNotMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
auto expr2 =
GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch);
final = ExecuteQueryExpr(expr2, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
auto expr3 =
GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch, 1);
final = ExecuteQueryExpr(expr3, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
}
TEST(TextMatch, GrowingNaiveNullable) {
auto schema = GenTestSchema({}, true);
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {
"football, basketball, pingpang", "swimming, football", ""};
std::vector<bool> raw_str_valid = {true, true, false};
int64_t N = 3;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
auto str_col_valid =
raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
for (int64_t i = 0; i < N; i++) {
str_col_valid->at(i) = raw_str_valid[i];
}
seg->PreInsert(N);
seg->Insert(0,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "football", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotMatchExpr(schema, "football", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "swimming", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotMatchExpr(schema, "swimming", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
{
auto expr =
GetMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 =
GetNotMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
auto expr2 =
GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch);
final = ExecuteQueryExpr(expr2, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
auto expr3 =
GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch, 1);
final = ExecuteQueryExpr(expr3, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
TEST(TextMatch, SealedNaive) {
auto schema = GenTestSchema();
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"football, basketball, pingpang",
"swimming, football"};
int64_t N = 2;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
SealedLoadFieldData(raw_data, *seg);
seg->CreateTextIndex(FieldId(101));
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "football", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotMatchExpr(schema, "football", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "swimming", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotMatchExpr(schema, "swimming", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
}
{
auto expr =
GetMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 =
GetNotMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
auto expr2 =
GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch);
final = ExecuteQueryExpr(expr2, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
auto expr3 =
GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch, 1);
final = ExecuteQueryExpr(expr3, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
}
TEST(TextMatch, SealedNaiveNullable) {
auto schema = GenTestSchema({}, true);
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {
"football, basketball, pingpang", "swimming, football", ""};
std::vector<bool> raw_str_valid = {true, true, false};
int64_t N = 3;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
auto str_col_valid =
raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
for (int64_t i = 0; i < N; i++) {
str_col_valid->at(i) = raw_str_valid[i];
}
SealedLoadFieldData(raw_data, *seg);
seg->CreateTextIndex(FieldId(101));
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "football", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotMatchExpr(schema, "football", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "swimming", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotMatchExpr(schema, "swimming", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
{
auto expr =
GetMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 =
GetNotMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
auto expr2 =
GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch);
final = ExecuteQueryExpr(expr2, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
auto expr3 =
GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch, 1);
final = ExecuteQueryExpr(expr3, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
TEST(TextMatch, GrowingJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"enable_analyzer", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
int64_t N = 2;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
seg->PreInsert(N);
seg->Insert(0,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "青铜", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
auto expr1 = GetNotMatchExpr(schema, "青铜", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "黄金", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotMatchExpr(schema, "黄金", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "时代", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotMatchExpr(schema, "时代", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
}
}
{
BitsetType final;
auto expr = GetMatchExpr(schema, "黄金时代", OpType::PhraseMatch);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
}
}
TEST(TextMatch, GrowingJieBaNullable) {
auto schema = GenTestSchema(
{
{"enable_match", "true"},
{"enable_tokenizer", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
},
true);
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代", ""};
std::vector<bool> raw_str_valid = {true, true, false};
int64_t N = 3;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
auto str_col_valid =
raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
for (int64_t i = 0; i < N; i++) {
str_col_valid->at(i) = raw_str_valid[i];
}
seg->PreInsert(N);
seg->Insert(0,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "青铜", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotMatchExpr(schema, "青铜", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "黄金", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotMatchExpr(schema, "黄金", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "时代", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotMatchExpr(schema, "时代", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
{
BitsetType final;
auto expr = GetMatchExpr(schema, "黄金时代", OpType::PhraseMatch);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
}
}
TEST(TextMatch, SealedJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"enable_analyzer", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
int64_t N = 2;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
SealedLoadFieldData(raw_data, *seg);
seg->CreateTextIndex(FieldId(101));
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "青铜", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
auto expr1 = GetNotMatchExpr(schema, "青铜", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "黄金", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotMatchExpr(schema, "黄金", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "时代", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotMatchExpr(schema, "时代", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
}
}
{
BitsetType final;
auto expr = GetMatchExpr(schema, "黄金时代", OpType::PhraseMatch);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
}
}
TEST(TextMatch, SealedJieBaNullable) {
auto schema = GenTestSchema(
{
{"enable_match", "true"},
{"enable_tokenizer", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
},
true);
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代", ""};
std::vector<bool> raw_str_valid = {true, true, false};
int64_t N = 3;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
auto str_col_valid =
raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
for (int64_t i = 0; i < N; i++) {
str_col_valid->at(i) = raw_str_valid[i];
}
SealedLoadFieldData(raw_data, *seg);
seg->CreateTextIndex(FieldId(101));
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "青铜", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotMatchExpr(schema, "青铜", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "黄金", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotMatchExpr(schema, "黄金", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
{
BitsetType final;
for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
auto expr = GetMatchExpr(schema, "时代", op);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotMatchExpr(schema, "时代", op);
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
{
BitsetType final;
auto expr = GetMatchExpr(schema, "黄金时代", OpType::PhraseMatch);
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
}
}
// Test that growing segment loading flushed binlogs will build text match index.
TEST(TextMatch, GrowingLoadData) {
int64_t N = 7;
auto schema = GenTestSchema({}, true);
schema->AddField(FieldName("RowID"), FieldId(0), DataType::INT64, false);
schema->AddField(
FieldName("Timestamp"), FieldId(1), DataType::INT64, false);
std::vector<std::string> raw_str = {"football, basketball, pingpang",
"swimming, football",
"golf",
"",
"baseball",
"kungfu, football",
""};
auto raw_data = DataGen(schema, N);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
auto str_col_valid =
raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
for (int64_t i = 0; i < N; i++) {
str_col_valid->at(i) = true;
}
// so we cannot match the second row
str_col_valid->at(1) = false;
auto storage_config = get_default_local_storage_config();
auto cm = storage::CreateChunkManager(storage_config);
auto load_info = PrepareInsertBinlog(
1,
2,
3,
storage_config.root_path + "/" + "test_growing_segment_load_data",
raw_data,
cm);
auto segment = CreateGrowingSegment(schema, empty_index_meta);
auto status = LoadFieldData(segment.get(), &load_info);
ASSERT_EQ(status.error_code, Success);
ASSERT_EQ(segment->get_real_count(), N);
ASSERT_NE(segment->get_field_avg_size(FieldId(101)), 0);
// Check whether the text index has been built.
auto expr = GetMatchExpr(schema, "football", OpType::TextMatch);
BitsetType final;
final = ExecuteQueryExpr(expr, segment.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
ASSERT_FALSE(final[3]);
ASSERT_FALSE(final[4]);
ASSERT_TRUE(final[5]);
ASSERT_FALSE(final[6]);
}