mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-01 08:28:10 +08:00
### Is there an existing issue for this? - [x] I have searched the existing issues --- Please see: https://github.com/milvus-io/milvus/issues/44593 for the background This PR makes https://github.com/milvus-io/milvus/pull/44638 redundant, which can be closed. The PR comments for the original implementation suggested an alternative and a better approach, this new PR has that implementation. --- This PR - Adds an optional `minimum_should_match` argument to `text_match(...)` and wires it through the parser, planner/visitor, index bindings, and client-level tests/examples so full-text queries can require a minimum number of tokens to match. Motivation - Provide a way to require an expression to match a minimum number of tokens in lexical search. What changed - Parser / grammar - Added grammar rule and token: `MINIMUM_SHOULD_MATCH` and `textMatchOption` in `internal/parser/planparserv2/Plan.g4`. - Regenerated parser outputs: `internal/parser/planparserv2/generated/*` (parser, lexer, visitor, etc.) to support the new rule. - Planner / visitor - `parser_visitor.go`: parse and validate the `minimum_should_match` integer; propagate as an extra value on the `TextMatch` expression so downstream components receive it. - Added `VisitTextMatchOption` visitor method handling. - Client (Golang) - Added a unit test to verify `text_match(..., minimum_should_match=...)` appears in the generated DSL and is accepted by client code: `client/milvusclient/read_test.go` (new test coverage). - Added an integration-style test for the feature to the go-client testcase suite: `tests/go_client/testcases/full_text_search_test.go` (exercise min=1, min=3, large min). - Added an example demonstrating `text_match` usage: `client/milvusclient/read_example_test.go` (example name conforms to godoc mapping). - Engine / index - Updated C++ index interface: `TextMatchIndex::MatchQuery` - Added/updated unit tests for the index behavior: `internal/core/src/index/TextMatchIndexTest.cpp`. 
- Tantivy binding - Added `match_query_with_minimum` implementation and unit tests to `internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs` that construct boolean queries with minimum required clauses. Behavioral / compatibility notes - This adds an optional argument to `text_match` only; default behavior (no `minimum_should_match`) is unchanged. - Internal API change: `TextMatchIndex::MatchQuery` signature changed (internal component). Callers in the repo were updated accordingly. - Parser changes required regenerating ANTLR outputs Tests and verification - New/updated tests: - Go client unit test: `client/milvusclient/read_test.go` (mocked Search request asserts DSL contains `minimum_should_match=2`). - Go e2e-style test: `tests/go_client/testcases/full_text_search_test.go` (exercises min=1, 3 and a large min). - C++ unit tests for index behavior: `internal/core/src/index/TextMatchIndexTest.cpp`. - Rust binding unit tests for `match_query_with_minimum`. - Local verification commands to run: - Go client tests: `cd client && go test ./milvusclient -run ^$` (client package) - Go testcases: `cd tests/go_client && go test ./testcases -run TestTextMatchMinimumShouldMatch` (requires a running Milvus instance) - C++ unit tests / build: run core build/test per repo instructions (the change touches core index code). - Rust binding tests: `cd internal/core/thirdparty/tantivy/tantivy-binding && cargo test` (if developing locally). --------- Signed-off-by: Amit Kumar <amit.kumar@reddit.com> Co-authored-by: Amit Kumar <amit.kumar@reddit.com>
1128 lines
39 KiB
C++
1128 lines
39 KiB
C++
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
// or implied. See the License for the specific language governing permissions and limitations under the License
|
|
|
|
#include <gtest/gtest.h>
#include <memory>
#include <optional>
#include <string>

#include "common/Schema.h"
#include "segcore/SegmentGrowing.h"
#include "segcore/SegmentGrowingImpl.h"
#include "test_utils/DataGen.h"
#include "test_utils/GenExprProto.h"
#include "query/PlanProto.h"
#include "query/ExecPlanNodeVisitor.h"
#include "expr/ITypeExpr.h"
#include "segcore/segment_c.h"
#include "test_utils/storage_test_utils.h"
#include "exec/expression/ExprCache.h"
|
|
|
|
using namespace milvus;
|
|
using namespace milvus::query;
|
|
using namespace milvus::segcore;
|
|
|
|
namespace {
|
|
SchemaPtr
|
|
GenTestSchema(std::map<std::string, std::string> params = {},
|
|
bool nullable = false) {
|
|
auto schema = std::make_shared<Schema>();
|
|
{
|
|
FieldMeta f(FieldName("pk"),
|
|
FieldId(100),
|
|
DataType::INT64,
|
|
false,
|
|
std::nullopt);
|
|
schema->AddField(std::move(f));
|
|
schema->set_primary_field_id(FieldId(100));
|
|
}
|
|
{
|
|
FieldMeta f(FieldName("str"),
|
|
FieldId(101),
|
|
DataType::VARCHAR,
|
|
65536,
|
|
nullable,
|
|
true,
|
|
true,
|
|
params,
|
|
std::nullopt);
|
|
schema->AddField(std::move(f));
|
|
}
|
|
{
|
|
FieldMeta f(FieldName("fvec"),
|
|
FieldId(102),
|
|
DataType::VECTOR_FLOAT,
|
|
16,
|
|
knowhere::metric::L2,
|
|
false,
|
|
std::nullopt);
|
|
schema->AddField(std::move(f));
|
|
}
|
|
return schema;
|
|
}
|
|
std::shared_ptr<milvus::plan::FilterBitsNode>
|
|
GetMatchExpr(SchemaPtr schema,
|
|
const std::string& query,
|
|
proto::plan::OpType op,
|
|
int64_t slop = 0) {
|
|
const auto& str_meta = schema->operator[](FieldName("str"));
|
|
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
|
|
proto::schema::DataType::VarChar,
|
|
false,
|
|
false);
|
|
|
|
auto unary_range_expr = test::GenUnaryRangeExpr(op, query);
|
|
unary_range_expr->set_allocated_column_info(column_info);
|
|
auto generic_for_slop = milvus::test::GenGenericValue(slop);
|
|
unary_range_expr->add_extra_values()->CopyFrom(*generic_for_slop);
|
|
delete generic_for_slop;
|
|
auto expr = test::GenExpr();
|
|
expr->set_allocated_unary_range_expr(unary_range_expr);
|
|
|
|
auto parser = ProtoParser(schema);
|
|
auto typed_expr = parser.ParseExprs(*expr);
|
|
auto parsed =
|
|
std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);
|
|
return parsed;
|
|
};
|
|
|
|
std::shared_ptr<milvus::plan::FilterBitsNode>
|
|
GetNotMatchExpr(SchemaPtr schema,
|
|
const std::string& query,
|
|
proto::plan::OpType op,
|
|
int64_t slop = 0) {
|
|
const auto& str_meta = schema->operator[](FieldName("str"));
|
|
proto::plan::GenericValue val;
|
|
val.set_string_val(query);
|
|
std::vector<proto::plan::GenericValue> extra_values;
|
|
auto generic_for_slop = milvus::test::GenGenericValue(slop);
|
|
extra_values.push_back(*generic_for_slop);
|
|
delete generic_for_slop;
|
|
auto child_expr = std::make_shared<expr::UnaryRangeFilterExpr>(
|
|
milvus::expr::ColumnInfo(str_meta.get_id(), DataType::VARCHAR),
|
|
op,
|
|
val,
|
|
extra_values);
|
|
auto expr = std::make_shared<expr::LogicalUnaryExpr>(
|
|
expr::LogicalUnaryExpr::OpType::LogicalNot, child_expr);
|
|
auto parsed =
|
|
std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, expr);
|
|
return parsed;
|
|
};
|
|
} // namespace
|
|
|
|
// Smoke test for the nlohmann::json usage the analyzer-params code relies on:
// a flat string->string object converts to std::map, and a heterogeneous
// object can be walked through items(). Values are asserted as well as
// printed so the test can fail on a wrong parse, not only on an exception.
TEST(ParseJson, Naive) {
    {
        std::string s(R"({"tokenizer": "jieba"})");
        nlohmann::json j = nlohmann::json::parse(s);
        auto m = j.get<std::map<std::string, std::string>>();
        ASSERT_EQ(m.size(), 1u);
        ASSERT_EQ(m.at("tokenizer"), "jieba");
        for (const auto& [k, v] : m) {
            std::cout << k << ": " << v << std::endl;
        }
    }

    {
        std::string s(
            R"({"analyzer":"stop","stop_words":["an","the"],"case_insensitive":false})");
        nlohmann::json j = nlohmann::json::parse(s);
        ASSERT_TRUE(j.is_object());
        ASSERT_EQ(j.size(), 3u);
        ASSERT_EQ(j.at("analyzer").get<std::string>(), "stop");
        ASSERT_EQ(j.at("stop_words").get<std::vector<std::string>>(),
                  (std::vector<std::string>{"an", "the"}));
        ASSERT_FALSE(j.at("case_insensitive").get<bool>());
        for (const auto& [key, value] : j.items()) {
            std::cout << key << ": " << value.dump() << std::endl;
        }
    }
}
|
|
|
|
// Type params without an "analyzer_params" entry must parse to "{}".
TEST(ParseTokenizerParams, NoTokenizerParams) {
    TypeParams type_params{{"k", "v"}};
    auto parsed = ParseTokenizerParams(type_params);
    ASSERT_EQ("{}", std::string(parsed));
}
|
|
|
|
// When "analyzer_params" is present, its value must be returned as-is.
TEST(ParseTokenizerParams, Default) {
    TypeParams type_params{
        {"analyzer_params", R"({"tokenizer": "standard"})"}};
    auto parsed = ParseTokenizerParams(type_params);
    ASSERT_EQ(type_params.at("analyzer_params"), parsed);
}
|
|
|
|
// Exercises TextMatchIndex directly (no segment): three rows are added, the
// middle one flagged invalid, then MatchQuery / PhraseMatchQuery / IsNull /
// IsNotNull results are checked row by row.
TEST(TextMatch, Index) {
    using TextIndex = index::TextMatchIndex;
    auto text_index =
        std::make_unique<TextIndex>(std::numeric_limits<int64_t>::max(),
                                    "unique_id",
                                    "milvus_tokenizer",
                                    "{}");
    text_index->CreateReader(milvus::index::SetBitsetSealed);
    // Offsets 0 and 2 carry text; offset 1 is added with `false` as its second
    // argument and is expected to show up as null below.
    text_index->AddTextSealed("football, basketball, pingpang", true, 0);
    text_index->AddTextSealed("", false, 1);
    text_index->AddTextSealed("swimming, football", true, 2);
    text_index->Commit();
    text_index->Reload();

    {
        // Single token present in rows 0 and 2.
        auto football_hits = text_index->MatchQuery("football", 1);
        ASSERT_EQ(football_hits.size(), 3);
        ASSERT_TRUE(football_hits[0]);
        ASSERT_FALSE(football_hits[1]);
        ASSERT_TRUE(football_hits[2]);

        // Only the row added as invalid is null.
        auto null_rows = text_index->IsNull();
        ASSERT_FALSE(null_rows[0]);
        ASSERT_TRUE(null_rows[1]);
        ASSERT_FALSE(null_rows[2]);

        auto non_null_rows = text_index->IsNotNull();
        ASSERT_TRUE(non_null_rows[0]);
        ASSERT_FALSE(non_null_rows[1]);
        ASSERT_TRUE(non_null_rows[2]);

        // A token that appears nowhere matches nothing.
        auto no_hits = text_index->MatchQuery("nothing", 1);
        ASSERT_EQ(no_hits.size(), 3);
        ASSERT_FALSE(no_hits[0]);
        ASSERT_FALSE(no_hits[1]);
        ASSERT_FALSE(no_hits[2]);

        // With 2 as the second argument (presumably minimum_should_match --
        // TODO confirm), only row 0 holds two of the three query tokens.
        auto two_token_hits =
            text_index->MatchQuery("football pingpang cricket", 2);
        ASSERT_EQ(two_token_hits.size(), 3);
        ASSERT_TRUE(two_token_hits[0]);
        ASSERT_FALSE(two_token_hits[1]);
        ASSERT_FALSE(two_token_hits[2]);
    }

    {
        // Single-token phrase: same rows as the plain match.
        auto single = text_index->PhraseMatchQuery("football", 0);
        ASSERT_EQ(single.size(), 3);
        ASSERT_TRUE(single[0]);
        ASSERT_FALSE(single[1]);
        ASSERT_TRUE(single[2]);

        // Tokens in stored order: only row 2 matches with slop 0.
        auto in_order = text_index->PhraseMatchQuery("swimming football", 0);
        ASSERT_EQ(in_order.size(), 3);
        ASSERT_FALSE(in_order[0]);
        ASSERT_FALSE(in_order[1]);
        ASSERT_TRUE(in_order[2]);

        // Reversed tokens: no match at slop 0 or 1 ...
        auto reversed_slop0 =
            text_index->PhraseMatchQuery("football swimming", 0);
        ASSERT_EQ(reversed_slop0.size(), 3);
        ASSERT_FALSE(reversed_slop0[0]);
        ASSERT_FALSE(reversed_slop0[1]);
        ASSERT_FALSE(reversed_slop0[2]);

        auto reversed_slop1 =
            text_index->PhraseMatchQuery("football swimming", 1);
        ASSERT_EQ(reversed_slop1.size(), 3);
        ASSERT_FALSE(reversed_slop1[0]);
        ASSERT_FALSE(reversed_slop1[1]);
        ASSERT_FALSE(reversed_slop1[2]);

        // ... but row 2 matches once the slop reaches 2.
        auto reversed_slop2 =
            text_index->PhraseMatchQuery("football swimming", 2);
        ASSERT_EQ(reversed_slop2.size(), 3);
        ASSERT_FALSE(reversed_slop2[0]);
        ASSERT_FALSE(reversed_slop2[1]);
        ASSERT_TRUE(reversed_slop2[2]);
    }
}
|
|
|
|
// Growing segment, default analyzer, non-nullable "str" field: checks
// text_match / phrase_match and their negations on two inserted rows.
TEST(TextMatch, GrowingNaive) {
    auto schema = GenTestSchema();
    auto seg = CreateGrowingSegment(schema, empty_index_meta);
    const std::vector<std::string> docs = {"football, basketball, pingpang",
                                           "swimming, football"};

    int64_t num_rows = 2;
    uint64_t seed = 19190504;
    auto raw_data = DataGen(schema, num_rows, seed);
    // Overwrite the generated strings of the field at index 1 with the
    // fixed documents above.
    auto& str_field = raw_data.raw_->mutable_fields_data()->at(1);
    auto* texts =
        str_field.mutable_scalars()->mutable_string_data()->mutable_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        texts->at(row) = docs[row];
    }

    seg->PreInsert(num_rows);
    seg->Insert(0,
                num_rows,
                raw_data.row_ids_.data(),
                raw_data.timestamps_.data(),
                raw_data.raw_);

    // NOTE(review): presumably waits for the growing segment's text index to
    // pick up the inserted rows -- confirm.
    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);

    // "football" is in both rows.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "football", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_TRUE(bits[1]);

            auto not_match_plan = GetNotMatchExpr(schema, "football", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_FALSE(bits[1]);
        }
    }

    // "swimming" is only in row 1.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "swimming", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);

            auto not_match_plan = GetNotMatchExpr(schema, "swimming", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);
        }
    }

    // Multi-token queries.
    {
        // text_match: each row holds one of the two tokens.
        auto any_token_plan =
            GetMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
        BitsetType bits;
        bits = ExecuteQueryExpr(
            any_token_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_TRUE(bits[0]);
        ASSERT_TRUE(bits[1]);

        auto not_any_token_plan =
            GetNotMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
        bits = ExecuteQueryExpr(
            not_any_token_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_FALSE(bits[1]);

        // phrase_match with slop 0: the tokens are not adjacent in row 0.
        auto phrase_slop0_plan =
            GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch);
        bits = ExecuteQueryExpr(
            phrase_slop0_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_FALSE(bits[1]);

        // phrase_match with slop 1: row 0 now matches.
        auto phrase_slop1_plan =
            GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch, 1);
        bits = ExecuteQueryExpr(
            phrase_slop1_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_TRUE(bits[0]);
        ASSERT_FALSE(bits[1]);
    }
}
|
|
|
|
// Growing segment, default analyzer, nullable "str" field: the third row is
// marked invalid and must be false for both match and NOT-match.
TEST(TextMatch, GrowingNaiveNullable) {
    auto schema = GenTestSchema({}, true);
    auto seg = CreateGrowingSegment(schema, empty_index_meta);
    const std::vector<std::string> docs = {
        "football, basketball, pingpang", "swimming, football", ""};
    const std::vector<bool> doc_valid = {true, true, false};

    int64_t num_rows = 3;
    uint64_t seed = 19190504;
    auto raw_data = DataGen(schema, num_rows, seed);
    // Overwrite the generated strings and validity flags of the field at
    // index 1 with the fixtures above.
    auto& str_field = raw_data.raw_->mutable_fields_data()->at(1);
    auto* texts =
        str_field.mutable_scalars()->mutable_string_data()->mutable_data();
    auto* validity = str_field.mutable_valid_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        texts->at(row) = docs[row];
    }
    for (int64_t row = 0; row < num_rows; ++row) {
        validity->at(row) = doc_valid[row];
    }

    seg->PreInsert(num_rows);
    seg->Insert(0,
                num_rows,
                raw_data.row_ids_.data(),
                raw_data.timestamps_.data(),
                raw_data.raw_);

    // NOTE(review): presumably waits for the growing segment's text index to
    // pick up the inserted rows -- confirm.
    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);

    // "football" is in rows 0 and 1; row 2 is null.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "football", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_TRUE(bits[1]);
            ASSERT_FALSE(bits[2]);

            auto not_match_plan = GetNotMatchExpr(schema, "football", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_FALSE(bits[1]);
            ASSERT_FALSE(bits[2]);
        }
    }

    // "swimming" is only in row 1.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "swimming", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);
            ASSERT_FALSE(bits[2]);

            auto not_match_plan = GetNotMatchExpr(schema, "swimming", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);
            ASSERT_FALSE(bits[2]);
        }
    }

    // Multi-token queries.
    {
        // text_match: rows 0 and 1 each hold one of the two tokens.
        auto any_token_plan =
            GetMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
        BitsetType bits;
        bits = ExecuteQueryExpr(
            any_token_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_TRUE(bits[0]);
        ASSERT_TRUE(bits[1]);
        ASSERT_FALSE(bits[2]);

        auto not_any_token_plan =
            GetNotMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
        bits = ExecuteQueryExpr(
            not_any_token_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_FALSE(bits[1]);
        ASSERT_FALSE(bits[2]);

        // phrase_match with slop 0: no row matches.
        auto phrase_slop0_plan =
            GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch);
        bits = ExecuteQueryExpr(
            phrase_slop0_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_FALSE(bits[1]);
        ASSERT_FALSE(bits[2]);

        // phrase_match with slop 1: row 0 matches.
        auto phrase_slop1_plan =
            GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch, 1);
        bits = ExecuteQueryExpr(
            phrase_slop1_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_TRUE(bits[0]);
        ASSERT_FALSE(bits[1]);
        ASSERT_FALSE(bits[2]);
    }
}
|
|
|
|
// Sealed segment, default analyzer, non-nullable "str" field: the text index
// is built explicitly with CreateTextIndex before querying.
TEST(TextMatch, SealedNaive) {
    auto schema = GenTestSchema();
    const std::vector<std::string> docs = {"football, basketball, pingpang",
                                           "swimming, football"};

    int64_t num_rows = 2;
    uint64_t seed = 19190504;
    auto raw_data = DataGen(schema, num_rows, seed);
    // Overwrite the generated strings of the field at index 1 with the
    // fixed documents above.
    auto& str_field = raw_data.raw_->mutable_fields_data()->at(1);
    auto* texts =
        str_field.mutable_scalars()->mutable_string_data()->mutable_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        texts->at(row) = docs[row];
    }

    auto seg = CreateSealedWithFieldDataLoaded(schema, raw_data);
    seg->CreateTextIndex(FieldId(101));

    // "football" is in both rows.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "football", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_TRUE(bits[1]);

            auto not_match_plan = GetNotMatchExpr(schema, "football", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_FALSE(bits[1]);
        }
    }

    // "swimming" is only in row 1.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "swimming", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);

            auto not_match_plan = GetNotMatchExpr(schema, "swimming", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);
        }
    }

    // Multi-token queries.
    {
        // text_match: each row holds one of the two tokens.
        auto any_token_plan =
            GetMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
        BitsetType bits;
        bits = ExecuteQueryExpr(
            any_token_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_TRUE(bits[0]);
        ASSERT_TRUE(bits[1]);

        auto not_any_token_plan =
            GetNotMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
        bits = ExecuteQueryExpr(
            not_any_token_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_FALSE(bits[1]);

        // phrase_match with slop 0: no row matches.
        auto phrase_slop0_plan =
            GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch);
        bits = ExecuteQueryExpr(
            phrase_slop0_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_FALSE(bits[1]);

        // phrase_match with slop 1: row 0 matches.
        auto phrase_slop1_plan =
            GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch, 1);
        bits = ExecuteQueryExpr(
            phrase_slop1_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_TRUE(bits[0]);
        ASSERT_FALSE(bits[1]);
    }
}
|
|
|
|
// Sealed segment, default analyzer, nullable "str" field: the third row is
// marked invalid and must be false for both match and NOT-match.
TEST(TextMatch, SealedNaiveNullable) {
    auto schema = GenTestSchema({}, true);
    const std::vector<std::string> docs = {
        "football, basketball, pingpang", "swimming, football", ""};
    const std::vector<bool> doc_valid = {true, true, false};

    int64_t num_rows = 3;
    uint64_t seed = 19190504;
    auto raw_data = DataGen(schema, num_rows, seed);
    // Overwrite the generated strings and validity flags of the field at
    // index 1 with the fixtures above.
    auto& str_field = raw_data.raw_->mutable_fields_data()->at(1);
    auto* texts =
        str_field.mutable_scalars()->mutable_string_data()->mutable_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        texts->at(row) = docs[row];
    }
    auto* validity = str_field.mutable_valid_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        validity->at(row) = doc_valid[row];
    }

    auto seg = CreateSealedWithFieldDataLoaded(schema, raw_data);
    seg->CreateTextIndex(FieldId(101));

    // "football" is in rows 0 and 1; row 2 is null.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "football", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_TRUE(bits[1]);
            ASSERT_FALSE(bits[2]);

            auto not_match_plan = GetNotMatchExpr(schema, "football", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_FALSE(bits[1]);
            ASSERT_FALSE(bits[2]);
        }
    }

    // "swimming" is only in row 1.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "swimming", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);
            ASSERT_FALSE(bits[2]);

            auto not_match_plan = GetNotMatchExpr(schema, "swimming", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);
            ASSERT_FALSE(bits[2]);
        }
    }

    // Multi-token queries.
    {
        // text_match: rows 0 and 1 each hold one of the two tokens.
        auto any_token_plan =
            GetMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
        BitsetType bits;
        bits = ExecuteQueryExpr(
            any_token_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_TRUE(bits[0]);
        ASSERT_TRUE(bits[1]);
        ASSERT_FALSE(bits[2]);

        auto not_any_token_plan =
            GetNotMatchExpr(schema, "basketball, swimming", OpType::TextMatch);
        bits = ExecuteQueryExpr(
            not_any_token_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_FALSE(bits[1]);
        ASSERT_FALSE(bits[2]);

        // phrase_match with slop 0: no row matches.
        auto phrase_slop0_plan =
            GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch);
        bits = ExecuteQueryExpr(
            phrase_slop0_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_FALSE(bits[1]);
        ASSERT_FALSE(bits[2]);

        // phrase_match with slop 1: row 0 matches.
        auto phrase_slop1_plan =
            GetMatchExpr(schema, "football, pingpang", OpType::PhraseMatch, 1);
        bits = ExecuteQueryExpr(
            phrase_slop1_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_TRUE(bits[0]);
        ASSERT_FALSE(bits[1]);
        ASSERT_FALSE(bits[2]);
    }
}
|
|
|
|
// Growing segment with the jieba tokenizer, non-nullable "str" field:
// queries on sub-words of the two stored Chinese strings.
TEST(TextMatch, GrowingJieBa) {
    auto schema = GenTestSchema({
        {"enable_match", "true"},
        {"enable_analyzer", "true"},
        {"analyzer_params", R"({"tokenizer": "jieba"})"},
    });
    auto seg = CreateGrowingSegment(schema, empty_index_meta);
    const std::vector<std::string> docs = {"青铜时代", "黄金时代"};

    int64_t num_rows = 2;
    uint64_t seed = 19190504;
    auto raw_data = DataGen(schema, num_rows, seed);
    // Overwrite the generated strings of the field at index 1 with the
    // fixed documents above.
    auto& str_field = raw_data.raw_->mutable_fields_data()->at(1);
    auto* texts =
        str_field.mutable_scalars()->mutable_string_data()->mutable_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        texts->at(row) = docs[row];
    }

    seg->PreInsert(num_rows);
    seg->Insert(0,
                num_rows,
                raw_data.row_ids_.data(),
                raw_data.timestamps_.data(),
                raw_data.raw_);

    // NOTE(review): presumably waits for the growing segment's text index to
    // pick up the inserted rows -- confirm.
    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);

    // "青铜" is expected only in row 0.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "青铜", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);

            auto not_match_plan = GetNotMatchExpr(schema, "青铜", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);
        }
    }

    // "黄金" is expected only in row 1.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "黄金", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);

            auto not_match_plan = GetNotMatchExpr(schema, "黄金", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);
        }
    }

    // "时代" is expected in both rows.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "时代", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_TRUE(bits[1]);

            auto not_match_plan = GetNotMatchExpr(schema, "时代", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_FALSE(bits[1]);
        }
    }

    // The full phrase "黄金时代" matches only row 1.
    {
        BitsetType bits;
        auto phrase_plan = GetMatchExpr(schema, "黄金时代", OpType::PhraseMatch);
        bits =
            ExecuteQueryExpr(phrase_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_TRUE(bits[1]);
    }
}
|
|
|
|
// Growing segment with the jieba tokenizer, nullable "str" field: the third
// row is marked invalid and must be false for both match and NOT-match.
TEST(TextMatch, GrowingJieBaNullable) {
    // The type-param key is spelled "enable_analyzer", consistent with
    // TEST(TextMatch, GrowingJieBa) and TEST(TextMatch, SealedJieBa); this
    // test previously passed "enable_tokenizer".
    auto schema = GenTestSchema(
        {
            {"enable_match", "true"},
            {"enable_analyzer", "true"},
            {"analyzer_params", R"({"tokenizer": "jieba"})"},
        },
        true);
    auto seg = CreateGrowingSegment(schema, empty_index_meta);
    const std::vector<std::string> docs = {"青铜时代", "黄金时代", ""};
    const std::vector<bool> doc_valid = {true, true, false};

    int64_t num_rows = 3;
    uint64_t seed = 19190504;
    auto raw_data = DataGen(schema, num_rows, seed);
    // Overwrite the generated strings and validity flags of the field at
    // index 1 with the fixtures above.
    auto& str_field = raw_data.raw_->mutable_fields_data()->at(1);
    auto* texts =
        str_field.mutable_scalars()->mutable_string_data()->mutable_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        texts->at(row) = docs[row];
    }
    auto* validity = str_field.mutable_valid_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        validity->at(row) = doc_valid[row];
    }

    seg->PreInsert(num_rows);
    seg->Insert(0,
                num_rows,
                raw_data.row_ids_.data(),
                raw_data.timestamps_.data(),
                raw_data.raw_);

    // NOTE(review): presumably waits for the growing segment's text index to
    // pick up the inserted rows -- confirm.
    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);

    // "青铜" is expected only in row 0; row 2 is null.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "青铜", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);
            ASSERT_FALSE(bits[2]);

            auto not_match_plan = GetNotMatchExpr(schema, "青铜", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);
            ASSERT_FALSE(bits[2]);
        }
    }

    // "黄金" is expected only in row 1.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "黄金", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);
            ASSERT_FALSE(bits[2]);

            auto not_match_plan = GetNotMatchExpr(schema, "黄金", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);
            ASSERT_FALSE(bits[2]);
        }
    }

    // "时代" is expected in rows 0 and 1.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "时代", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_TRUE(bits[1]);
            ASSERT_FALSE(bits[2]);

            auto not_match_plan = GetNotMatchExpr(schema, "时代", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_FALSE(bits[1]);
            ASSERT_FALSE(bits[2]);
        }
    }

    // The full phrase "黄金时代" matches only row 1.
    {
        BitsetType bits;
        auto phrase_plan = GetMatchExpr(schema, "黄金时代", OpType::PhraseMatch);
        bits =
            ExecuteQueryExpr(phrase_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_TRUE(bits[1]);
        ASSERT_FALSE(bits[2]);
    }
}
|
|
|
|
// Sealed segment with the jieba tokenizer, non-nullable "str" field:
// queries on sub-words of the two stored Chinese strings.
TEST(TextMatch, SealedJieBa) {
    auto schema = GenTestSchema({
        {"enable_match", "true"},
        {"enable_analyzer", "true"},
        {"analyzer_params", R"({"tokenizer": "jieba"})"},
    });
    const std::vector<std::string> docs = {"青铜时代", "黄金时代"};

    int64_t num_rows = 2;
    uint64_t seed = 19190504;
    auto raw_data = DataGen(schema, num_rows, seed);
    // Overwrite the generated strings of the field at index 1 with the
    // fixed documents above.
    auto& str_field = raw_data.raw_->mutable_fields_data()->at(1);
    auto* texts =
        str_field.mutable_scalars()->mutable_string_data()->mutable_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        texts->at(row) = docs[row];
    }

    auto seg = CreateSealedWithFieldDataLoaded(schema, raw_data);
    seg->CreateTextIndex(FieldId(101));

    // "青铜" is expected only in row 0.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "青铜", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);

            auto not_match_plan = GetNotMatchExpr(schema, "青铜", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);
        }
    }

    // "黄金" is expected only in row 1.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "黄金", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);

            auto not_match_plan = GetNotMatchExpr(schema, "黄金", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);
        }
    }

    // "时代" is expected in both rows.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "时代", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_TRUE(bits[1]);

            auto not_match_plan = GetNotMatchExpr(schema, "时代", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_FALSE(bits[1]);
        }
    }

    // The full phrase "黄金时代" matches only row 1.
    {
        BitsetType bits;
        auto phrase_plan = GetMatchExpr(schema, "黄金时代", OpType::PhraseMatch);
        bits =
            ExecuteQueryExpr(phrase_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_TRUE(bits[1]);
    }
}
|
|
|
|
// Sealed segment with the jieba tokenizer, nullable "str" field: the third
// row is marked invalid and must be false for both match and NOT-match.
TEST(TextMatch, SealedJieBaNullable) {
    // The type-param key is spelled "enable_analyzer", consistent with
    // TEST(TextMatch, GrowingJieBa) and TEST(TextMatch, SealedJieBa); this
    // test previously passed "enable_tokenizer".
    auto schema = GenTestSchema(
        {
            {"enable_match", "true"},
            {"enable_analyzer", "true"},
            {"analyzer_params", R"({"tokenizer": "jieba"})"},
        },
        true);
    const std::vector<std::string> docs = {"青铜时代", "黄金时代", ""};
    const std::vector<bool> doc_valid = {true, true, false};

    int64_t num_rows = 3;
    uint64_t seed = 19190504;
    auto raw_data = DataGen(schema, num_rows, seed);
    // Overwrite the generated strings and validity flags of the field at
    // index 1 with the fixtures above.
    auto& str_field = raw_data.raw_->mutable_fields_data()->at(1);
    auto* texts =
        str_field.mutable_scalars()->mutable_string_data()->mutable_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        texts->at(row) = docs[row];
    }
    auto* validity = str_field.mutable_valid_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        validity->at(row) = doc_valid[row];
    }

    auto seg = CreateSealedWithFieldDataLoaded(schema, raw_data);
    seg->CreateTextIndex(FieldId(101));

    // "青铜" is expected only in row 0; row 2 is null.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "青铜", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);
            ASSERT_FALSE(bits[2]);

            auto not_match_plan = GetNotMatchExpr(schema, "青铜", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);
            ASSERT_FALSE(bits[2]);
        }
    }

    // "黄金" is expected only in row 1.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "黄金", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_TRUE(bits[1]);
            ASSERT_FALSE(bits[2]);

            auto not_match_plan = GetNotMatchExpr(schema, "黄金", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_FALSE(bits[1]);
            ASSERT_FALSE(bits[2]);
        }
    }

    // "时代" is expected in rows 0 and 1.
    {
        BitsetType bits;
        for (auto op : {OpType::TextMatch, OpType::PhraseMatch}) {
            auto match_plan = GetMatchExpr(schema, "时代", op);
            bits =
                ExecuteQueryExpr(match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_TRUE(bits[0]);
            ASSERT_TRUE(bits[1]);
            ASSERT_FALSE(bits[2]);

            auto not_match_plan = GetNotMatchExpr(schema, "时代", op);
            bits = ExecuteQueryExpr(
                not_match_plan, seg.get(), num_rows, MAX_TIMESTAMP);
            ASSERT_EQ(bits.size(), num_rows);
            ASSERT_FALSE(bits[0]);
            ASSERT_FALSE(bits[1]);
            ASSERT_FALSE(bits[2]);
        }
    }

    // The full phrase "黄金时代" matches only row 1.
    {
        BitsetType bits;
        auto phrase_plan = GetMatchExpr(schema, "黄金时代", OpType::PhraseMatch);
        bits =
            ExecuteQueryExpr(phrase_plan, seg.get(), num_rows, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), num_rows);
        ASSERT_FALSE(bits[0]);
        ASSERT_TRUE(bits[1]);
        ASSERT_FALSE(bits[2]);
    }
}
|
|
|
|
// Test that growing segment loading flushed binlogs will build text match index.
|
|
TEST(TextMatch, GrowingLoadData) {
|
|
int64_t N = 7;
|
|
auto schema = GenTestSchema({}, true);
|
|
schema->AddField(
|
|
FieldName("RowID"), FieldId(0), DataType::INT64, false, std::nullopt);
|
|
schema->AddField(FieldName("Timestamp"),
|
|
FieldId(1),
|
|
DataType::INT64,
|
|
false,
|
|
std::nullopt);
|
|
std::vector<std::string> raw_str = {"football, basketball, pingpang",
|
|
"swimming, football",
|
|
"golf",
|
|
"",
|
|
"baseball",
|
|
"kungfu, football",
|
|
""};
|
|
auto raw_data = DataGen(schema, N);
|
|
auto str_col = raw_data.raw_->mutable_fields_data()
|
|
->at(1)
|
|
.mutable_scalars()
|
|
->mutable_string_data()
|
|
->mutable_data();
|
|
for (int64_t i = 0; i < N; i++) {
|
|
str_col->at(i) = raw_str[i];
|
|
}
|
|
auto str_col_valid =
|
|
raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
|
|
for (int64_t i = 0; i < N; i++) {
|
|
str_col_valid->at(i) = true;
|
|
}
|
|
// so we cannot match the second row
|
|
str_col_valid->at(1) = false;
|
|
|
|
auto storage_config = get_default_local_storage_config();
|
|
auto cm = storage::CreateChunkManager(storage_config);
|
|
auto load_info = PrepareInsertBinlog(1, 2, 3, raw_data, cm);
|
|
|
|
auto segment = CreateGrowingSegment(schema, empty_index_meta);
|
|
auto status = LoadFieldData(segment.get(), &load_info);
|
|
ASSERT_EQ(status.error_code, Success);
|
|
ASSERT_EQ(segment->get_real_count(), N);
|
|
ASSERT_NE(segment->get_field_avg_size(FieldId(101)), 0);
|
|
|
|
// Check whether the text index has been built.
|
|
auto expr = GetMatchExpr(schema, "football", OpType::TextMatch);
|
|
BitsetType final;
|
|
final = ExecuteQueryExpr(expr, segment.get(), N, MAX_TIMESTAMP);
|
|
ASSERT_EQ(final.size(), N);
|
|
ASSERT_TRUE(final[0]);
|
|
ASSERT_FALSE(final[1]);
|
|
ASSERT_FALSE(final[2]);
|
|
ASSERT_FALSE(final[3]);
|
|
ASSERT_FALSE(final[4]);
|
|
ASSERT_TRUE(final[5]);
|
|
ASSERT_FALSE(final[6]);
|
|
}
|
|
|
|
// Runs a writer thread that inserts N rows into a growing segment while a
// reader thread repeatedly issues a text-match query until the inserted data
// becomes visible. Rows [0, N-1) are empty and flagged invalid; only the last
// row contains "football", so it is the only row allowed to match.
TEST(TextMatch, ConcurrentReadWriteWithNull) {
    auto schema = GenTestSchema({}, true);
    auto seg = CreateGrowingSegment(schema, empty_index_meta);
    int64_t N = 1000;
    uint64_t seed = 19190504;
    auto raw_data = DataGen(schema, N, seed);
    auto str_col_valid =
        raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
    auto str_col = raw_data.raw_->mutable_fields_data()
                       ->at(1)
                       .mutable_scalars()
                       ->mutable_string_data()
                       ->mutable_data();
    // The validity flag of row N-1 is left as generated by DataGen
    // (presumably valid, since the reader expects that row to match).
    for (int64_t i = 0; i < N - 1; i++) {
        str_col->at(i) = "";
        str_col_valid->at(i) = false;
    }
    str_col->at(N - 1) = "football";

    std::thread writer([&seg, &raw_data, N]() {
        seg->PreInsert(N);
        seg->Insert(0,
                    N,
                    raw_data.row_ids_.data(),
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    });

    std::thread reader([&seg, N]() {
        // A monotonic clock keeps the deadline immune to wall-clock changes.
        const auto start = std::chrono::steady_clock::now();
        const std::chrono::seconds timeout_duration{2};
        while (true) {
            // Elapsed time is now - start. The reversed subtraction is never
            // positive, so the timeout could never fire and the loop could
            // spin forever.
            if (std::chrono::steady_clock::now() - start > timeout_duration) {
                // FAIL() records a fatal failure and returns from this lambda.
                FAIL() << "Failed to get valid results within timeout";
            }
            BitsetType final;
            auto expr =
                GetMatchExpr(GenTestSchema(), "football", OpType::TextMatch);
            final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
            // Retry until the result has N rows and the last row matches.
            if (final.size() != N || !final[N - 1]) {
                continue;
            }
            for (int64_t i = 0; i < N - 1; i++) {
                ASSERT_FALSE(final[i]);
            }
            ASSERT_TRUE(final[N - 1]);
            break;
        }
    });

    writer.join();
    reader.join();
}
|
|
|
|
// Checks that running a text-match expression on a sealed segment adds exactly
// one entry to the expression result cache, and that re-running the same
// expression does not add another.
TEST(TextMatch, ExprResCacheSealed) {
    using milvus::exec::ExprResCacheManager;

    // Start from an enabled, empty cache with a 1 MiB capacity.
    auto& cache = ExprResCacheManager::Instance();
    ExprResCacheManager::SetEnabled(true);
    cache.Clear();
    cache.SetCapacityBytes(1ULL << 20);

    auto schema = GenTestSchema();
    std::vector<std::string> texts = {"football, basketball, pingpang",
                                      "swimming, football"};

    int64_t N = 2;
    uint64_t seed = 19190504;
    auto raw_data = DataGen(schema, N, seed);
    auto text_values = raw_data.raw_->mutable_fields_data()
                           ->at(1)
                           .mutable_scalars()
                           ->mutable_string_data()
                           ->mutable_data();
    for (int64_t row = 0; row < N; ++row) {
        text_values->at(row) = texts[row];
    }

    auto seg = CreateSealedWithFieldDataLoaded(schema, raw_data);
    seg->CreateTextIndex(FieldId(101));

    // Nothing has been executed yet, so the cache must still be empty.
    ASSERT_EQ(cache.GetEntryCount(), 0);

    auto expr = GetMatchExpr(schema, "football", OpType::TextMatch);

    // Pass 0 is expected to insert the cache entry; pass 1 re-runs the same
    // expression and must leave the entry count at one.
    for (int pass = 0; pass < 2; ++pass) {
        BitsetType bits = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), N);
        ASSERT_TRUE(bits[0]);
        ASSERT_TRUE(bits[1]);
        ASSERT_EQ(cache.GetEntryCount(), 1);
    }

    // Restore the global cache state for subsequent tests.
    cache.Clear();
    ExprResCacheManager::SetEnabled(false);
}
|