Mirror of https://gitee.com/milvus-io/milvus.git (synced 2025-12-07 17:48:29 +08:00)
test: supplementing case for text match (#36693)

/kind improvement

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>

This commit is contained in:
parent 7774b7275e
commit c8dd665bf6
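
Before the diff itself, a minimal sketch of the query pattern the new cases exercise: a TextMatch(...) filter on an analyzed VARCHAR field, optionally combined with a scalar filter. This is an illustration, not the test code; it assumes a locally running Milvus that already supports the text-match feature this PR is testing, and the collection name, sample data, and index parameters are invented for the example.

# Sketch only: mirrors the pattern in the new test cases, with invented names and data.
import random
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType

connections.connect(host="localhost", port="19530")  # assumes a local Milvus deployment

analyzer_params = {"tokenizer": "default"}
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="age", dtype=DataType.INT64),
    FieldSchema(
        name="sentence",
        dtype=DataType.VARCHAR,
        max_length=65535,
        enable_tokenizer=True,            # analyze the varchar field
        enable_match=True,                # allow TextMatch() expressions on it
        analyzer_params=analyzer_params,
    ),
    FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=8),
]
collection = Collection("text_match_demo", CollectionSchema(fields))

# Insert a few rows, build a vector index, and load so query() can run.
collection.insert([
    {"id": i,
     "age": random.randint(1, 100),
     "sentence": random.choice(["red apple", "green pear", "apple pie"]),
     "emb": [random.random() for _ in range(8)]}
    for i in range(100)
])
collection.flush()
collection.create_index("emb", {"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 16}})
collection.load()

# Text match combined with a scalar filter, as in
# test_query_text_match_with_non_varchar_fields_expr below.
expr = "TextMatch(sentence, 'apple') and age > 10"
res = collection.query(expr=expr, output_fields=["id", "sentence", "age"])
print(len(res))

The new cases follow this shape but drive it through the test wrappers (collection_w, cf, ct) and additionally cover nullable varchar fields, a sparse vector ANN field, and text match on unsupported field types.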
@@ -117,7 +117,7 @@ class ResponseChecker:
             return True

     def assert_exception(self, res, actual=True, error_dict=None):
-        assert actual is False
+        assert actual is False, f"Response of API {self.func_name} expect get error, but success"
         assert len(error_dict) > 0
         if isinstance(res, Error):
             error_code = error_dict[ct.err_code]
@@ -118,8 +118,6 @@ def get_bm25_ground_truth(corpus, queries, top_k=100, language="en"):
     return results, scores


-
-
 def custom_tokenizer(language="en"):
     def remove_punctuation(text):
         text = text.strip()
@@ -153,11 +151,12 @@ def custom_tokenizer(language="en"):
 def analyze_documents(texts, language="en"):

     tokenizer = custom_tokenizer(language)
-    # Start timing
-    t0 = time.time()
+    new_texts = []
+    for text in texts:
+        if isinstance(text, str):
+            new_texts.append(text)
     # Tokenize the corpus
-    tokenized = tokenizer.tokenize(texts, return_as="tuple")
+    tokenized = tokenizer.tokenize(new_texts, return_as="tuple")
     # log.info(f"Tokenized: {tokenized}")
     # Create a frequency counter
     freq = Counter()
@@ -170,13 +169,11 @@ def analyze_documents(texts, language="en"):

     # Convert token ids back to words
     word_freq = Counter({id_to_word[token_id]: count for token_id, count in freq.items()})
-    # End timing
-    tt = time.time() - t0
-    log.debug(f"Analyze document cost time: {tt}")
+    log.debug(f"word freq {word_freq.most_common(10)}")

     return word_freq


 def check_token_overlap(text_a, text_b, language="en"):
     word_freq_a = analyze_documents([text_a], language)
     word_freq_b = analyze_documents([text_b], language)
@@ -3054,7 +3051,7 @@ def gen_sparse_vectors(nb, dim=1000, sparse_format="dok"):

     rng = np.random.default_rng()
     vectors = [{
-        d: rng.random() for d in random.sample(range(dim), random.randint(20, 30))
+        d: rng.random() for d in list(set(random.sample(range(dim), random.randint(20, 30)) + [0, 1]))
     } for _ in range(nb)]
     if sparse_format == "coo":
         vectors = [
@@ -5206,6 +5206,120 @@ class TestQueryTextMatch(TestcaseBase):
             log.info(f"res len {len(res)}")
             assert len(res) == wf_map[field].most_common()[-1][1]

+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.parametrize("combine_op", ["and", "or"])
+    def test_query_text_match_with_non_varchar_fields_expr(self, combine_op):
+        """
+        target: test text match with non-varchar fields expr
+        method: 1. enable text match for varchar field and add some non varchar fields
+                2. insert data, create index and load
+                3. query with text match expr and non-varchar fields expr
+                4. verify the result
+        expected: query result is correct
+        """
+        # 1. initialize with data
+        fake_en = Faker("en_US")
+        analyzer_params = {
+            "tokenizer": "default",
+        }
+        dim = 128
+        default_fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(
+                name="age",
+                dtype=DataType.INT64,
+            ),
+            FieldSchema(
+                name="word",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="sentence",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="paragraph",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="text",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
+        ]
+        default_schema = CollectionSchema(
+            fields=default_fields, description="test collection"
+        )
+
+        collection_w = self.init_collection_wrap(
+            name=cf.gen_unique_str(prefix), schema=default_schema
+        )
+        data = []
+        data_size = 10000
+        for i in range(data_size):
+            d = {
+                "id": i,
+                "age": random.randint(1, 100),
+                "word": fake_en.word().lower(),
+                "sentence": fake_en.sentence().lower(),
+                "paragraph": fake_en.paragraph().lower(),
+                "text": fake_en.text().lower(),
+                "emb": cf.gen_vectors(1, dim)[0],
+            }
+            data.append(d)
+        batch_size = 5000
+        for i in range(0, data_size, batch_size):
+            collection_w.insert(
+                data[i : i + batch_size]
+                if i + batch_size < data_size
+                else data[i:data_size]
+            )
+        collection_w.create_index(
+            "emb",
+            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
+        )
+        collection_w.create_index("word", {"index_type": "INVERTED"})
+        collection_w.load()
+        df = pd.DataFrame(data)
+        log.info(f"dataframe\n{df}")
+        text_fields = ["word", "sentence", "paragraph", "text"]
+        wf_map = {}
+        for field in text_fields:
+            wf_map[field] = cf.analyze_documents(df[field].tolist(), language="en")
+        # query single field for one word
+        for field in text_fields:
+            token = wf_map[field].most_common()[0][0]
+            tm_expr = f"TextMatch({field}, '{token}')"
+            int_expr = "age > 10"
+            combined_expr = f"{tm_expr} {combine_op} {int_expr}"
+            log.info(f"expr: {combined_expr}")
+            res, _ = collection_w.query(expr=combined_expr, output_fields=["id", field, "age"])
+            log.info(f"res len {len(res)}")
+            for r in res:
+                if combine_op == "and":
+                    assert token in r[field] and r["age"] > 10
+                if combine_op == "or":
+                    assert token in r[field] or r["age"] > 10
+
+
     @pytest.mark.tags(CaseLabel.L1)
     def test_query_text_match_with_some_empty_string(self):
         """
@@ -5347,6 +5461,124 @@ class TestQueryTextMatch(TestcaseBase):
             for r in res:
                 assert any([token in r[field] for token in multi_words])

+    @pytest.mark.tags(CaseLabel.L1)
+    def test_query_text_match_with_nullable(self):
+        """
+        target: test text match with nullable
+        method: 1. enable text match and nullable, and insert data with varchar with some None value
+                2. get the most common words and query with text match
+                3. verify the result
+        expected: text match successfully and result is correct
+        """
+        # 1. initialize with data
+        analyzer_params = {
+            "tokenizer": "default",
+        }
+        # 1. initialize with data
+        dim = 128
+        fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(
+                name="word",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+                nullable=True,
+            ),
+            FieldSchema(
+                name="sentence",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+                nullable=True,
+            ),
+            FieldSchema(
+                name="paragraph",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+                nullable=True,
+            ),
+            FieldSchema(
+                name="text",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+                nullable=True,
+            ),
+            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
+        ]
+        schema = CollectionSchema(fields=fields, description="test collection")
+        data_size = 5000
+        collection_w = self.init_collection_wrap(
+            name=cf.gen_unique_str(prefix), schema=schema
+        )
+        fake = fake_en
+        language = "en"
+        data_null = [
+            {
+                "id": i,
+                "word": None if random.random() < 0.9 else fake.word().lower(),
+                "sentence": None if random.random() < 0.9 else fake.sentence().lower(),
+                "paragraph": None if random.random() < 0.9 else fake.paragraph().lower(),
+                "text": None if random.random() < 0.9 else fake.paragraph().lower(),
+                "emb": [random.random() for _ in range(dim)],
+            }
+            for i in range(0, data_size)
+        ]
+        data = data_null
+        df = pd.DataFrame(data)
+        log.info(f"dataframe\n{df}")
+        batch_size = 5000
+        for i in range(0, len(df), batch_size):
+            collection_w.insert(
+                data[i:i + batch_size]
+                if i + batch_size < len(df)
+                else data[i:len(df)]
+            )
+        collection_w.flush()
+        collection_w.create_index(
+            "emb",
+            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
+        )
+        collection_w.load()
+        text_fields = ["word", "sentence", "paragraph", "text"]
+        wf_map = {}
+        for field in text_fields:
+            wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
+        # query single field for one word
+        for field in text_fields:
+            token = wf_map[field].most_common()[-1][0]
+            expr = f"TextMatch({field}, '{token}')"
+            log.info(f"expr: {expr}")
+            res, _ = collection_w.query(expr=expr, output_fields=text_fields)
+            log.info(f"res len {len(res)}, \n{res}")
+            assert len(res) > 0
+            for r in res:
+                assert token in r[field]
+        # query single field for multi-word
+        for field in text_fields:
+            # match top 3 most common words
+            multi_words = []
+            for word, count in wf_map[field].most_common(3):
+                multi_words.append(word)
+            string_of_multi_words = " ".join(multi_words)
+            expr = f"TextMatch({field}, '{string_of_multi_words}')"
+            log.info(f"expr {expr}")
+            res, _ = collection_w.query(expr=expr, output_fields=text_fields)
+            log.info(f"res len {len(res)}, {res}")
+            assert len(res) > 0
+            for r in res:
+                assert any([token in r[field] for token in multi_words])
+

 class TestQueryTextMatchNegative(TestcaseBase):
     @pytest.mark.tags(CaseLabel.L0)
@@ -5471,3 +5703,56 @@ class TestQueryFunction(TestcaseBase):
         collection_w.query(
             call_expr, check_task=CheckTasks.err_res, check_items=error
         )
+
+    @pytest.mark.tags(CaseLabel.L0)
+    @pytest.mark.xfail(reason="issue 36685")
+    def test_query_text_match_with_unsupported_fields(self):
+        """
+        target: test enable text match with unsupported field
+        method: 1. enable text match in unsupported field
+                2. create collection
+        expected: create collection failed and return error
+        """
+        analyzer_params = {
+            "tokenizer": "default",
+        }
+        dim = 128
+        default_fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(
+                name="title",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="overview",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="age",
+                dtype=DataType.INT64,
+                enable_tokenizer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
+        ]
+        default_schema = CollectionSchema(
+            fields=default_fields, description="test collection"
+        )
+        error = {ct.err_code: 2000, ct.err_msg: "field type is not supported"}
+        self.init_collection_wrap(
+            name=cf.gen_unique_str(prefix),
+            schema=default_schema,
+            check_task=CheckTasks.err_res,
+            check_items=error,
+        )
+
+
@@ -13335,13 +13335,15 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
                 enable_match=True,
                 tokenizer_params=tokenizer_params,
             ),
-            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
+            FieldSchema(name="float32_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
+            FieldSchema(name="sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
         ]
         schema = CollectionSchema(fields=fields, description="test collection")
         data_size = 5000
         collection_w = self.init_collection_wrap(
             name=cf.gen_unique_str(prefix), schema=schema
         )
+        log.info(f"collection {collection_w.describe()}")
         fake = fake_en
         if tokenizer == "jieba":
             language = "zh"
@@ -13356,7 +13358,8 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
                 "sentence": fake.sentence().lower(),
                 "paragraph": fake.paragraph().lower(),
                 "text": fake.text().lower(),
-                "emb": [random.random() for _ in range(dim)],
+                "float32_emb": [random.random() for _ in range(dim)],
+                "sparse_emb": cf.gen_sparse_vectors(1, dim=10000)[0],
             }
             for i in range(data_size)
         ]
@@ -13371,9 +13374,13 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
         )
         collection_w.flush()
         collection_w.create_index(
-            "emb",
+            "float32_emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
         )
+        collection_w.create_index(
+            "sparse_emb",
+            {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"},
+        )
         if enable_inverted_index:
             collection_w.create_index("word", {"index_type": "INVERTED"})
         collection_w.load()
@@ -13382,47 +13389,56 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
         wf_map = {}
         for field in text_fields:
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
-        # query single field for one token
+        # search with filter single field for one token
         df_split = cf.split_dataframes(df, text_fields, language=language)
         log.info(f"df_split\n{df_split}")
-        for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
-            expr = f"TextMatch({field}, '{token}')"
-            manual_result = df_split[
-                df_split.apply(lambda row: token in row[field], axis=1)
-            ]
-            log.info(f"expr: {expr}, manual_check_result\n: {manual_result}")
-            res_list, _ = collection_w.search(
-                data=[[random.random() for _ in range(dim)]],
-                anns_field="emb",
-                param={},
-                limit=100,
-                expr=expr, output_fields=["id", field])
-            for res in res_list:
-                assert len(res) > 0
-                log.info(f"res len {len(res)} res {res}")
-                for r in res:
-                    r = r.to_dict()
-                    assert token in r["entity"][field]
+        for ann_field in ["float32_emb", "sparse_emb"]:
+            log.info(f"ann_field {ann_field}")
+            if ann_field == "float32_emb":
+                search_data = [[random.random() for _ in range(dim)]]
+            elif ann_field == "sparse_emb":
+                search_data = cf.gen_sparse_vectors(1,dim=10000)
+            else:
+                search_data = [[random.random() for _ in range(dim)]]
+            for field in text_fields:
+                token = wf_map[field].most_common()[0][0]
+                expr = f"TextMatch({field}, '{token}')"
+                manual_result = df_split[
+                    df_split.apply(lambda row: token in row[field], axis=1)
+                ]
+                log.info(f"expr: {expr}, manual_check_result: {len(manual_result)}")
+                res_list, _ = collection_w.search(
+                    data=search_data,
+                    anns_field=ann_field,
+                    param={},
+                    limit=100,
+                    expr=expr, output_fields=["id", field])
+                for res in res_list:
+                    log.info(f"res len {len(res)} res {res}")
+                    assert len(res) > 0
+                    for r in res:
+                        r = r.to_dict()
+                        assert token in r["entity"][field]

-        # query single field for multi-word
+        # search with filter single field for multi-token
         for field in text_fields:
             # match top 10 most common words
             top_10_tokens = []
             for word, count in wf_map[field].most_common(10):
                 top_10_tokens.append(word)
             string_of_top_10_words = " ".join(top_10_tokens)
             expr = f"TextMatch({field}, '{string_of_top_10_words}')"
             log.info(f"expr {expr}")
             res_list, _ = collection_w.search(
-                data=[[random.random() for _ in range(dim)]],
-                anns_field="emb",
+                data=search_data,
+                anns_field=ann_field,
                 param={},
                 limit=100,
                 expr=expr, output_fields=["id", field])
             for res in res_list:
                 log.info(f"res len {len(res)} res {res}")
-                for r in res:
-                    r = r.to_dict()
-                    assert any([token in r["entity"][field] for token in top_10_tokens])
+                assert len(res) > 0
+                for r in res:
+                    r = r.to_dict()
+                    assert any([token in r["entity"][field] for token in top_10_tokens])

@@ -450,7 +450,7 @@ class TestCreateCollection(TestBase):
         indexes = rsp['data']['indexes']
         assert len(indexes) == len(payload['indexParams'])
         # assert load success
-        assert rsp['data']['load'] == "LoadStateLoaded"
+        assert rsp['data']['load'] in ["LoadStateLoaded", "LoadStateLoading"]

     @pytest.mark.parametrize("auto_id", [True])
     @pytest.mark.parametrize("enable_dynamic_field", [True])