enhance: [2.5] support using a nullable field as BM25 function input field (#44586) (#45118)

relate: https://github.com/milvus-io/milvus/pull/44586

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
This commit is contained in:
aoiasd 2025-10-28 19:20:11 +08:00 committed by GitHub
parent 78d70db6fd
commit 529a31a1bf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 12 additions and 53 deletions

View File

@ -833,9 +833,6 @@ func validateFunction(coll *schemapb.CollectionSchema) error {
if !ok {
return fmt.Errorf("function input field not found: %s", name)
}
if inputField.GetNullable() {
return fmt.Errorf("function input field cannot be nullable: function %s, field %s", function.GetName(), inputField.GetName())
}
inputFields = append(inputFields, inputField)
}
@ -904,7 +901,6 @@ func checkFunctionInputField(function *schemapb.FunctionSchema, fields []*schema
if !h.EnableAnalyzer() {
return errors.New("BM25 function input field must set enable_analyzer to true")
}
default:
return errors.New("check input field with unknown function type")
}

View File

@ -2760,7 +2760,7 @@ func TestValidateFunction(t *testing.T) {
assert.Contains(t, err.Error(), "output field not found")
})
t.Run("Invalid function schema - nullable input field", func(t *testing.T) {
t.Run("Valid function schema - nullable input field", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}, Nullable: true},
@ -2776,8 +2776,7 @@ func TestValidateFunction(t *testing.T) {
},
}
err := validateFunction(schema)
assert.Error(t, err)
assert.Contains(t, err.Error(), "function input field cannot be nullable")
assert.NoError(t, err)
})
t.Run("Invalid function schema - output field is primary key", func(t *testing.T) {

View File

@ -111,6 +111,11 @@ func (v *BM25FunctionRunner) run(data []string, dst []map[uint32]float32) error
defer tokenizer.Destroy()
for i := 0; i < len(data); i++ {
if len(data[i]) == 0 {
dst[i] = map[uint32]float32{}
continue
}
if !typeutil.IsUTF8(data[i]) {
return merr.WrapErrParameterInvalidMsg("string data must be utf8 format: %v", data[i])
}

View File

@ -155,6 +155,11 @@ func (v *MultiAnalyzerBM25FunctionRunner) run(text []string, analyzerName []stri
}()
for i := 0; i < len(text); i++ {
if len(text[i]) == 0 {
dst[i] = map[uint32]float32{}
continue
}
if !typeutil.IsUTF8(text[i]) {
return merr.WrapErrParameterInvalidMsg("string data must be utf8 format: %v", text[i])
}

View File

@ -4329,52 +4329,6 @@ class TestFullTextSearchMultiAnalyzerInvalid(TestcaseBase):
assert len(results) == 1
assert len(results[0]) > 0
@pytest.mark.tags(CaseLabel.L0)
def test_text_field_is_nullable(self):
"""
target: test a nullable text field (with multi_analyzer_params) used as the BM25 function input field
method: create collection whose BM25 function input field is declared with nullable=True
expected: collection creation should fail because text field is nullable
"""
multi_analyzer_params = {
"by_field": "language",
"analyzers": {
"en": {"type": "english"},
"default": {"tokenizer": "standard"},
},
}
fields = [
FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True),
FieldSchema(name="language", dtype=DataType.VARCHAR, max_length=16),
FieldSchema(
name="article_content",
dtype=DataType.VARCHAR,
max_length=1024,
enable_analyzer=True,
multi_analyzer_params=multi_analyzer_params,
nullable=True,
),
FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(
fields=fields, description="Invalid multi-analyzer test"
)
bm25_func = Function(
name="bm25",
function_type=FunctionType.BM25,
input_field_names=["article_content"],
output_field_names=["bm25_sparse_vector"],
)
schema.add_function(bm25_func)
c_name = cf.gen_unique_str(prefix)
error = {
ct.err_code: 65535,
ct.err_msg: "function input field cannot be nullable",
}
self.init_collection_wrap(
name=c_name, schema=schema, check_task=CheckTasks.err_res, check_items=error
)
@pytest.mark.tags(CaseLabel.L0)
def test_missing_default_analyzer(self):
"""