fix: char_group tokenizer only supports single-byte chars as delimiters (#46193)

related: https://github.com/milvus-io/milvus/issues/46192

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
This commit is contained in:
aoiasd 2025-12-10 14:33:13 +08:00 committed by GitHub
parent 6e2872c982
commit c84b6d56f8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -85,7 +85,7 @@ impl CharGroupTokenizer {
.to_string(),
)),
|v| {
if v.len() == 1 {
if v.chars().count() == 1 {
delimiters.insert(v.chars().next().unwrap());
return Ok(());
}
@ -204,7 +204,7 @@ mod tests {
fn test_char_group_tokenizer() {
let params = r#"{
"type": "chargroup",
"delimiters": ["o", "punctuation","digit"]
"delimiters": ["o", "punctuation","digit", ""]
}"#;
let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
assert!(json_param.is_ok());