Support creating analyzers with file resource info, and return the used file resource IDs when validating an analyzer. Save the related resource IDs in the collection schema.
relate: https://github.com/milvus-io/milvus/issues/43687

Release notes (auto-generated by coderabbit.ai):

- Core invariant: analyzer file-resource resolution is deterministic and traceable by threading a FileResourcePathHelper (collecting used resource IDs in a HashSet) through all tokenizer/analyzer construction and validation paths; validate_analyzer(params, extra_info) returns the collected Vec<i64>, which is propagated through the Rust/C/Go layers to callers (CValidateResult → RustResult::from_vec_i64 → Go []int64 → querypb.ValidateAnalyzerResponse.ResourceIds → CollectionSchema.FileResourceIds).
- Logic removed/simplified: ad-hoc, scattered resource-path lookups and per-filter file helpers (e.g., read_synonyms_file and other inline file-reading logic) were consolidated into ResourceInfo + FileResourcePathHelper and a centralized get_resource_path(helper, ...) API; filter/tokenizer builder APIs now accept &mut FileResourcePathHelper, so all file-path resolution and ID collection share the same lookup and bookkeeping logic (redundant duplicated lookups removed).
- Why no data loss or behavior regression: the changes are additive and default-preserving. Existing call sites pass extra_info = "", so analyzer creation/validation behavior and error paths remain unchanged; the new Collection.FileResourceIds is populated from resp.ResourceIds in validateSchema and round-tripped through marshal/unmarshal (model.Collection ↔ schemapb.CollectionSchema), so schema persistence uses the new list without overwriting other schema fields; the proto change adds a repeated field (resource_ids), which is wire-compatible (older clients ignore the extra field). Concrete code paths: analyzer creation still uses create_analyzer (now with extra_info ""), tokenizer validation still returns errors as before but now also returns IDs via CValidateResult/RustResult, and rootcoord.validateSchema assigns resp.ResourceIds → schema.FileResourceIds.
- New capability added: end-to-end discovery, return, and persistence of the file resource IDs used by analyzers. Validate flows now return resource IDs and the system stores them in the collection schema (affects the tantivy analyzer binding, the canalyzer C bindings, the internal/util analyzer APIs, the querynode ValidateAnalyzer response, and the rootcoord create_collection flow).

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
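For orientation, here is a minimal sketch of how code inside this package might use the new validation API to collect file resource IDs. It mirrors the call pattern exercised by TestValidateAnalyzer below; the helper name, analyzer params, resource name "my_dict", and resource ID 100 are illustrative only, and resolution succeeds only when the referenced file actually exists under the node's file resource path (the test creates it explicitly).

```go
// validateWithUserResource is a hypothetical helper, not part of this file: it shows
// the shape of a ValidateAnalyzer call when the analyzer params reference a remote file.
func validateWithUserResource() ([]int64, error) {
	// Analyzer params that reference a stop-words file by logical resource name.
	params := `{"tokenizer": "standard", "filter": [{"type": "stop", "stop_words_file": {"type": "remote", "resource_name": "my_dict", "file_name": "stop.txt"}}]}`

	// extra_info maps the logical resource name to a resource ID and names the storage.
	extraInfo := `{"resource_map": {"my_dict": 100}, "storage_name": "default"}`

	// On success, ids holds the resource IDs resolved while building the analyzer
	// (here it would be [100]); callers can persist them, e.g. into the collection
	// schema's FileResourceIds, as described in the release notes above.
	ids, err := ValidateAnalyzer(params, extraInfo)
	if err != nil {
		return nil, err
	}
	return ids, nil
}
```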
package canalyzer

import (
	"context"
	"fmt"
	"net"
	"os"
	"path/filepath"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"google.golang.org/grpc"

	pb "github.com/milvus-io/milvus-proto/go-api/v2/tokenizerpb"
	"github.com/milvus-io/milvus/internal/util/pathutil"
	"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
)

type mockServer struct {
	pb.UnimplementedTokenizerServer
}

func (s *mockServer) Tokenize(ctx context.Context, req *pb.TokenizationRequest) (*pb.TokenizationResponse, error) {
	ret := []*pb.Token{}
	for _, token := range strings.Split(req.Text, ",") {
		ret = append(ret, &pb.Token{
			Text: strings.TrimSpace(token),
		})
	}
	return &pb.TokenizationResponse{Tokens: ret}, nil
}

func TestAnalyzer(t *testing.T) {
	// use default analyzer.
	{
		m := "{}"
		analyzer, err := NewAnalyzer(m, "")
		assert.NoError(t, err)
		defer analyzer.Destroy()

		tokenStream := analyzer.NewTokenStream("football, basketball, pingpang")
		defer tokenStream.Destroy()

		tokens := []string{}
		for tokenStream.Advance() {
			tokens = append(tokens, tokenStream.Token())
		}
		assert.Equal(t, len(tokens), 3)
	}

	{
		m := ""
		analyzer, err := NewAnalyzer(m, "")
		assert.NoError(t, err)
		defer analyzer.Destroy()

		tokenStream := analyzer.NewTokenStream("football, basketball, pingpang")
		defer tokenStream.Destroy()

		tokens := []string{}
		for tokenStream.Advance() {
			tokens = append(tokens, tokenStream.Token())
		}
		assert.Equal(t, len(tokens), 3)
	}

	// use default tokenizer.
	{
		m := "{\"tokenizer\": \"standard\"}"
		analyzer, err := NewAnalyzer(m, "")
		assert.NoError(t, err)
		defer analyzer.Destroy()

		tokenStream := analyzer.NewTokenStream("football, basketball, pingpang")
		defer tokenStream.Destroy()

		tokens := []string{}
		for tokenStream.Advance() {
			tokens = append(tokens, tokenStream.Token())
		}
		assert.Equal(t, len(tokens), 3)
	}

	// jieba tokenizer.
	{
		m := "{\"tokenizer\": \"jieba\"}"
		analyzer, err := NewAnalyzer(m, "")
		assert.NoError(t, err)
		defer analyzer.Destroy()

		tokenStream := analyzer.NewTokenStream("张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途")
		defer tokenStream.Destroy()
		for tokenStream.Advance() {
			assert.NotEmpty(t, tokenStream.Token())
		}
	}

	// grpc tokenizer.
	{
		addr, stop := func() (string, func()) {
			lis, err := net.Listen("tcp", "127.0.0.1:0")
			if err != nil {
				t.Fatalf("failed to listen: %v", err)
			}

			s := grpc.NewServer()
			pb.RegisterTokenizerServer(s, &mockServer{})

			go func() {
				_ = s.Serve(lis)
			}()

			return lis.Addr().String(), func() {
				s.Stop()
				_ = lis.Close()
			}
		}()
		defer stop()

		m := "{\"tokenizer\": {\"type\":\"grpc\", \"endpoint\":\"http://" + addr + "\"}}"
		analyzer, err := NewAnalyzer(m, "")
		assert.NoError(t, err)
		defer analyzer.Destroy()

		tokenStream := analyzer.NewTokenStream("football, basketball, pingpang")
		defer tokenStream.Destroy()
		for tokenStream.Advance() {
			fmt.Println(tokenStream.Token())
		}
	}

	// lindera tokenizer.
	{
		m := "{\"tokenizer\": {\"type\":\"lindera\", \"dict_kind\": \"ipadic\"}}"
		tokenizer, err := NewAnalyzer(m, "")
		require.NoError(t, err)
		defer tokenizer.Destroy()

		tokenStream := tokenizer.NewTokenStream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です")
		defer tokenStream.Destroy()
		for tokenStream.Advance() {
			fmt.Println(tokenStream.Token())
		}
	}
}

func TestValidateAnalyzer(t *testing.T) {
	InitOptions()

	// valid analyzer
	{
		m := "{\"tokenizer\": \"standard\"}"
		ids, err := ValidateAnalyzer(m, "")
		assert.NoError(t, err)
		assert.Equal(t, len(ids), 0)
	}

	{
		m := ""
		_, err := ValidateAnalyzer(m, "")
		assert.NoError(t, err)
	}

	// invalid tokenizer
	{
		m := "{\"tokenizer\": \"invalid\"}"
		_, err := ValidateAnalyzer(m, "")
		assert.Error(t, err)
	}

	// with user resource
	{
		resourcePath := pathutil.GetPath(pathutil.FileResourcePath, paramtable.GetNodeID())
		defer os.RemoveAll(resourcePath)
		UpdateParams()
		resourceID := int64(100)

		// mock remote resource file
		dir := filepath.Join(resourcePath, "default", fmt.Sprintf("%d", resourceID))
		err := os.MkdirAll(dir, os.ModePerm)
		require.NoError(t, err)

		f, err := os.Create(filepath.Join(dir, "jieba.txt"))
		require.NoError(t, err)

		f.WriteString("stop")
		f.Close()

		m := "{\"tokenizer\": \"standard\", \"filter\": [{\"type\": \"stop\", \"stop_words_file\": {\"type\": \"remote\",\"resource_name\": \"jieba_dict\", \"file_name\": \"jieba.txt\"}}]}"

		ids, err := ValidateAnalyzer(m, "{\"resource_map\": {\"jieba_dict\": 100}, \"storage_name\": \"default\"}")
		require.NoError(t, err)
		assert.Equal(t, len(ids), 1)
		assert.Equal(t, ids[0], resourceID)
	}

	// with user resource and update global resource info
	{
		resourcePath := pathutil.GetPath(pathutil.FileResourcePath, paramtable.GetNodeID())
		defer os.RemoveAll(resourcePath)
		UpdateParams()
		resourceID := int64(100)

		// mock remote resource file
		dir := filepath.Join(resourcePath, fmt.Sprintf("%d", resourceID))
		err := os.MkdirAll(dir, os.ModePerm)
		require.NoError(t, err)

		f, err := os.Create(filepath.Join(dir, "jieba.txt"))
		require.NoError(t, err)

		f.WriteString("stop")
		f.Close()

		m := "{\"tokenizer\": \"standard\", \"filter\": [{\"type\": \"stop\", \"stop_words_file\": {\"type\": \"remote\",\"resource_name\": \"jieba_dict\", \"file_name\": \"jieba.txt\"}}]}"

		// update global resource info
		err = UpdateGlobalResourceInfo(map[string]int64{"jieba_dict": resourceID})
		require.NoError(t, err)

		ids, err := ValidateAnalyzer(m, "")
		require.NoError(t, err)

		assert.Equal(t, len(ids), 1)
		assert.Equal(t, ids[0], resourceID)
	}
}