enhance: support run analyzer return detailed token (#40458)

relate: https://github.com/milvus-io/milvus/issues/39705

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
This commit is contained in:
aoiasd 2025-03-19 15:48:15 +08:00 committed by GitHub
parent 6c55db44f1
commit 92bdf7a0c1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 132 additions and 30 deletions

4
go.mod
View File

@ -18,12 +18,12 @@ require (
github.com/gin-gonic/gin v1.9.1
github.com/go-playground/validator/v10 v10.14.0
github.com/gofrs/flock v0.8.1
github.com/golang/protobuf v1.5.4 // indirect
github.com/golang/protobuf v1.5.4
github.com/google/btree v1.1.2
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/klauspost/compress v1.17.9
github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250225103150-0a1988183e53
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250305065753-10afe827b61e
github.com/minio/minio-go/v7 v7.0.73
github.com/pingcap/log v1.1.1-0.20221015072633-39906604fb81
github.com/prometheus/client_golang v1.14.0

4
go.sum
View File

@ -734,8 +734,8 @@ github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6 h1:YHMFI6L
github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6/go.mod h1:DvXTE/K/RtHehxU8/GtDs4vFtfw64jJ3PaCnFri8CRg=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250225103150-0a1988183e53 h1:HoaZPKnE/LhkubU7f8qN8J4LfDIroiqoufWS0kPumM4=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250225103150-0a1988183e53/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250305065753-10afe827b61e h1:3wuhvb3a1Oq1NRPJpCpatKxfPR8XCdpZmRAgkF2u4Sg=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250305065753-10afe827b61e/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/pulsar-client-go v0.12.1 h1:O2JZp1tsYiO7C0MQ4hrUY/aJXnn2Gry6hpm7UodghmE=
github.com/milvus-io/pulsar-client-go v0.12.1/go.mod h1:dkutuH4oS2pXiGm+Ti7fQZ4MRjrMPZ8IJeEGAWMeckk=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=

View File

@ -8,10 +8,6 @@
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <stdlib.h>
#include <string.h>
#include "segcore/token_stream_c.h"
#include "token-stream.h"
@ -32,6 +28,15 @@ token_stream_get_token(CTokenStream token_stream) {
->get_token_no_copy();
}
// Returns the token the stream is currently positioned on, together with its
// start/end offsets, position and position length.
// Note: the `token` string of the returned struct must be freed by the caller
// (see `free_token`).
CToken
token_stream_get_detailed_token(CTokenStream token_stream) {
    auto* stream =
        static_cast<milvus::tantivy::TokenStream*>(token_stream);
    const auto detail = stream->get_detailed_token();

    // Copy field by field from the tantivy binding struct into the C API one.
    CToken result;
    result.token = detail.token;
    result.start_offset = detail.start_offset;
    result.end_offset = detail.end_offset;
    result.position = detail.position;
    result.position_length = detail.position_length;
    return result;
}
void
free_token(void* token) {
free_rust_string(static_cast<const char*>(token));

View File

@ -21,6 +21,13 @@ extern "C" {
#endif
typedef void* CTokenStream;
// A single analyzer token with its location information, as returned by
// `token_stream_get_detailed_token`.
typedef struct CToken {
    // Token text. Must be freed by the caller with `free_token`.
    const char* token;
    // Offset of the token start within the analyzed text.
    int64_t start_offset;
    // Offset of the token end within the analyzed text.
    int64_t end_offset;
    // Position of the token in the token sequence.
    int64_t position;
    // Position length reported by the tokenizer for this token.
    int64_t position_length;
} CToken;
void free_token_stream(CTokenStream);
@ -29,6 +36,8 @@ bool token_stream_advance(CTokenStream);
// Note: returned string must be freed by the caller.
const char* token_stream_get_token(CTokenStream);
CToken token_stream_get_detailed_token(CTokenStream);
void
free_token(void* token);

View File

@ -71,6 +71,14 @@ struct RustResult {
const char *error;
};
// Token with location details as produced by the Rust tantivy binding
// (mirrors the `#[repr(C)]` struct of the same name on the Rust side).
struct TantivyToken {
    // Token text; released with `free_rust_string`.
    const char* token;
    // Offset of the token start within the analyzed text.
    int64_t start_offset;
    // Offset of the token end within the analyzed text.
    int64_t end_offset;
    // Position of the token in the token sequence.
    int64_t position;
    // Position length reported by the tokenizer for this token.
    int64_t position_length;
};
extern "C" {
void free_rust_array(RustArray array);
@ -342,6 +350,8 @@ bool tantivy_token_stream_advance(void *token_stream);
const char *tantivy_token_stream_get_token(void *token_stream);
TantivyToken tantivy_token_stream_get_detailed_token(void *token_stream);
RustResult tantivy_create_analyzer(const char *analyzer_params);
void *tantivy_clone_analyzer(void *ptr);

View File

@ -1,7 +1,7 @@
use std::ffi::c_char;
use libc::c_void;
use tantivy::tokenizer::{BoxTokenStream, TextAnalyzer};
use tantivy::tokenizer::{BoxTokenStream, TextAnalyzer, Token};
use crate::string_c::c_str_to_str;
use crate::{
@ -9,6 +9,28 @@ use crate::{
util::{create_binding, free_binding},
};
/// C-compatible description of a single token emitted by a token stream.
///
/// The layout is shared with the C/C++ side, so the field order must not
/// change.
#[repr(C)]
pub struct TantivyToken {
    /// Token text as a C string pointer (produced by `create_string` in
    /// [`TantivyToken::from_token`]); the receiver is expected to free it.
    pub token: *const c_char,
    /// Offset of the token start in the source text (`Token::offset_from`).
    pub start_offset: i64,
    /// Offset of the token end in the source text (`Token::offset_to`).
    pub end_offset: i64,
    /// Position of the token in the token sequence (`Token::position`).
    pub position: i64,
    /// Position length of the token (`Token::position_length`).
    pub position_length: i64,
}
impl TantivyToken {
    /// Builds the FFI representation of a tantivy [`Token`].
    ///
    /// The token text is passed through `create_string` to obtain the C
    /// string pointer; offsets and positions are widened to `i64` so the
    /// struct has a fixed layout across the FFI boundary.
    pub fn from_token(token: &Token) -> Self {
        let text_ptr = create_string(token.text.as_str());
        Self {
            token: text_ptr,
            start_offset: token.offset_from as i64,
            end_offset: token.offset_to as i64,
            position: token.position as i64,
            position_length: token.position_length as i64,
        }
    }
}
// Note: the tokenizer and text must be released after the token_stream.
#[no_mangle]
pub extern "C" fn tantivy_create_token_stream(
@ -38,3 +60,9 @@ pub extern "C" fn tantivy_token_stream_get_token(token_stream: *mut c_void) -> *
let token = unsafe { (*real).token().text.as_str() };
create_string(token)
}
/// Returns the token the stream is currently positioned on, including its
/// offsets, position and position length.
///
/// The `token` string inside the returned struct is created with
/// `create_string` and is expected to be released by the caller.
#[no_mangle]
pub extern "C" fn tantivy_token_stream_get_detailed_token(
    token_stream: *mut c_void,
) -> TantivyToken {
    let stream = token_stream as *mut BoxTokenStream<'_>;
    // SAFETY: assumes `token_stream` is a live pointer previously returned by
    // `tantivy_create_token_stream` and not yet freed — the caller must
    // guarantee this.
    let current = unsafe { (*stream).token() };
    TantivyToken::from_token(current)
}

View File

@ -8,6 +8,8 @@
#include "rust-binding.h"
namespace milvus::tantivy {
using Token = TantivyToken;
struct TokenStream {
public:
NO_COPY_OR_ASSIGN(TokenStream);
@ -37,6 +39,10 @@ struct TokenStream {
return s;
}
// Returns the current token with its offsets and position information.
// Note: the `token` string of the returned struct must be freed by calling
// `free_rust_string`.
TantivyToken
get_detailed_token() {
    auto detail = tantivy_token_stream_get_detailed_token(ptr_);
    return detail;
}
// Note: the returned token must be freed by calling `free_rust_string`.
const char*
get_token_no_copy() {

View File

@ -10,6 +10,7 @@
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <cstdint>
#include "common/EasyAssert.h"
#include "pb/schema.pb.h"
@ -59,6 +60,7 @@ TEST(CTokenizer, Default) {
create_token_stream(tokenizer, text.c_str(), text.length());
std::vector<std::string> refs{"football", "basketball", "swimming"};
std::vector<std::int64_t> offsets{0, 10, 22};
for (int i = 0; i < 3; i++) {
ASSERT_TRUE(token_stream_advance(token_stream));
auto token = token_stream_get_token(token_stream);
@ -68,5 +70,20 @@ TEST(CTokenizer, Default) {
ASSERT_FALSE(token_stream_advance(token_stream));
free_token_stream(token_stream);
token_stream =
create_token_stream(tokenizer, text.c_str(), text.length());
for (int i = 0; i < 3; i++) {
ASSERT_TRUE(token_stream_advance(token_stream));
auto token = token_stream_get_detailed_token(token_stream);
ASSERT_EQ(refs[i], std::string(token.token));
ASSERT_EQ(offsets[i], token.start_offset);
free_token(const_cast<char*>(token.token));
}
ASSERT_FALSE(token_stream_advance(token_stream));
free_token_stream(token_stream);
free_tokenizer(tokenizer);
}

View File

@ -1159,7 +1159,7 @@ func (s *Server) DescribeDatabase(ctx context.Context, req *milvuspb.DescribeDat
return s.proxy.DescribeDatabase(ctx, req)
}
func (s *Server) RunAnalyzer(ctx context.Context, req *milvuspb.RunAnalyzerRequset) (*milvuspb.RunAnalyzerResponse, error) {
func (s *Server) RunAnalyzer(ctx context.Context, req *milvuspb.RunAnalyzerRequest) (*milvuspb.RunAnalyzerResponse, error) {
return s.proxy.RunAnalyzer(ctx, req)
}

View File

@ -5891,7 +5891,7 @@ func (_c *MockProxy_RestoreRBAC_Call) RunAndReturn(run func(context.Context, *mi
}
// RunAnalyzer provides a mock function with given fields: _a0, _a1
func (_m *MockProxy) RunAnalyzer(_a0 context.Context, _a1 *milvuspb.RunAnalyzerRequset) (*milvuspb.RunAnalyzerResponse, error) {
func (_m *MockProxy) RunAnalyzer(_a0 context.Context, _a1 *milvuspb.RunAnalyzerRequest) (*milvuspb.RunAnalyzerResponse, error) {
ret := _m.Called(_a0, _a1)
if len(ret) == 0 {
@ -5900,10 +5900,10 @@ func (_m *MockProxy) RunAnalyzer(_a0 context.Context, _a1 *milvuspb.RunAnalyzerR
var r0 *milvuspb.RunAnalyzerResponse
var r1 error
if rf, ok := ret.Get(0).(func(context.Context, *milvuspb.RunAnalyzerRequset) (*milvuspb.RunAnalyzerResponse, error)); ok {
if rf, ok := ret.Get(0).(func(context.Context, *milvuspb.RunAnalyzerRequest) (*milvuspb.RunAnalyzerResponse, error)); ok {
return rf(_a0, _a1)
}
if rf, ok := ret.Get(0).(func(context.Context, *milvuspb.RunAnalyzerRequset) *milvuspb.RunAnalyzerResponse); ok {
if rf, ok := ret.Get(0).(func(context.Context, *milvuspb.RunAnalyzerRequest) *milvuspb.RunAnalyzerResponse); ok {
r0 = rf(_a0, _a1)
} else {
if ret.Get(0) != nil {
@ -5911,7 +5911,7 @@ func (_m *MockProxy) RunAnalyzer(_a0 context.Context, _a1 *milvuspb.RunAnalyzerR
}
}
if rf, ok := ret.Get(1).(func(context.Context, *milvuspb.RunAnalyzerRequset) error); ok {
if rf, ok := ret.Get(1).(func(context.Context, *milvuspb.RunAnalyzerRequest) error); ok {
r1 = rf(_a0, _a1)
} else {
r1 = ret.Error(1)
@ -5927,14 +5927,14 @@ type MockProxy_RunAnalyzer_Call struct {
// RunAnalyzer is a helper method to define mock.On call
// - _a0 context.Context
// - _a1 *milvuspb.RunAnalyzerRequset
// - _a1 *milvuspb.RunAnalyzerRequest
func (_e *MockProxy_Expecter) RunAnalyzer(_a0 interface{}, _a1 interface{}) *MockProxy_RunAnalyzer_Call {
return &MockProxy_RunAnalyzer_Call{Call: _e.mock.On("RunAnalyzer", _a0, _a1)}
}
func (_c *MockProxy_RunAnalyzer_Call) Run(run func(_a0 context.Context, _a1 *milvuspb.RunAnalyzerRequset)) *MockProxy_RunAnalyzer_Call {
func (_c *MockProxy_RunAnalyzer_Call) Run(run func(_a0 context.Context, _a1 *milvuspb.RunAnalyzerRequest)) *MockProxy_RunAnalyzer_Call {
_c.Call.Run(func(args mock.Arguments) {
run(args[0].(context.Context), args[1].(*milvuspb.RunAnalyzerRequset))
run(args[0].(context.Context), args[1].(*milvuspb.RunAnalyzerRequest))
})
return _c
}
@ -5944,7 +5944,7 @@ func (_c *MockProxy_RunAnalyzer_Call) Return(_a0 *milvuspb.RunAnalyzerResponse,
return _c
}
func (_c *MockProxy_RunAnalyzer_Call) RunAndReturn(run func(context.Context, *milvuspb.RunAnalyzerRequset) (*milvuspb.RunAnalyzerResponse, error)) *MockProxy_RunAnalyzer_Call {
func (_c *MockProxy_RunAnalyzer_Call) RunAndReturn(run func(context.Context, *milvuspb.RunAnalyzerRequest) (*milvuspb.RunAnalyzerResponse, error)) *MockProxy_RunAnalyzer_Call {
_c.Call.Return(run)
return _c
}

View File

@ -6918,7 +6918,7 @@ func (node *Proxy) OperatePrivilegeGroup(ctx context.Context, req *milvuspb.Oper
return result, nil
}
func (node *Proxy) RunAnalyzer(ctx context.Context, req *milvuspb.RunAnalyzerRequset) (*milvuspb.RunAnalyzerResponse, error) {
func (node *Proxy) RunAnalyzer(ctx context.Context, req *milvuspb.RunAnalyzerRequest) (*milvuspb.RunAnalyzerResponse, error) {
// TODO: use collection analyzer when collection name and field name not none
tokenizer, err := ctokenizer.NewTokenizer(req.GetAnalyzerParams())
if err != nil {
@ -6932,13 +6932,23 @@ func (node *Proxy) RunAnalyzer(ctx context.Context, req *milvuspb.RunAnalyzerReq
for i, text := range req.GetPlaceholder() {
stream := tokenizer.NewTokenStream(string(text))
defer stream.Destroy()
tokens := []string{}
for stream.Advance() {
token := stream.Token()
tokens = append(tokens, token)
}
results[i] = &milvuspb.AnalyzerResult{
Tokens: tokens,
Tokens: make([]*milvuspb.AnalyzerToken, 0),
}
for stream.Advance() {
var token *milvuspb.AnalyzerToken
if req.GetWithDetail() {
token = stream.DetailedToken()
} else {
token = &milvuspb.AnalyzerToken{Token: stream.Token()}
}
if req.GetWithHash() {
token.Hash = typeutil.HashString2LessUint32(token.GetToken())
}
results[i].Tokens = append(results[i].Tokens, token)
}
}

View File

@ -2230,7 +2230,7 @@ func TestAlterCollectionReplicateProperty(t *testing.T) {
func TestRunAnalyzer(t *testing.T) {
p := &Proxy{}
// run analyzer with default params
resp, err := p.RunAnalyzer(context.Background(), &milvuspb.RunAnalyzerRequset{
resp, err := p.RunAnalyzer(context.Background(), &milvuspb.RunAnalyzerRequest{
Placeholder: [][]byte{[]byte("test doc")},
})
require.NoError(t, err)
@ -2238,7 +2238,7 @@ func TestRunAnalyzer(t *testing.T) {
assert.Equal(t, len(resp.GetResults()[0].GetTokens()), 2)
// run analyzer with invalid params
resp, err = p.RunAnalyzer(context.Background(), &milvuspb.RunAnalyzerRequset{
resp, err = p.RunAnalyzer(context.Background(), &milvuspb.RunAnalyzerRequest{
Placeholder: [][]byte{[]byte("test doc")},
AnalyzerParams: "invalid json",
})

View File

@ -10,6 +10,7 @@ import "C"
import (
"unsafe"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
"github.com/milvus-io/milvus/internal/util/tokenizerapi"
)
@ -35,6 +36,18 @@ func (impl *CTokenStream) Token() string {
return C.GoString(token)
}
// DetailedToken returns the token at the stream's current position together
// with its start/end offsets, position and position length.
func (impl *CTokenStream) DetailedToken() *milvuspb.AnalyzerToken {
	raw := C.token_stream_get_detailed_token(impl.ptr)
	// The token text lives on the C side; it is copied into a Go string
	// below and the C string is released once this function returns.
	defer C.free_token(unsafe.Pointer(raw.token))

	detail := new(milvuspb.AnalyzerToken)
	detail.Token = C.GoString(raw.token)
	detail.StartOffset = int64(raw.start_offset)
	detail.EndOffset = int64(raw.end_offset)
	detail.Position = int64(raw.position)
	detail.PositionLength = int64(raw.position_length)
	return detail
}
func (impl *CTokenStream) Destroy() {
C.free_token_stream(impl.ptr)
}

View File

@ -1,8 +1,11 @@
package tokenizerapi
import "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
//go:generate mockery --name=TokenStream --with-expecter
// TokenStream iterates over the tokens an analyzer produces for one text.
type TokenStream interface {
	// Advance moves to the next token and reports whether one is available.
	Advance() bool
	// Token returns the text of the current token.
	Token() string
	// DetailedToken returns the current token with its offsets, position
	// and position length.
	DetailedToken() *milvuspb.AnalyzerToken
	// Destroy releases the resources held by the stream.
	Destroy()
}

View File

@ -20,7 +20,7 @@ require (
github.com/jolestar/go-commons-pool/v2 v2.1.2
github.com/json-iterator/go v1.1.12
github.com/klauspost/compress v1.17.9
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250225103150-0a1988183e53
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250305065753-10afe827b61e
github.com/minio/minio-go/v7 v7.0.73
github.com/nats-io/nats-server/v2 v2.10.12
github.com/nats-io/nats.go v1.34.1

View File

@ -551,8 +551,8 @@ github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6 h1:YHMFI6L
github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6/go.mod h1:DvXTE/K/RtHehxU8/GtDs4vFtfw64jJ3PaCnFri8CRg=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250225103150-0a1988183e53 h1:HoaZPKnE/LhkubU7f8qN8J4LfDIroiqoufWS0kPumM4=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250225103150-0a1988183e53/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250305065753-10afe827b61e h1:3wuhvb3a1Oq1NRPJpCpatKxfPR8XCdpZmRAgkF2u4Sg=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250305065753-10afe827b61e/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/pulsar-client-go v0.12.1 h1:O2JZp1tsYiO7C0MQ4hrUY/aJXnn2Gry6hpm7UodghmE=
github.com/milvus-io/pulsar-client-go v0.12.1/go.mod h1:dkutuH4oS2pXiGm+Ti7fQZ4MRjrMPZ8IJeEGAWMeckk=
github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g=

View File

@ -32,6 +32,7 @@ class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
{"tokenizer": {"type": "lindera", "dict_kind": "cc-cedict"}},
]
@pytest.mark.skip()
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("analyzer_params", analyzer_params_list)
def test_analyzer(self, analyzer_params):