feat: support using fragment config for highlight (#45099)

Related issue: https://github.com/milvus-io/milvus/issues/42589

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
aoiasd 2025-11-24 17:07:06 +08:00 committed by GitHub
parent 228eb0f5d0
commit 5efb0cedc8
6 changed files with 1188 additions and 919 deletions


@@ -602,6 +602,11 @@ const (
 	PreTagsKey             = "pre_tags"
 	PostTagsKey            = "post_tags"
 	HighlightSearchTextKey = "highlight_search_data"
+	FragmentOffsetKey      = "fragment_offset"
+	FragmentSizeKey        = "fragment_size"
+	FragmentNumKey         = "num_of_fragments"
+	DefaultFragmentSize    = 100
+	DefaultFragmentNum     = 1
 	DefaultPreTag          = "<em>"
 	DefaultPostTag         = "</em>"
 )
@@ -638,6 +643,41 @@ func newHighlightOperator(t *searchTask, _ map[string]any) (operator, error) {
 	}, nil
 }
 
+// sliceByRune returns the substring of s between the start-th and end-th runes.
+func sliceByRune(s string, start, end int) string {
+	if start >= end {
+		return ""
+	}
+	i, from, to := 0, 0, len(s)
+	for idx := range s {
+		if i == start {
+			from = idx
+		}
+		if i == end {
+			to = idx
+			break
+		}
+		i++
+	}
+	return s[from:to]
+}
+
+// getHighlightTexts slices the corpus texts according to the fragment options.
+func getHighlightTexts(task *querypb.HighlightTask, datas []string) []string {
+	if task.GetOptions().GetNumOfFragments() == 0 {
+		return datas
+	}
+	results := make([]string, len(datas))
+	offset := int(task.GetOptions().GetFragmentOffset())
+	size := offset + int(task.GetOptions().GetFragmentSize()*task.GetOptions().GetNumOfFragments())
+	for i, text := range datas {
+		results[i] = sliceByRune(text, min(offset, len(text)), min(size, len(text)))
+	}
+	return results
+}
+
 func (op *highlightOperator) run(ctx context.Context, span trace.Span, inputs ...any) ([]any, error) {
 	result := inputs[0].(*milvuspb.SearchResults)
 	datas := result.Results.GetFieldsData()
@@ -651,7 +691,7 @@ func (op *highlightOperator) run(ctx context.Context, span trace.Span, inputs ..
 		if !ok {
 			return nil, errors.Errorf("get highlight failed, text field not in output field %s: %d", task.GetFieldName(), task.GetFieldId())
 		}
-		texts := textFieldDatas.GetScalars().GetStringData().GetData()
+		texts := getHighlightTexts(task, textFieldDatas.GetScalars().GetStringData().GetData())
 		task.Texts = append(task.Texts, texts...)
 		task.CorpusTextNum = int64(len(texts))
 		field, ok := lo.Find(op.fieldSchemas, func(schema *schemapb.FieldSchema) bool {
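
For reference, a minimal self-contained sketch of the rune-based slicing behavior introduced above (sliceByRune is copied verbatim from this diff; the sample string and the fragment numbers in the comment are made up for illustration):

```go
package main

import "fmt"

// Copied from the diff above so the example runs on its own.
func sliceByRune(s string, start, end int) string {
	if start >= end {
		return ""
	}
	i, from, to := 0, 0, len(s)
	for idx := range s {
		if i == start {
			from = idx
		}
		if i == end {
			to = idx
			break
		}
		i++
	}
	return s[from:to]
}

func main() {
	text := "héllo wörld"
	// fragment_offset = 0, fragment_size = 5, num_of_fragments = 1
	// => keep the first 5 runes, which is 6 bytes here because "é" is 2 bytes.
	fmt.Println(sliceByRune(text, 0, 5)) // héllo
}
```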


@@ -587,7 +587,12 @@ func (t *searchTask) getBM25SearchTexts(placeholder []byte) ([]string, error) {
 
 func (t *searchTask) createLexicalHighlighter(highlighter *commonpb.Highlighter, metricType string, annsField int64, placeholder []byte, analyzerName string) error {
 	task := &highlightTask{
-		HighlightTask: &querypb.HighlightTask{},
+		HighlightTask: &querypb.HighlightTask{
+			Options: &querypb.HighlightOptions{
+				FragmentSize:   DefaultFragmentSize,
+				NumOfFragments: DefaultFragmentNum,
+			},
+		},
 	}
 	params := funcutil.KeyValuePair2Map(highlighter.GetParams())
@@ -649,6 +654,31 @@ func (t *searchTask) createLexicalHighlighter(highlighter *commonpb.Highlighter,
 		task.postTags = [][]byte{[]byte(DefaultPostTag)}
 	}
 
+	// set fragment config
+	if value, ok := params[FragmentSizeKey]; ok {
+		fragmentSize, err := strconv.ParseInt(value, 10, 64)
+		if err != nil || fragmentSize <= 0 {
+			return merr.WrapErrParameterInvalidMsg("invalid fragment_size: %s", value)
+		}
+		task.Options.FragmentSize = fragmentSize
+	}
+
+	if value, ok := params[FragmentNumKey]; ok {
+		fragmentNum, err := strconv.ParseInt(value, 10, 64)
+		if err != nil || fragmentNum <= 0 {
+			return merr.WrapErrParameterInvalidMsg("invalid num_of_fragments: %s", value)
+		}
+		task.Options.NumOfFragments = fragmentNum
+	}
+
+	if value, ok := params[FragmentOffsetKey]; ok {
+		fragmentOffset, err := strconv.ParseInt(value, 10, 64)
+		if err != nil || fragmentOffset < 0 {
+			return merr.WrapErrParameterInvalidMsg("invalid fragment_offset: %s", value)
+		}
+		task.Options.FragmentOffset = fragmentOffset
+	}
+
 	// set bm25 search text as query texts
 	texts, err := t.getBM25SearchTexts(placeholder)
 	if err != nil {
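
The keys parsed above arrive as ordinary highlighter key/value params. A rough sketch of what a caller-side parameter list could look like, assuming the commonpb.KeyValuePair shape that funcutil.KeyValuePair2Map consumes (the values are arbitrary examples, not defaults):

```go
package main

import (
	"fmt"

	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
)

func main() {
	// Hypothetical highlighter params; the keys match the constants added
	// in this commit (fragment_size, num_of_fragments, fragment_offset).
	params := []*commonpb.KeyValuePair{
		{Key: "fragment_size", Value: "50"},   // size of each fragment
		{Key: "num_of_fragments", Value: "2"}, // at most 2 fragments per corpus text
		{Key: "fragment_offset", Value: "0"},  // start slicing from the beginning
	}
	for _, kv := range params {
		fmt.Printf("%s=%s\n", kv.GetKey(), kv.GetValue())
	}
}
```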


@@ -1042,13 +1042,15 @@ func (sd *shardDelegator) GetHighlight(ctx context.Context, req *querypb.GetHigh
 			}
 			for j := 0; j < int(topks[i]); j++ {
-				offsets := []int64{}
+				spans := SpanList{}
 				for _, token := range corpusResults[corpusIdx] {
 					if tokenSet.Contain(token.GetToken()) {
-						offsets = append(offsets, token.GetStartOffset(), token.GetEndOffset())
+						spans = append(spans, Span{token.GetStartOffset(), token.GetEndOffset()})
 					}
 				}
-				result = append(result, &querypb.HighlightResult{Fragments: []*querypb.HighlightFragment{{StartOffset: 0, EndOffset: int64(len(task.Texts[int(task.SearchTextNum)+corpusIdx])), Offsets: offsets}}})
+				spans = mergeOffsets(spans)
+				frags := fetchFragmentsFromOffsets(task.Texts[int(task.SearchTextNum)+corpusIdx], spans, task.GetOptions().GetFragmentSize(), task.GetOptions().GetNumOfFragments())
+				result = append(result, &querypb.HighlightResult{Fragments: frags})
 				corpusIdx++
 			}
 		}
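
Each HighlightFragment returned here carries the fragment's start/end offsets plus a flat list of (start, end) pairs for the matched tokens inside it. The following is a purely illustrative, hypothetical renderer showing how those offsets are meant to be read back against the original text; it is not the proxy's actual tag-insertion code:

```go
package main

import (
	"fmt"
	"strings"
)

// renderFragment is a hypothetical helper: it wraps every (start, end) byte
// range listed in offsets with pre/post tags, relative to the original text.
func renderFragment(text string, fragStart, fragEnd int64, offsets []int64, pre, post string) string {
	var b strings.Builder
	cursor := fragStart
	for i := 0; i+1 < len(offsets); i += 2 {
		start, end := offsets[i], offsets[i+1]
		b.WriteString(text[cursor:start])
		b.WriteString(pre)
		b.WriteString(text[start:end])
		b.WriteString(post)
		cursor = end
	}
	b.WriteString(text[cursor:fragEnd])
	return b.String()
}

func main() {
	text := "milvus supports highlight with fragments"
	// Offsets as produced above: flat pairs of start/end byte offsets.
	offsets := []int64{16, 25} // "highlight"
	fmt.Println(renderFragment(text, 0, int64(len(text)), offsets, "<em>", "</em>"))
	// Output: milvus supports <em>highlight</em> with fragments
}
```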


@@ -2,6 +2,8 @@ package delegator
 
 import (
 	"fmt"
+	"sort"
+	"unicode/utf8"
 
 	"go.uber.org/zap"
 	"google.golang.org/protobuf/proto"
@@ -10,6 +12,7 @@ import (
 	"github.com/milvus-io/milvus/pkg/v2/log"
 	"github.com/milvus-io/milvus/pkg/v2/proto/internalpb"
 	"github.com/milvus-io/milvus/pkg/v2/proto/planpb"
+	"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
 	"github.com/milvus-io/milvus/pkg/v2/util/merr"
 )
@@ -62,3 +65,92 @@ func SetBM25Params(req *internalpb.SearchRequest, avgdl float64) error {
 	}
 	return nil
 }
+
+type (
+	Span     [2]int64
+	SpanList []Span
+)
+
+func (a SpanList) Len() int      { return len(a) }
+func (a SpanList) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a SpanList) Less(i, j int) bool {
+	if a[i][0] == a[j][0] {
+		return a[i][1] < a[j][1]
+	}
+	return a[i][0] < a[j][0]
+}
+
+// mergeOffsets sorts the spans and merges overlapping or touching ones.
+func mergeOffsets(input SpanList) SpanList {
+	sort.Sort(input)
+	maxEndOffset := int64(-1)
+	offsets := SpanList{}
+	for _, pair := range input {
+		if pair[1] > maxEndOffset {
+			if len(offsets) == 0 || pair[0] > offsets[len(offsets)-1][1] {
+				// start offset is past the previous span's end:
+				// no overlap, append the pair as-is.
+				offsets = append(offsets, pair)
+			} else {
+				// start offset falls inside the previous span:
+				// overlap, extend the previous span's end.
+				offsets[len(offsets)-1][1] = pair[1]
+			}
+			maxEndOffset = pair[1]
+		}
+	}
+	return offsets
+}
+
+// fetchFragmentsFromOffsets splits text into fragments of fragmentSize and
+// distributes the merged spans into them, returning at most numOfFragments fragments.
+func fetchFragmentsFromOffsets(text string, span SpanList, fragmentSize int64, numOfFragments int64) []*querypb.HighlightFragment {
+	result := make([]*querypb.HighlightFragment, 0)
+	endPosition := int(fragmentSize)
+	nowOffset := 0
+	frag := &querypb.HighlightFragment{
+		StartOffset: 0,
+	}
+	next := func() {
+		endPosition += int(fragmentSize)
+		frag.EndOffset = int64(nowOffset)
+		result = append(result, frag)
+		frag = &querypb.HighlightFragment{
+			StartOffset: int64(nowOffset),
+		}
+	}
+
+	cursor := 0
+	spanNum := len(span)
+	for i, r := range text {
+		nowOffset += utf8.RuneLen(r)
+		// append spans that end inside the current fragment
+		for ; cursor < spanNum && span[cursor][1] <= int64(nowOffset); cursor++ {
+			if span[cursor][0] >= frag.StartOffset {
+				frag.Offsets = append(frag.Offsets, span[cursor][0], span[cursor][1])
+			} else {
+				// a span that crosses the fragment start only contributes
+				// the part inside the current fragment
+				frag.Offsets = append(frag.Offsets, frag.StartOffset, span[cursor][1])
+			}
+		}
+		if i >= endPosition {
+			// a span that crosses the fragment end only contributes
+			// the part inside the current fragment
+			if cursor < spanNum && span[cursor][0] < int64(nowOffset) {
+				frag.Offsets = append(frag.Offsets, span[cursor][0], int64(nowOffset))
+			}
+			next()
+			// stop early if no spans remain or enough fragments were collected
+			if cursor >= spanNum || int64(len(result)) >= numOfFragments {
+				break
+			}
+		}
+	}
+	if nowOffset > int(frag.StartOffset) {
+		next()
+	}
+	return result
+}
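
To make the merge step concrete, here is a self-contained sketch (Span, SpanList, and mergeOffsets are copied from this diff; only the main function and the sample spans are invented):

```go
package main

import (
	"fmt"
	"sort"
)

type (
	Span     [2]int64
	SpanList []Span
)

func (a SpanList) Len() int      { return len(a) }
func (a SpanList) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a SpanList) Less(i, j int) bool {
	if a[i][0] == a[j][0] {
		return a[i][1] < a[j][1]
	}
	return a[i][0] < a[j][0]
}

// Copied from the diff: sort spans, then merge overlapping or touching ones.
func mergeOffsets(input SpanList) SpanList {
	sort.Sort(input)
	maxEndOffset := int64(-1)
	offsets := SpanList{}
	for _, pair := range input {
		if pair[1] > maxEndOffset {
			if len(offsets) == 0 || pair[0] > offsets[len(offsets)-1][1] {
				offsets = append(offsets, pair)
			} else {
				offsets[len(offsets)-1][1] = pair[1]
			}
			maxEndOffset = pair[1]
		}
	}
	return offsets
}

func main() {
	// Overlapping token offsets as they might come out of the analyzer.
	spans := SpanList{{5, 9}, {0, 6}, {20, 25}, {8, 12}}
	fmt.Println(mergeOffsets(spans))
	// Output: [[0 12] [20 25]]
}
```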


@@ -1014,6 +1014,12 @@ message ValidateAnalyzerRequest{
   repeated AnalyzerInfo analyzer_infos = 2;
 }
 
+message HighlightOptions{
+  int64 fragment_size = 1;
+  int64 fragment_offset = 2;
+  int64 num_of_fragments = 3;
+}
+
 // HighlightTask fetch highlight for all queries at one field
 // len(texts) == search_text_num + corpus_text_num
 message HighlightTask{
@@ -1024,6 +1030,7 @@ message HighlightTask{
   int64 search_text_num = 5;
   int64 corpus_text_num = 6;
+  HighlightOptions options = 7;
 }
 
 message GetHighlightRequest{
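
A small hedged sketch of how these options are populated on the Go side, using the generated querypb types already imported elsewhere in this diff (the values mirror DefaultFragmentSize and DefaultFragmentNum from the proxy constants):

```go
package main

import (
	"fmt"

	"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
)

func main() {
	// Defaults set by createLexicalHighlighter: fragment size 100,
	// one fragment, no offset.
	opts := &querypb.HighlightOptions{
		FragmentSize:   100,
		FragmentOffset: 0,
		NumOfFragments: 1,
	}
	fmt.Println(opts.GetFragmentSize(), opts.GetFragmentOffset(), opts.GetNumOfFragments())

	// NumOfFragments == 0 disables fragmenting: getHighlightTexts then
	// returns the corpus texts unchanged (see the proxy hunk above).
	whole := &querypb.HighlightOptions{}
	fmt.Println(whole.GetNumOfFragments() == 0) // true
}
```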

File diff suppressed because it is too large Load Diff