mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 01:28:27 +08:00
feat: support use fragment config for highlight (#45099)
relate: https://github.com/milvus-io/milvus/issues/42589 Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
This commit is contained in:
parent
228eb0f5d0
commit
5efb0cedc8
@ -602,6 +602,11 @@ const (
|
||||
PreTagsKey = "pre_tags"
|
||||
PostTagsKey = "post_tags"
|
||||
HighlightSearchTextKey = "highlight_search_data"
|
||||
FragmentOffsetKey = "fragment_offset"
|
||||
FragmentSizeKey = "fragment_size"
|
||||
FragmentNumKey = "num_of_fragments"
|
||||
DefaultFragmentSize = 100
|
||||
DefaultFragmentNum = 1
|
||||
DefaultPreTag = "<em>"
|
||||
DefaultPostTag = "</em>"
|
||||
)
|
||||
@ -638,6 +643,41 @@ func newHighlightOperator(t *searchTask, _ map[string]any) (operator, error) {
|
||||
}, nil
|
||||
}
|
||||
|
||||
// sliceByRune returns the part of s covering the runes in the half-open range
// [start, end), where start and end are rune indexes rather than byte indexes.
//
// A negative start is treated as 0 and an end past the last rune is clamped to
// the end of s. The empty string is returned when start >= end or when start
// is at or beyond the number of runes in s.
func sliceByRune(s string, start, end int) string {
	if start < 0 {
		start = 0
	}
	if start >= end {
		return ""
	}

	// from defaults to len(s) so that a start past the last rune yields an
	// empty result instead of silently falling back to the beginning of s.
	from, to := len(s), len(s)
	runeIdx := 0
	for byteIdx := range s {
		if runeIdx == start {
			from = byteIdx
		}
		if runeIdx == end {
			to = byteIdx
			break
		}
		runeIdx++
	}

	return s[from:to]
}
|
||||
|
||||
// get slice texts according to fragment options
|
||||
func getHighlightTexts(task *querypb.HighlightTask, datas []string) []string {
|
||||
if task.GetOptions().GetNumOfFragments() == 0 {
|
||||
return datas
|
||||
}
|
||||
|
||||
results := make([]string, len(datas))
|
||||
offset := int(task.GetOptions().GetFragmentOffset())
|
||||
size := offset + int(task.GetOptions().GetFragmentSize()*task.GetOptions().GetNumOfFragments())
|
||||
for i, text := range datas {
|
||||
results[i] = sliceByRune(text, min(offset, len(text)), min(size, len(text)))
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
func (op *highlightOperator) run(ctx context.Context, span trace.Span, inputs ...any) ([]any, error) {
|
||||
result := inputs[0].(*milvuspb.SearchResults)
|
||||
datas := result.Results.GetFieldsData()
|
||||
@ -651,7 +691,7 @@ func (op *highlightOperator) run(ctx context.Context, span trace.Span, inputs ..
|
||||
if !ok {
|
||||
return nil, errors.Errorf("get highlight failed, text field not in output field %s: %d", task.GetFieldName(), task.GetFieldId())
|
||||
}
|
||||
texts := textFieldDatas.GetScalars().GetStringData().GetData()
|
||||
texts := getHighlightTexts(task, textFieldDatas.GetScalars().GetStringData().GetData())
|
||||
task.Texts = append(task.Texts, texts...)
|
||||
task.CorpusTextNum = int64(len(texts))
|
||||
field, ok := lo.Find(op.fieldSchemas, func(schema *schemapb.FieldSchema) bool {
|
||||
|
||||
@ -587,7 +587,12 @@ func (t *searchTask) getBM25SearchTexts(placeholder []byte) ([]string, error) {
|
||||
|
||||
func (t *searchTask) createLexicalHighlighter(highlighter *commonpb.Highlighter, metricType string, annsField int64, placeholder []byte, analyzerName string) error {
|
||||
task := &highlightTask{
|
||||
HighlightTask: &querypb.HighlightTask{},
|
||||
HighlightTask: &querypb.HighlightTask{
|
||||
Options: &querypb.HighlightOptions{
|
||||
FragmentSize: DefaultFragmentSize,
|
||||
NumOfFragments: DefaultFragmentNum,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
params := funcutil.KeyValuePair2Map(highlighter.GetParams())
|
||||
@ -649,6 +654,31 @@ func (t *searchTask) createLexicalHighlighter(highlighter *commonpb.Highlighter,
|
||||
task.postTags = [][]byte{[]byte(DefaultPostTag)}
|
||||
}
|
||||
|
||||
// set fragment config
|
||||
if value, ok := params[FragmentSizeKey]; ok {
|
||||
fragmentSize, err := strconv.ParseInt(value, 10, 64)
|
||||
if err != nil || fragmentSize <= 0 {
|
||||
return merr.WrapErrParameterInvalidMsg("invalid fragment_size: %s", value)
|
||||
}
|
||||
task.Options.FragmentSize = fragmentSize
|
||||
}
|
||||
|
||||
if value, ok := params[FragmentNumKey]; ok {
|
||||
fragmentNum, err := strconv.ParseInt(value, 10, 64)
|
||||
if err != nil || fragmentNum <= 0 {
|
||||
return merr.WrapErrParameterInvalidMsg("invalid fragment_size: %s", value)
|
||||
}
|
||||
task.Options.NumOfFragments = fragmentNum
|
||||
}
|
||||
|
||||
if value, ok := params[FragmentOffsetKey]; ok {
|
||||
fragmentOffset, err := strconv.ParseInt(value, 10, 64)
|
||||
if err != nil || fragmentOffset <= 0 {
|
||||
return merr.WrapErrParameterInvalidMsg("invalid fragment_size: %s", value)
|
||||
}
|
||||
task.Options.NumOfFragments = fragmentOffset
|
||||
}
|
||||
|
||||
// set bm25 search text as query texts
|
||||
texts, err := t.getBM25SearchTexts(placeholder)
|
||||
if err != nil {
|
||||
|
||||
@ -1042,13 +1042,15 @@ func (sd *shardDelegator) GetHighlight(ctx context.Context, req *querypb.GetHigh
|
||||
}
|
||||
|
||||
for j := 0; j < int(topks[i]); j++ {
|
||||
offsets := []int64{}
|
||||
spans := SpanList{}
|
||||
for _, token := range corpusResults[corpusIdx] {
|
||||
if tokenSet.Contain(token.GetToken()) {
|
||||
offsets = append(offsets, token.GetStartOffset(), token.GetEndOffset())
|
||||
spans = append(spans, Span{token.GetStartOffset(), token.GetEndOffset()})
|
||||
}
|
||||
}
|
||||
result = append(result, &querypb.HighlightResult{Fragments: []*querypb.HighlightFragment{{StartOffset: 0, EndOffset: int64(len(task.Texts[int(task.SearchTextNum)+corpusIdx])), Offsets: offsets}}})
|
||||
spans = mergeOffsets(spans)
|
||||
frags := fetchFragmentsFromOffsets(task.Texts[int(task.SearchTextNum)+corpusIdx], spans, task.GetOptions().GetFragmentSize(), task.GetOptions().GetNumOfFragments())
|
||||
result = append(result, &querypb.HighlightResult{Fragments: frags})
|
||||
corpusIdx++
|
||||
}
|
||||
}
|
||||
|
||||
@ -2,6 +2,8 @@ package delegator
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"unicode/utf8"
|
||||
|
||||
"go.uber.org/zap"
|
||||
"google.golang.org/protobuf/proto"
|
||||
@ -10,6 +12,7 @@ import (
|
||||
"github.com/milvus-io/milvus/pkg/v2/log"
|
||||
"github.com/milvus-io/milvus/pkg/v2/proto/internalpb"
|
||||
"github.com/milvus-io/milvus/pkg/v2/proto/planpb"
|
||||
"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
|
||||
"github.com/milvus-io/milvus/pkg/v2/util/merr"
|
||||
)
|
||||
|
||||
@ -62,3 +65,92 @@ func SetBM25Params(req *internalpb.SearchRequest, avgdl float64) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Span is a pair of offsets: element 0 is the start and element 1 is the end.
type Span [2]int64

// SpanList is a list of spans that implements sort.Interface, ordering spans
// by start offset and breaking ties by end offset.
type SpanList []Span

// Len returns the number of spans in the list.
func (s SpanList) Len() int {
	return len(s)
}

// Swap exchanges the spans at positions i and j.
func (s SpanList) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

// Less reports whether the span at i sorts before the span at j: smaller
// start first, and for equal starts the smaller end first.
func (s SpanList) Less(i, j int) bool {
	left, right := s[i], s[j]
	if left[0] != right[0] {
		return left[0] < right[0]
	}
	return left[1] < right[1]
}
|
||||
|
||||
// merge repeated segments
|
||||
func mergeOffsets(input SpanList) SpanList {
|
||||
sort.Sort(input)
|
||||
maxEndOffset := int64(-1)
|
||||
offsets := SpanList{}
|
||||
for _, pair := range input {
|
||||
if pair[1] > maxEndOffset {
|
||||
if len(offsets) == 0 || pair[0] > offsets[len(offsets)-1][1] {
|
||||
// if start offset > max offset before,
|
||||
// no any intersection with previous one,
|
||||
// use all pair.
|
||||
offsets = append(offsets, pair)
|
||||
} else {
|
||||
// if start offset <= max offset before,
|
||||
// has intersection with previous one,
|
||||
// merge two offset to one.
|
||||
offsets[len(offsets)-1][1] = pair[1]
|
||||
}
|
||||
maxEndOffset = pair[1]
|
||||
}
|
||||
}
|
||||
return offsets
|
||||
}
|
||||
|
||||
func fetchFragmentsFromOffsets(text string, span SpanList, fragmentSize int64, numOfFragments int64) []*querypb.HighlightFragment {
|
||||
result := make([]*querypb.HighlightFragment, 0)
|
||||
endPosition := int(fragmentSize)
|
||||
nowOffset := 0
|
||||
frag := &querypb.HighlightFragment{
|
||||
StartOffset: 0,
|
||||
}
|
||||
|
||||
next := func() {
|
||||
endPosition += int(fragmentSize)
|
||||
frag.EndOffset = int64(nowOffset)
|
||||
result = append(result, frag)
|
||||
frag = &querypb.HighlightFragment{
|
||||
StartOffset: int64(nowOffset),
|
||||
}
|
||||
}
|
||||
|
||||
cursor := 0
|
||||
spanNum := len(span)
|
||||
for i, r := range text {
|
||||
nowOffset += utf8.RuneLen(r)
|
||||
|
||||
// append if span was included in current fragment
|
||||
for ; cursor < spanNum && span[cursor][1] <= int64(nowOffset); cursor++ {
|
||||
if span[cursor][0] >= frag.StartOffset {
|
||||
frag.Offsets = append(frag.Offsets, span[cursor][0], span[cursor][1])
|
||||
} else {
|
||||
// if some span cross fragment start, append the part in current fragment
|
||||
frag.Offsets = append(frag.Offsets, frag.StartOffset, span[cursor][1])
|
||||
}
|
||||
}
|
||||
|
||||
if i >= endPosition {
|
||||
// if some span cross fragment end, append the part in current fragment
|
||||
if cursor < spanNum && span[cursor][0] < int64(nowOffset) {
|
||||
frag.Offsets = append(frag.Offsets, span[cursor][0], int64(nowOffset))
|
||||
}
|
||||
next()
|
||||
// skip all if no span remain or get enough num of fragments
|
||||
if cursor >= spanNum || int64(len(result)) >= numOfFragments {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if nowOffset > int(frag.StartOffset) {
|
||||
next()
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
@ -1014,6 +1014,12 @@ message ValidateAnalyzerRequest{
|
||||
repeated AnalyzerInfo analyzer_infos = 2;
|
||||
}
|
||||
|
||||
// HighlightOptions carries the fragment settings attached to a HighlightTask.
message HighlightOptions{
  // Size of one fragment. The proxy fills in 100 (DefaultFragmentSize) before
  // applying a user supplied "fragment_size" parameter.
  // NOTE(review): the proxy slices texts by rune while the delegator compares
  // this value against byte indexes - confirm the intended unit.
  int64 fragment_size = 1;
  // Start offset used by the proxy when cutting the text that is sent for
  // highlighting; it is passed to a rune based slice helper.
  int64 fragment_offset = 2;
  // Number of fragments to produce. The proxy fills in 1 (DefaultFragmentNum)
  // by default; when this is 0 the proxy sends the full texts untrimmed.
  int64 num_of_fragments = 3;
}
|
||||
|
||||
// HighlightTask fetch highlight for all queries at one field
|
||||
// len(texts) == search_text_num + corpus_text_num
|
||||
message HighlightTask{
|
||||
@ -1024,6 +1030,7 @@ message HighlightTask{
|
||||
|
||||
int64 search_text_num = 5;
|
||||
int64 corpus_text_num = 6;
|
||||
HighlightOptions options = 7;
|
||||
}
|
||||
|
||||
message GetHighlightRequest{
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user