mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 01:28:27 +08:00
feat: support use fragment config for highlight (#45099)
relate: https://github.com/milvus-io/milvus/issues/42589 Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
This commit is contained in:
parent
228eb0f5d0
commit
5efb0cedc8
@ -602,6 +602,11 @@ const (
|
|||||||
PreTagsKey = "pre_tags"
|
PreTagsKey = "pre_tags"
|
||||||
PostTagsKey = "post_tags"
|
PostTagsKey = "post_tags"
|
||||||
HighlightSearchTextKey = "highlight_search_data"
|
HighlightSearchTextKey = "highlight_search_data"
|
||||||
|
FragmentOffsetKey = "fragment_offset"
|
||||||
|
FragmentSizeKey = "fragment_size"
|
||||||
|
FragmentNumKey = "num_of_fragments"
|
||||||
|
DefaultFragmentSize = 100
|
||||||
|
DefaultFragmentNum = 1
|
||||||
DefaultPreTag = "<em>"
|
DefaultPreTag = "<em>"
|
||||||
DefaultPostTag = "</em>"
|
DefaultPostTag = "</em>"
|
||||||
)
|
)
|
||||||
@ -638,6 +643,41 @@ func newHighlightOperator(t *searchTask, _ map[string]any) (operator, error) {
|
|||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sliceByRune returns the substring of s that covers the runes in the
// half-open range [start, end), where start and end are rune (not byte)
// indices.
//
// Out-of-range indices are clamped instead of panicking:
//   - a negative start is treated as 0;
//   - an end beyond the last rune is treated as the end of s;
//   - a start at or beyond the last rune, or start >= end, yields "".
func sliceByRune(s string, start, end int) string {
	if start < 0 {
		start = 0
	}
	if start >= end {
		return ""
	}

	// Both byte bounds default to len(s): if the loop never reaches rune
	// index start (or end), the corresponding bound is the end of the string.
	// Defaulting from to 0 would wrongly return the whole string whenever
	// start is past the last rune.
	i, from, to := 0, len(s), len(s)
	for idx := range s {
		if i == start {
			from = idx
		}
		if i == end {
			to = idx
			break
		}
		i++
	}

	return s[from:to]
}
|
||||||
|
|
||||||
|
// get slice texts according to fragment options
|
||||||
|
func getHighlightTexts(task *querypb.HighlightTask, datas []string) []string {
|
||||||
|
if task.GetOptions().GetNumOfFragments() == 0 {
|
||||||
|
return datas
|
||||||
|
}
|
||||||
|
|
||||||
|
results := make([]string, len(datas))
|
||||||
|
offset := int(task.GetOptions().GetFragmentOffset())
|
||||||
|
size := offset + int(task.GetOptions().GetFragmentSize()*task.GetOptions().GetNumOfFragments())
|
||||||
|
for i, text := range datas {
|
||||||
|
results[i] = sliceByRune(text, min(offset, len(text)), min(size, len(text)))
|
||||||
|
}
|
||||||
|
return results
|
||||||
|
}
|
||||||
|
|
||||||
func (op *highlightOperator) run(ctx context.Context, span trace.Span, inputs ...any) ([]any, error) {
|
func (op *highlightOperator) run(ctx context.Context, span trace.Span, inputs ...any) ([]any, error) {
|
||||||
result := inputs[0].(*milvuspb.SearchResults)
|
result := inputs[0].(*milvuspb.SearchResults)
|
||||||
datas := result.Results.GetFieldsData()
|
datas := result.Results.GetFieldsData()
|
||||||
@ -651,7 +691,7 @@ func (op *highlightOperator) run(ctx context.Context, span trace.Span, inputs ..
|
|||||||
if !ok {
|
if !ok {
|
||||||
return nil, errors.Errorf("get highlight failed, text field not in output field %s: %d", task.GetFieldName(), task.GetFieldId())
|
return nil, errors.Errorf("get highlight failed, text field not in output field %s: %d", task.GetFieldName(), task.GetFieldId())
|
||||||
}
|
}
|
||||||
texts := textFieldDatas.GetScalars().GetStringData().GetData()
|
texts := getHighlightTexts(task, textFieldDatas.GetScalars().GetStringData().GetData())
|
||||||
task.Texts = append(task.Texts, texts...)
|
task.Texts = append(task.Texts, texts...)
|
||||||
task.CorpusTextNum = int64(len(texts))
|
task.CorpusTextNum = int64(len(texts))
|
||||||
field, ok := lo.Find(op.fieldSchemas, func(schema *schemapb.FieldSchema) bool {
|
field, ok := lo.Find(op.fieldSchemas, func(schema *schemapb.FieldSchema) bool {
|
||||||
|
|||||||
@ -587,7 +587,12 @@ func (t *searchTask) getBM25SearchTexts(placeholder []byte) ([]string, error) {
|
|||||||
|
|
||||||
func (t *searchTask) createLexicalHighlighter(highlighter *commonpb.Highlighter, metricType string, annsField int64, placeholder []byte, analyzerName string) error {
|
func (t *searchTask) createLexicalHighlighter(highlighter *commonpb.Highlighter, metricType string, annsField int64, placeholder []byte, analyzerName string) error {
|
||||||
task := &highlightTask{
|
task := &highlightTask{
|
||||||
HighlightTask: &querypb.HighlightTask{},
|
HighlightTask: &querypb.HighlightTask{
|
||||||
|
Options: &querypb.HighlightOptions{
|
||||||
|
FragmentSize: DefaultFragmentSize,
|
||||||
|
NumOfFragments: DefaultFragmentNum,
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
params := funcutil.KeyValuePair2Map(highlighter.GetParams())
|
params := funcutil.KeyValuePair2Map(highlighter.GetParams())
|
||||||
@ -649,6 +654,31 @@ func (t *searchTask) createLexicalHighlighter(highlighter *commonpb.Highlighter,
|
|||||||
task.postTags = [][]byte{[]byte(DefaultPostTag)}
|
task.postTags = [][]byte{[]byte(DefaultPostTag)}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// set fragment config
|
||||||
|
if value, ok := params[FragmentSizeKey]; ok {
|
||||||
|
fragmentSize, err := strconv.ParseInt(value, 10, 64)
|
||||||
|
if err != nil || fragmentSize <= 0 {
|
||||||
|
return merr.WrapErrParameterInvalidMsg("invalid fragment_size: %s", value)
|
||||||
|
}
|
||||||
|
task.Options.FragmentSize = fragmentSize
|
||||||
|
}
|
||||||
|
|
||||||
|
// num_of_fragments: maximum number of fragments per text, must be positive.
if value, ok := params[FragmentNumKey]; ok {
	fragmentNum, err := strconv.ParseInt(value, 10, 64)
	if err != nil || fragmentNum <= 0 {
		return merr.WrapErrParameterInvalidMsg("invalid num_of_fragments: %s", value)
	}
	task.Options.NumOfFragments = fragmentNum
}

// fragment_offset: where the highlighted window starts inside the text.
// Zero is the default and therefore a valid explicit value; only negative
// offsets are rejected.
if value, ok := params[FragmentOffsetKey]; ok {
	fragmentOffset, err := strconv.ParseInt(value, 10, 64)
	if err != nil || fragmentOffset < 0 {
		return merr.WrapErrParameterInvalidMsg("invalid fragment_offset: %s", value)
	}
	// Store into FragmentOffset; writing it to NumOfFragments would clobber
	// the fragment count and leave the offset unset.
	task.Options.FragmentOffset = fragmentOffset
}
|
||||||
|
|
||||||
// set bm25 search text as query texts
|
// set bm25 search text as query texts
|
||||||
texts, err := t.getBM25SearchTexts(placeholder)
|
texts, err := t.getBM25SearchTexts(placeholder)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@ -1042,13 +1042,15 @@ func (sd *shardDelegator) GetHighlight(ctx context.Context, req *querypb.GetHigh
|
|||||||
}
|
}
|
||||||
|
|
||||||
for j := 0; j < int(topks[i]); j++ {
|
for j := 0; j < int(topks[i]); j++ {
|
||||||
offsets := []int64{}
|
spans := SpanList{}
|
||||||
for _, token := range corpusResults[corpusIdx] {
|
for _, token := range corpusResults[corpusIdx] {
|
||||||
if tokenSet.Contain(token.GetToken()) {
|
if tokenSet.Contain(token.GetToken()) {
|
||||||
offsets = append(offsets, token.GetStartOffset(), token.GetEndOffset())
|
spans = append(spans, Span{token.GetStartOffset(), token.GetEndOffset()})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result = append(result, &querypb.HighlightResult{Fragments: []*querypb.HighlightFragment{{StartOffset: 0, EndOffset: int64(len(task.Texts[int(task.SearchTextNum)+corpusIdx])), Offsets: offsets}}})
|
spans = mergeOffsets(spans)
|
||||||
|
frags := fetchFragmentsFromOffsets(task.Texts[int(task.SearchTextNum)+corpusIdx], spans, task.GetOptions().GetFragmentSize(), task.GetOptions().GetNumOfFragments())
|
||||||
|
result = append(result, &querypb.HighlightResult{Fragments: frags})
|
||||||
corpusIdx++
|
corpusIdx++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,6 +2,8 @@ package delegator
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
"go.uber.org/zap"
|
"go.uber.org/zap"
|
||||||
"google.golang.org/protobuf/proto"
|
"google.golang.org/protobuf/proto"
|
||||||
@ -10,6 +12,7 @@ import (
|
|||||||
"github.com/milvus-io/milvus/pkg/v2/log"
|
"github.com/milvus-io/milvus/pkg/v2/log"
|
||||||
"github.com/milvus-io/milvus/pkg/v2/proto/internalpb"
|
"github.com/milvus-io/milvus/pkg/v2/proto/internalpb"
|
||||||
"github.com/milvus-io/milvus/pkg/v2/proto/planpb"
|
"github.com/milvus-io/milvus/pkg/v2/proto/planpb"
|
||||||
|
"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
|
||||||
"github.com/milvus-io/milvus/pkg/v2/util/merr"
|
"github.com/milvus-io/milvus/pkg/v2/util/merr"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -62,3 +65,92 @@ func SetBM25Params(req *internalpb.SearchRequest, avgdl float64) error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Span is a pair of offsets: index 0 holds the start offset and index 1 the
// end offset of one matched token.
type Span [2]int64

// SpanList is a list of spans. It implements sort.Interface, ordering spans
// by start offset and breaking ties by end offset.
type SpanList []Span

// Len reports the number of spans in the list.
func (a SpanList) Len() int {
	return len(a)
}

// Swap exchanges the spans at positions i and j.
func (a SpanList) Swap(i, j int) {
	a[j], a[i] = a[i], a[j]
}

// Less reports whether the span at i sorts before the span at j: the smaller
// start offset wins, and equal starts are ordered by end offset.
func (a SpanList) Less(i, j int) bool {
	left, right := a[i], a[j]
	if left[0] != right[0] {
		return left[0] < right[0]
	}
	return left[1] < right[1]
}
|
||||||
|
|
||||||
|
// merge repeated segments
|
||||||
|
func mergeOffsets(input SpanList) SpanList {
|
||||||
|
sort.Sort(input)
|
||||||
|
maxEndOffset := int64(-1)
|
||||||
|
offsets := SpanList{}
|
||||||
|
for _, pair := range input {
|
||||||
|
if pair[1] > maxEndOffset {
|
||||||
|
if len(offsets) == 0 || pair[0] > offsets[len(offsets)-1][1] {
|
||||||
|
// if start offset > max offset before,
|
||||||
|
// no any intersection with previous one,
|
||||||
|
// use all pair.
|
||||||
|
offsets = append(offsets, pair)
|
||||||
|
} else {
|
||||||
|
// if start offset <= max offset before,
|
||||||
|
// has intersection with previous one,
|
||||||
|
// merge two offset to one.
|
||||||
|
offsets[len(offsets)-1][1] = pair[1]
|
||||||
|
}
|
||||||
|
maxEndOffset = pair[1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return offsets
|
||||||
|
}
|
||||||
|
|
||||||
|
func fetchFragmentsFromOffsets(text string, span SpanList, fragmentSize int64, numOfFragments int64) []*querypb.HighlightFragment {
|
||||||
|
result := make([]*querypb.HighlightFragment, 0)
|
||||||
|
endPosition := int(fragmentSize)
|
||||||
|
nowOffset := 0
|
||||||
|
frag := &querypb.HighlightFragment{
|
||||||
|
StartOffset: 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
next := func() {
|
||||||
|
endPosition += int(fragmentSize)
|
||||||
|
frag.EndOffset = int64(nowOffset)
|
||||||
|
result = append(result, frag)
|
||||||
|
frag = &querypb.HighlightFragment{
|
||||||
|
StartOffset: int64(nowOffset),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cursor := 0
|
||||||
|
spanNum := len(span)
|
||||||
|
for i, r := range text {
|
||||||
|
nowOffset += utf8.RuneLen(r)
|
||||||
|
|
||||||
|
// append if span was included in current fragment
|
||||||
|
for ; cursor < spanNum && span[cursor][1] <= int64(nowOffset); cursor++ {
|
||||||
|
if span[cursor][0] >= frag.StartOffset {
|
||||||
|
frag.Offsets = append(frag.Offsets, span[cursor][0], span[cursor][1])
|
||||||
|
} else {
|
||||||
|
// if some span cross fragment start, append the part in current fragment
|
||||||
|
frag.Offsets = append(frag.Offsets, frag.StartOffset, span[cursor][1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if i >= endPosition {
|
||||||
|
// if some span cross fragment end, append the part in current fragment
|
||||||
|
if cursor < spanNum && span[cursor][0] < int64(nowOffset) {
|
||||||
|
frag.Offsets = append(frag.Offsets, span[cursor][0], int64(nowOffset))
|
||||||
|
}
|
||||||
|
next()
|
||||||
|
// skip all if no span remain or get enough num of fragments
|
||||||
|
if cursor >= spanNum || int64(len(result)) >= numOfFragments {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if nowOffset > int(frag.StartOffset) {
|
||||||
|
next()
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|||||||
@ -1014,6 +1014,12 @@ message ValidateAnalyzerRequest{
|
|||||||
repeated AnalyzerInfo analyzer_infos = 2;
|
repeated AnalyzerInfo analyzer_infos = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HighlightOptions carries the fragment configuration of a highlight task.
message HighlightOptions{
  // Requested size of a single highlight fragment
  // (search param "fragment_size", proxy default 100).
  int64 fragment_size = 1;
  // Rune index at which the proxy starts the text window it passes on for
  // highlighting (search param "fragment_offset").
  int64 fragment_offset = 2;
  // Maximum number of fragments per text
  // (search param "num_of_fragments", proxy default 1).
  // When 0 the proxy passes the texts on without slicing them.
  int64 num_of_fragments = 3;
}
|
||||||
|
|
||||||
// HighlightTask fetch highlight for all queries at one field
|
// HighlightTask fetch highlight for all queries at one field
|
||||||
// len(texts) == search_text_num + corpus_text_num
|
// len(texts) == search_text_num + corpus_text_num
|
||||||
message HighlightTask{
|
message HighlightTask{
|
||||||
@ -1024,6 +1030,7 @@ message HighlightTask{
|
|||||||
|
|
||||||
int64 search_text_num = 5;
|
int64 search_text_num = 5;
|
||||||
int64 corpus_text_num = 6;
|
int64 corpus_text_num = 6;
|
||||||
|
HighlightOptions options = 7;
|
||||||
}
|
}
|
||||||
|
|
||||||
message GetHighlightRequest{
|
message GetHighlightRequest{
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user