From 71bd07bcf7050e249c66d9d6523db09878fdc204 Mon Sep 17 00:00:00 2001
From: wei liu
Date: Wed, 29 Oct 2025 20:12:13 +0800
Subject: [PATCH] fix: Handle empty FieldsData in reduce/rerank for requery scenario (#44917) (#45137)

issue: #44909
pr: #44917

When the requery optimization is enabled, search results contain IDs but
empty FieldsData. During reduce/rerank, if the first shard returns empty
FieldsData while other shards carry data, PrepareResultFieldData builds an
empty result array and AppendFieldData then panics with an index out of
range when it copies field data from a shard that does have it.

Changes:
- Find the first non-empty FieldsData to use as the result template in three
  functions: reduceAdvanceGroupBy, reduceSearchResultDataWithGroupBy and
  reduceSearchResultDataNoGroupBy
- Add a length check before the two AppendFieldData calls in the reduce
  functions to prevent the panic
- Improve newRerankOutputs to find the first non-empty fieldData using a
  len(FieldsData) check instead of GetSizeOfIDs
- Add a length check in appendResult before calling AppendFieldData
- Add unit tests covering empty and partially empty FieldsData scenarios for
  both the reduce and rerank paths

This fix handles both pure requery (all shards empty) and mixed scenarios
(some shards empty, some with data) without changing the normal search flow.
The key point is to check the FieldsData length directly rather than the IDs,
because a requery result may carry IDs while its FieldsData is empty.

Signed-off-by: Wei Liu
---
 internal/proxy/search_reduce_util.go       |  36 ++++-
 internal/proxy/search_reduce_util_test.go  | 160 +++++++++++++++++++++
 internal/util/function/rerank/util.go      |  10 +-
 internal/util/function/rerank/util_test.go | 136 ++++++++++++++++++
 4 files changed, 334 insertions(+), 8 deletions(-)

diff --git a/internal/proxy/search_reduce_util.go b/internal/proxy/search_reduce_util.go
index 71c556f8ee..b7695438c1 100644
--- a/internal/proxy/search_reduce_util.go
+++ b/internal/proxy/search_reduce_util.go
@@ -101,7 +101,13 @@ func reduceAdvanceGroupBy(ctx context.Context, subSearchResultData []*schemapb.S
 	} else {
 		ret.GetResults().AllSearchCount = allSearchCount
 		limit = int64(hitNum)
-		ret.GetResults().FieldsData = typeutil.PrepareResultFieldData(subSearchResultData[0].GetFieldsData(), limit)
+		// Find the first non-empty FieldsData as template
+		for _, result := range subSearchResultData {
+			if len(result.GetFieldsData()) > 0 {
+				ret.GetResults().FieldsData = typeutil.PrepareResultFieldData(result.GetFieldsData(), limit)
+				break
+			}
+		}
 	}
 
 	if err := setupIdListForSearchResult(ret, pkType, limit); err != nil {
@@ -193,7 +199,7 @@ func reduceSearchResultDataWithGroupBy(ctx context.Context, subSearchResultData
 		Results: &schemapb.SearchResultData{
 			NumQueries: nq,
 			TopK:       topk,
-			FieldsData: typeutil.PrepareResultFieldData(subSearchResultData[0].GetFieldsData(), limit),
+			FieldsData: []*schemapb.FieldData{},
 			Scores:     []float32{},
 			Ids:        &schemapb.IDs{},
 			Topks:      []int64{},
@@ -211,6 +217,14 @@ func reduceSearchResultDataWithGroupBy(ctx context.Context, subSearchResultData
 		ret.GetResults().AllSearchCount = allSearchCount
 	}
 
+	// Find the first non-empty FieldsData as template
+	for _, result := range subSearchResultData {
+		if len(result.GetFieldsData()) > 0 {
+			ret.GetResults().FieldsData = typeutil.PrepareResultFieldData(result.GetFieldsData(), limit)
+			break
+		}
+	}
+
 	var (
 		subSearchNum = len(subSearchResultData)
 		// for results of each subSearchResultData, storing the start offset of each query of nq queries
@@ -289,7 +303,9 @@ func reduceSearchResultDataWithGroupBy(ctx context.Context, subSearchResultData
 			groupEntities := groupByValMap[groupVal]
 			for _, groupEntity := range groupEntities {
 				subResData := subSearchResultData[groupEntity.subSearchIdx]
-				retSize += typeutil.AppendFieldData(ret.Results.FieldsData, subResData.FieldsData, groupEntity.resultIdx)
+				if len(ret.Results.FieldsData) > 0 {
+					retSize += typeutil.AppendFieldData(ret.Results.FieldsData, subResData.FieldsData, groupEntity.resultIdx)
+				}
 				typeutil.AppendPKs(ret.Results.Ids, groupEntity.id)
 				ret.Results.Scores = append(ret.Results.Scores, groupEntity.score)
 				gpFieldBuilder.Add(groupVal)
@@ -336,7 +352,7 @@ func reduceSearchResultDataNoGroupBy(ctx context.Context, subSearchResultData []
 		Results: &schemapb.SearchResultData{
 			NumQueries: nq,
 			TopK:       topk,
-			FieldsData: typeutil.PrepareResultFieldData(subSearchResultData[0].GetFieldsData(), limit),
+			FieldsData: []*schemapb.FieldData{},
 			Scores:     []float32{},
 			Ids:        &schemapb.IDs{},
 			Topks:      []int64{},
@@ -354,6 +370,14 @@ func reduceSearchResultDataNoGroupBy(ctx context.Context, subSearchResultData []
 		ret.GetResults().AllSearchCount = allSearchCount
 	}
 
+	// Find the first non-empty FieldsData as template
+	for _, result := range subSearchResultData {
+		if len(result.GetFieldsData()) > 0 {
+			ret.GetResults().FieldsData = typeutil.PrepareResultFieldData(result.GetFieldsData(), limit)
+			break
+		}
+	}
+
 	subSearchNum := len(subSearchResultData)
 	if subSearchNum == 1 && offset == 0 {
 		// sorting is not needed if there is only one shard and no offset, assigning the result directly.
@@ -407,7 +431,9 @@ func reduceSearchResultDataNoGroupBy(ctx context.Context, subSearchResultData []
 			}
 
 			score := subSearchResultData[subSearchIdx].Scores[resultDataIdx]
-			retSize += typeutil.AppendFieldData(ret.Results.FieldsData, subSearchResultData[subSearchIdx].FieldsData, resultDataIdx)
+			if len(ret.Results.FieldsData) > 0 {
+				retSize += typeutil.AppendFieldData(ret.Results.FieldsData, subSearchResultData[subSearchIdx].FieldsData, resultDataIdx)
+			}
 			typeutil.CopyPk(ret.Results.Ids, subSearchResultData[subSearchIdx].GetIds(), int(resultDataIdx))
 			ret.Results.Scores = append(ret.Results.Scores, score)
 			cursors[subSearchIdx]++
diff --git a/internal/proxy/search_reduce_util_test.go b/internal/proxy/search_reduce_util_test.go
index 02f2cddeec..f7496044bd 100644
--- a/internal/proxy/search_reduce_util_test.go
+++ b/internal/proxy/search_reduce_util_test.go
@@ -84,6 +84,166 @@ func (struts *SearchReduceUtilTestSuite) TestReduceSearchResultWithEmtpyGroupDat
 	struts.Nil(results.Results.GetGroupByFieldValue())
 }
 
+// TestReduceWithEmptyFieldsData tests reduce functions when FieldsData is empty (requery scenario)
+func (struts *SearchReduceUtilTestSuite) TestReduceWithEmptyFieldsData() {
+	ctx := context.Background()
+	nq := int64(1)
+	topK := int64(5)
+	offset := int64(0)
+
+	// Create search results with empty FieldsData (simulating requery scenario)
+	searchResultData1 := &schemapb.SearchResultData{
+		Ids: &schemapb.IDs{
+			IdField: &schemapb.IDs_IntId{
+				IntId: &schemapb.LongArray{
+					Data: []int64{1, 2, 3, 4, 5},
+				},
+			},
+		},
+		Scores:     []float32{0.9, 0.8, 0.7, 0.6, 0.5},
+		Topks:      []int64{5},
+		NumQueries: nq,
+		TopK:       topK,
+		FieldsData: []*schemapb.FieldData{}, // Empty FieldsData for requery
+	}
+
+	searchResultData2 := &schemapb.SearchResultData{
+		Ids: &schemapb.IDs{
+			IdField: &schemapb.IDs_IntId{
+				IntId: &schemapb.LongArray{
+					Data: []int64{6, 7, 8, 9, 10},
+				},
+			},
+		},
+		Scores:     []float32{0.85, 0.75, 0.65, 0.55, 0.45},
+		Topks:      []int64{5},
+		NumQueries: nq,
+		TopK:       topK,
+		FieldsData: []*schemapb.FieldData{}, // Empty FieldsData for requery
+	}
+
+	// Test reduceSearchResultDataNoGroupBy with empty FieldsData
+	{
+		results, err := reduceSearchResultDataNoGroupBy(ctx, []*schemapb.SearchResultData{searchResultData1, searchResultData2}, nq, topK, "L2", schemapb.DataType_Int64, offset)
+		struts.NoError(err)
+		struts.NotNil(results)
+		// Should have merged results without panic
+		struts.Equal(int64(5), results.Results.Topks[0])
+		// FieldsData should be empty since all inputs were empty
+		struts.Equal(0, len(results.Results.FieldsData))
+	}
+
+	// Test reduceSearchResultDataWithGroupBy with empty FieldsData
+	{
+		// Add GroupByFieldValue to support group by
+		searchResultData1.GroupByFieldValue = &schemapb.FieldData{
+			Type:      schemapb.DataType_VarChar,
+			FieldName: "group",
+			FieldId:   101,
+			Field: &schemapb.FieldData_Scalars{
+				Scalars: &schemapb.ScalarField{
+					Data: &schemapb.ScalarField_StringData{
+						StringData: &schemapb.StringArray{
+							Data: []string{"a", "b", "c", "a", "b"},
+						},
+					},
+				},
+			},
+		}
+		searchResultData2.GroupByFieldValue = &schemapb.FieldData{
+			Type:      schemapb.DataType_VarChar,
+			FieldName: "group",
+			FieldId:   101,
+			Field: &schemapb.FieldData_Scalars{
+				Scalars: &schemapb.ScalarField{
+					Data: &schemapb.ScalarField_StringData{
+						StringData: &schemapb.StringArray{
+							Data: []string{"c", "a", "b", "c", "a"},
+						},
+					},
+				},
+			},
+		}
+
+		results, err := reduceSearchResultDataWithGroupBy(ctx, []*schemapb.SearchResultData{searchResultData1, searchResultData2}, nq, topK, "L2", schemapb.DataType_Int64, offset, int64(2))
+		struts.NoError(err)
+		struts.NotNil(results)
+		// FieldsData should be empty since all inputs were empty
+		struts.Equal(0, len(results.Results.FieldsData))
+	}
+
+	// Test reduceAdvanceGroupBy with empty FieldsData
+	{
+		results, err := reduceAdvanceGroupBy(ctx, []*schemapb.SearchResultData{searchResultData1, searchResultData2}, nq, topK, schemapb.DataType_Int64, "L2")
+		struts.NoError(err)
+		struts.NotNil(results)
+		// FieldsData should be empty since all inputs were empty
+		struts.Equal(0, len(results.Results.FieldsData))
+	}
+}
+
+// TestReduceWithPartialEmptyFieldsData tests when first result has empty FieldsData but second has data
+func (struts *SearchReduceUtilTestSuite) TestReduceWithPartialEmptyFieldsData() {
+	ctx := context.Background()
+	nq := int64(1)
+	topK := int64(3)
+	offset := int64(0)
+
+	// First result with empty FieldsData
+	searchResultData1 := &schemapb.SearchResultData{
+		Ids: &schemapb.IDs{
+			IdField: &schemapb.IDs_IntId{
+				IntId: &schemapb.LongArray{
+					Data: []int64{1, 2, 3},
+				},
+			},
+		},
+		Scores:     []float32{0.9, 0.8, 0.7},
+		Topks:      []int64{3},
+		NumQueries: nq,
+		TopK:       topK,
+		FieldsData: []*schemapb.FieldData{}, // Empty
+	}
+
+	// Second result with non-empty FieldsData
+	searchResultData2 := &schemapb.SearchResultData{
+		Ids: &schemapb.IDs{
+			IdField: &schemapb.IDs_IntId{
+				IntId: &schemapb.LongArray{
+					Data: []int64{4, 5, 6},
+				},
+			},
+		},
+		Scores:     []float32{0.85, 0.75, 0.65},
+		Topks:      []int64{3},
+		NumQueries: nq,
+		TopK:       topK,
+		FieldsData: []*schemapb.FieldData{
+			{
+				Type:      schemapb.DataType_Int64,
+				FieldName: "field1",
+				FieldId:   100,
+				Field: &schemapb.FieldData_Scalars{
+					Scalars: &schemapb.ScalarField{
+						Data: &schemapb.ScalarField_LongData{
+							LongData: &schemapb.LongArray{
+								Data: []int64{40, 50, 60},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	// Test: Should use the non-empty FieldsData from second result
+	results, err := reduceSearchResultDataNoGroupBy(ctx, []*schemapb.SearchResultData{searchResultData1, searchResultData2}, nq, topK, "L2", schemapb.DataType_Int64, offset)
+	struts.NoError(err)
+	struts.NotNil(results)
+	// Should have initialized FieldsData from second result
+	struts.Greater(len(results.Results.FieldsData), 0)
+}
+
 func TestSearchReduceUtilTestSuite(t *testing.T) {
 	suite.Run(t, new(SearchReduceUtilTestSuite))
 }
diff --git a/internal/util/function/rerank/util.go b/internal/util/function/rerank/util.go
index 2cd544523a..7f552c9133 100644
--- a/internal/util/function/rerank/util.go
+++ b/internal/util/function/rerank/util.go
@@ -146,8 +146,12 @@ func newRerankOutputs(inputs *rerankInputs, searchParams *SearchParams) *rerankO
 		Ids:    &schemapb.IDs{},
 		Topks:  []int64{},
 	}
-	if len(inputs.fieldData) > 0 {
-		ret.FieldsData = typeutil.PrepareResultFieldData(inputs.fieldData[0].GetFieldsData(), searchParams.limit)
+	// Find the first non-empty fieldData and prepare result fields
+	for _, fieldData := range inputs.fieldData {
+		if fieldData != nil && len(fieldData.GetFieldsData()) > 0 {
+			ret.FieldsData = typeutil.PrepareResultFieldData(fieldData.GetFieldsData(), searchParams.limit)
+			break
+		}
 	}
 	return &rerankOutputs{ret}
 }
@@ -157,7 +161,7 @@ func appendResult[T PKType](inputs *rerankInputs, outputs *rerankOutputs, idScor
 	scores := idScores.scores
 	outputs.searchResultData.Topks = append(outputs.searchResultData.Topks, int64(len(ids)))
 	outputs.searchResultData.Scores = append(outputs.searchResultData.Scores, scores...)
-	if len(inputs.fieldData) > 0 {
+	if len(inputs.fieldData) > 0 && len(outputs.searchResultData.FieldsData) > 0 {
 		for idx := range ids {
 			loc := idScores.locations[idx]
 			typeutil.AppendFieldData(outputs.searchResultData.FieldsData, inputs.fieldData[loc.batchIdx].GetFieldsData(), int64(loc.offset))
diff --git a/internal/util/function/rerank/util_test.go b/internal/util/function/rerank/util_test.go
index b650985238..ec6d4c4071 100644
--- a/internal/util/function/rerank/util_test.go
+++ b/internal/util/function/rerank/util_test.go
@@ -202,3 +202,139 @@ func (s *UtilSuite) TestIsCrossMetrics() {
 		s.True(descending)
 	}
 }
+
+// TestNewRerankOutputsWithEmptyFieldsData tests newRerankOutputs when FieldsData is empty (requery scenario)
+func (s *UtilSuite) TestNewRerankOutputsWithEmptyFieldsData() {
+	// Test case 1: All fieldData have empty FieldsData
+	{
+		inputs := &rerankInputs{
+			fieldData: []*schemapb.SearchResultData{
+				{
+					Ids: &schemapb.IDs{
+						IdField: &schemapb.IDs_IntId{
+							IntId: &schemapb.LongArray{
+								Data: []int64{1, 2, 3},
+							},
+						},
+					},
+					FieldsData: []*schemapb.FieldData{}, // Empty
+				},
+				{
+					Ids: &schemapb.IDs{
+						IdField: &schemapb.IDs_IntId{
+							IntId: &schemapb.LongArray{
+								Data: []int64{4, 5, 6},
+							},
+						},
+					},
+					FieldsData: []*schemapb.FieldData{}, // Empty
+				},
+			},
+		}
+		searchParams := &SearchParams{limit: 10}
+		outputs := newRerankOutputs(inputs, searchParams)
+		s.NotNil(outputs)
+		// FieldsData should be empty since all inputs were empty
+		s.Equal(0, len(outputs.searchResultData.FieldsData))
+	}
+
+	// Test case 2: First fieldData has empty FieldsData, second has data
+	{
+		inputs := &rerankInputs{
+			fieldData: []*schemapb.SearchResultData{
+				{
+					Ids: &schemapb.IDs{
+						IdField: &schemapb.IDs_IntId{
+							IntId: &schemapb.LongArray{
+								Data: []int64{1, 2, 3},
+							},
+						},
+					},
+					FieldsData: []*schemapb.FieldData{}, // Empty
+				},
+				{
+					Ids: &schemapb.IDs{
+						IdField: &schemapb.IDs_IntId{
+							IntId: &schemapb.LongArray{
+								Data: []int64{4, 5, 6},
+							},
+						},
+					},
+					FieldsData: []*schemapb.FieldData{
+						{
+							Type:      schemapb.DataType_Int64,
+							FieldName: "field1",
+							FieldId:   100,
+							Field: &schemapb.FieldData_Scalars{
+								Scalars: &schemapb.ScalarField{
+									Data: &schemapb.ScalarField_LongData{
+										LongData: &schemapb.LongArray{
+											Data: []int64{40, 50, 60},
+										},
+									},
+								},
+							},
+						},
+					},
+				},
+			},
+		}
+		searchParams := &SearchParams{limit: 10}
+		outputs := newRerankOutputs(inputs, searchParams)
+		s.NotNil(outputs)
+		// Should use the second fieldData which has non-empty FieldsData
+		s.Greater(len(outputs.searchResultData.FieldsData), 0)
+	}
+
+	// Test case 3: nil fieldData
+	{
+		inputs := &rerankInputs{
+			fieldData: []*schemapb.SearchResultData{nil, nil},
+		}
+		searchParams := &SearchParams{limit: 10}
+		outputs := newRerankOutputs(inputs, searchParams)
+		s.NotNil(outputs)
+		// FieldsData should be empty
+		s.Equal(0, len(outputs.searchResultData.FieldsData))
+	}
+}
+
+// TestAppendResultWithEmptyFieldsData tests appendResult when FieldsData is empty
+func (s *UtilSuite) TestAppendResultWithEmptyFieldsData() {
+	// Test case: appendResult should not panic when FieldsData is empty
+	inputs := &rerankInputs{
+		fieldData: []*schemapb.SearchResultData{
+			{
+				Ids: &schemapb.IDs{
+					IdField: &schemapb.IDs_IntId{
+						IntId: &schemapb.LongArray{
+							Data: []int64{1, 2, 3},
+						},
+					},
+				},
+				FieldsData: []*schemapb.FieldData{}, // Empty
+			},
+		},
+	}
+	searchParams := &SearchParams{limit: 10}
+	outputs := newRerankOutputs(inputs, searchParams)
+
+	// Create idScores with locations
+	idScores := &IDScores[int64]{
+		ids:       []int64{1, 2},
+		scores:    []float32{0.9, 0.8},
+		locations: []IDLoc{{batchIdx: 0, offset: 0}, {batchIdx: 0, offset: 1}},
+	}
+
+	// This should not panic even when FieldsData is empty
+	s.NotPanics(func() {
+		appendResult(inputs, outputs, idScores)
+	})
+
+	// Verify that IDs and scores were appended correctly
+	s.Equal(int64(2), outputs.searchResultData.Topks[0])
+	s.Equal([]float32{0.9, 0.8}, outputs.searchResultData.Scores)
+	s.Equal([]int64{1, 2}, outputs.searchResultData.Ids.GetIntId().Data)
+	// FieldsData should still be empty
+	s.Equal(0, len(outputs.searchResultData.FieldsData))
+}
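
Every hunk above applies the same guard: pick the first shard whose FieldsData is non-empty as the template for the merged result, and skip the per-row append when no shard carries field data (pure requery). The following is a minimal standalone sketch of that pattern; it uses simplified stand-in types (shardResult, mergeTopK) rather than the schemapb/typeutil APIs from the diff, so it illustrates the guard only, not the Milvus implementation.

    package main

    import "fmt"

    // shardResult is a simplified stand-in for schemapb.SearchResultData:
    // IDs are always present, FieldsData may be empty when requery is enabled.
    type shardResult struct {
    	IDs        []int64
    	FieldsData [][]int64 // one column of values per output field
    }

    // mergeTopK sketches the reduce-side guard: the template comes from the first
    // shard that actually has field data, and appends are skipped when none does,
    // so a pure-requery merge returns IDs only instead of panicking.
    func mergeTopK(shards []shardResult) shardResult {
    	var merged shardResult

    	// Find the first non-empty FieldsData as the template (mirrors the patch).
    	for _, s := range shards {
    		if len(s.FieldsData) > 0 {
    			merged.FieldsData = make([][]int64, len(s.FieldsData))
    			break
    		}
    	}

    	for _, s := range shards {
    		for row, id := range s.IDs {
    			merged.IDs = append(merged.IDs, id)
    			// Guard before appending: only copy field values when a template
    			// exists and this shard actually carries field data.
    			if len(merged.FieldsData) > 0 && len(s.FieldsData) > 0 {
    				for col := range merged.FieldsData {
    					merged.FieldsData[col] = append(merged.FieldsData[col], s.FieldsData[col][row])
    				}
    			}
    		}
    	}
    	return merged
    }

    func main() {
    	// Mixed scenario: first shard is requery-style (IDs only), second has data.
    	shards := []shardResult{
    		{IDs: []int64{1, 2}},
    		{IDs: []int64{3, 4}, FieldsData: [][]int64{{30, 40}}},
    	}
    	fmt.Println(mergeTopK(shards)) // {[1 2 3 4] [[30 40]]}
    }

The sketch also guards on the source shard's FieldsData, which the patch does not need to do explicitly because AppendFieldData over an empty source appends nothing; the essential point it shares with the patch is that the template is chosen from the first non-empty shard instead of blindly from shard zero.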