Buqian Zheng 3c80083f51
feat: [Sparse Float Vector] add sparse vector support to milvus components (#30630)
add sparse float vector support to different milvus components,
including proxy, data node to receive and write sparse float vectors to
binlog, query node to handle search requests, index node to build index
for sparse float column, etc.

https://github.com/milvus-io/milvus/issues/29419

---------

Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
2024-03-13 14:32:54 -07:00

1736 lines
50 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package segments
import (
"context"
"encoding/binary"
"fmt"
"math"
"math/rand"
"path"
"path/filepath"
"strconv"
"github.com/cockroachdb/errors"
"github.com/golang/protobuf/proto"
"go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/proto/etcdpb"
"github.com/milvus-io/milvus/internal/proto/indexpb"
"github.com/milvus-io/milvus/internal/proto/internalpb"
"github.com/milvus-io/milvus/internal/proto/planpb"
"github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/proto/segcorepb"
storage "github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/internal/util/indexcgowrapper"
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/mq/msgstream"
"github.com/milvus-io/milvus/pkg/util/funcutil"
"github.com/milvus-io/milvus/pkg/util/metric"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/testutils"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
const (
	// Index type names used to build test indexes across query node unit tests.
	IndexFaissIDMap      = "FLAT"
	IndexFaissIVFFlat    = "IVF_FLAT"
	IndexFaissIVFPQ      = "IVF_PQ"
	IndexFaissIVFSQ8     = "IVF_SQ8"
	IndexScaNN           = "SCANN"
	IndexFaissBinIDMap   = "BIN_FLAT"
	IndexFaissBinIVFFlat = "BIN_IVF_FLAT"
	IndexHNSW            = "HNSW"
	IndexSparseWand      = "SPARSE_WAND"

	// Default index build / search knob values.
	nlist               = 100
	m                   = 4
	nbits               = 8
	nprobe              = 8
	efConstruction      = 200
	ef                  = 200
	defaultTopK         = 10
	defaultRoundDecimal = 6
	defaultNProb        = 10

	// Reserved system field IDs (RowID and Timestamp).
	rowIDFieldID     = 0
	timestampFieldID = 1

	metricTypeKey       = common.MetricTypeKey
	defaultDim          = 128
	defaultMetricType   = metric.L2
	dimKey              = common.DimKey
	defaultLocalStorage = "/tmp/milvus_test/querynode"
)
// ---------- unittest util functions ----------
// gen collection schema for unit tests

// vecFieldParam describes a vector field used to build test schemas.
type vecFieldParam struct {
	id         int64
	dim        int // ignored for sparse float vectors, which have no fixed dim
	metricType string
	vecType    schemapb.DataType
	fieldName  string
}

// constFieldParam describes a scalar (non-vector) field used to build test schemas.
type constFieldParam struct {
	id        int64
	dataType  schemapb.DataType
	fieldName string
}

// Pre-canned vector field definitions, one per supported vector type.
var simpleFloatVecField = vecFieldParam{
	id:         100,
	dim:        defaultDim,
	metricType: defaultMetricType,
	vecType:    schemapb.DataType_FloatVector,
	fieldName:  "floatVectorField",
}

var simpleBinVecField = vecFieldParam{
	id:         101,
	dim:        defaultDim,
	metricType: metric.JACCARD,
	vecType:    schemapb.DataType_BinaryVector,
	fieldName:  "binVectorField",
}

var simpleFloat16VecField = vecFieldParam{
	id:         112,
	dim:        defaultDim,
	metricType: defaultMetricType,
	vecType:    schemapb.DataType_Float16Vector,
	fieldName:  "float16VectorField",
}

var simpleBFloat16VecField = vecFieldParam{
	id:         113,
	dim:        defaultDim,
	metricType: defaultMetricType,
	vecType:    schemapb.DataType_BFloat16Vector,
	fieldName:  "bfloat16VectorField",
}

// Sparse float vectors use IP metric and carry no dim type param.
var simpleSparseFloatVectorField = vecFieldParam{
	id:         114,
	metricType: metric.IP,
	vecType:    schemapb.DataType_SparseFloatVector,
	fieldName:  "sparseFloatVectorField",
}

// Pre-canned scalar field definitions, one per supported scalar type.
var simpleBoolField = constFieldParam{
	id:        102,
	dataType:  schemapb.DataType_Bool,
	fieldName: "boolField",
}

var simpleInt8Field = constFieldParam{
	id:        103,
	dataType:  schemapb.DataType_Int8,
	fieldName: "int8Field",
}

var simpleInt16Field = constFieldParam{
	id:        104,
	dataType:  schemapb.DataType_Int16,
	fieldName: "int16Field",
}

var simpleInt32Field = constFieldParam{
	id:        105,
	dataType:  schemapb.DataType_Int32,
	fieldName: "int32Field",
}

var simpleInt64Field = constFieldParam{
	id:        106,
	dataType:  schemapb.DataType_Int64,
	fieldName: "int64Field",
}

var simpleFloatField = constFieldParam{
	id:        107,
	dataType:  schemapb.DataType_Float,
	fieldName: "floatField",
}

var simpleDoubleField = constFieldParam{
	id:        108,
	dataType:  schemapb.DataType_Double,
	fieldName: "doubleField",
}

var simpleJSONField = constFieldParam{
	id:        109,
	dataType:  schemapb.DataType_JSON,
	fieldName: "jsonField",
}

var simpleArrayField = constFieldParam{
	id:        110,
	dataType:  schemapb.DataType_Array,
	fieldName: "arrayField",
}

var simpleVarCharField = constFieldParam{
	id:        111,
	dataType:  schemapb.DataType_VarChar,
	fieldName: "varCharField",
}

// System fields with reserved IDs 0 and 1.
var rowIDField = constFieldParam{
	id:        rowIDFieldID,
	dataType:  schemapb.DataType_Int64,
	fieldName: "RowID",
}

var timestampField = constFieldParam{
	id:        timestampFieldID,
	dataType:  schemapb.DataType_Int64,
	fieldName: "Timestamp",
}
// genConstantFieldSchema builds a non-primary-key field schema from param.
// ElementType is always set to Int32 so the same schema also serves array fields.
func genConstantFieldSchema(param constFieldParam) *schemapb.FieldSchema {
	return &schemapb.FieldSchema{
		FieldID:      param.id,
		Name:         param.fieldName,
		IsPrimaryKey: false,
		DataType:     param.dataType,
		ElementType:  schemapb.DataType_Int32,
	}
}
// genPKFieldSchema builds a primary-key field schema from param.
// VarChar primary keys additionally require an explicit max-length type param.
func genPKFieldSchema(param constFieldParam) *schemapb.FieldSchema {
	field := &schemapb.FieldSchema{
		FieldID:      param.id,
		Name:         param.fieldName,
		IsPrimaryKey: true,
		DataType:     param.dataType,
	}
	if param.dataType == schemapb.DataType_VarChar {
		field.TypeParams = append(field.TypeParams, &commonpb.KeyValuePair{
			Key:   common.MaxLengthKey,
			Value: "12",
		})
	}
	return field
}
// genVectorFieldSchema builds a vector field schema from param. The metric
// type is recorded as an index param; a dim type param is added only for
// dense vector types, since sparse float vectors carry no fixed dimension.
func genVectorFieldSchema(param vecFieldParam) *schemapb.FieldSchema {
	fieldVec := &schemapb.FieldSchema{
		FieldID:      param.id,
		Name:         param.fieldName,
		IsPrimaryKey: false,
		DataType:     param.vecType,
		IndexParams: []*commonpb.KeyValuePair{
			{Key: metricTypeKey, Value: param.metricType},
		},
	}
	if param.vecType != schemapb.DataType_SparseFloatVector {
		fieldVec.TypeParams = []*commonpb.KeyValuePair{
			{Key: dimKey, Value: strconv.Itoa(param.dim)},
		}
	}
	return fieldVec
}
// some tests do not yet support sparse float vector, see comments of
// GenSparseFloatVecDataset in indexcgowrapper/dataset.go
// GenTestCollectionSchema builds a segcore-compatible collection schema with
// one field per scalar type, float and binary vector fields, a primary key of
// the requested type, and optionally a sparse float vector field. User field
// IDs are renumbered sequentially starting at 100; the RowID and Timestamp
// system fields keep their reserved IDs.
func GenTestCollectionSchema(collectionName string, pkType schemapb.DataType, withSparse bool) *schemapb.CollectionSchema {
	var pkField *schemapb.FieldSchema
	switch pkType {
	case schemapb.DataType_Int64:
		pkField = genPKFieldSchema(simpleInt64Field)
	case schemapb.DataType_VarChar:
		pkField = genPKFieldSchema(simpleVarCharField)
	}

	fields := []*schemapb.FieldSchema{
		genConstantFieldSchema(simpleBoolField),
		genConstantFieldSchema(simpleInt8Field),
		genConstantFieldSchema(simpleInt16Field),
		genConstantFieldSchema(simpleInt32Field),
		genConstantFieldSchema(simpleFloatField),
		genConstantFieldSchema(simpleDoubleField),
		genConstantFieldSchema(simpleJSONField),
		genVectorFieldSchema(simpleFloatVecField),
		genVectorFieldSchema(simpleBinVecField),
		pkField,
		genConstantFieldSchema(simpleArrayField),
	}
	if withSparse {
		fields = append(fields, genVectorFieldSchema(simpleSparseFloatVectorField))
	}

	// Renumber all user fields sequentially from 100, overriding the IDs
	// carried by the simple*Field definitions.
	for i, field := range fields {
		field.FieldID = 100 + int64(i)
	}
	// System fields are appended last and keep their reserved IDs (0 and 1).
	fields = append(fields, genConstantFieldSchema(rowIDField), genConstantFieldSchema(timestampField))

	return &schemapb.CollectionSchema{
		Name:   collectionName,
		AutoID: false,
		Fields: fields,
	}
}
// GenTestIndexInfoList returns one IndexInfo per vector field in schema.
// A field can currently have at most one index, so the field's ID and name
// double as the index ID and name to keep them unique.
func GenTestIndexInfoList(collectionID int64, schema *schemapb.CollectionSchema) []*indexpb.IndexInfo {
	vectorFields := typeutil.GetVectorFieldSchemas(schema)
	res := make([]*indexpb.IndexInfo, 0, len(vectorFields))
	for _, field := range vectorFields {
		index := &indexpb.IndexInfo{
			CollectionID: collectionID,
			FieldID:      field.GetFieldID(),
			IndexID:      field.GetFieldID(),
			IndexName:    field.GetName(),
			TypeParams:   field.GetTypeParams(),
		}
		// Pick index params appropriate for the vector type.
		switch field.GetDataType() {
		case schemapb.DataType_FloatVector, schemapb.DataType_Float16Vector:
			index.IndexParams = []*commonpb.KeyValuePair{
				{Key: common.MetricTypeKey, Value: metric.L2},
				{Key: common.IndexTypeKey, Value: IndexFaissIVFFlat},
				{Key: "nlist", Value: "128"},
			}
		case schemapb.DataType_BinaryVector:
			index.IndexParams = []*commonpb.KeyValuePair{
				{Key: common.MetricTypeKey, Value: metric.JACCARD},
				{Key: common.IndexTypeKey, Value: IndexFaissBinIVFFlat},
				{Key: "nlist", Value: "128"},
			}
		case schemapb.DataType_SparseFloatVector:
			index.IndexParams = []*commonpb.KeyValuePair{
				{Key: common.MetricTypeKey, Value: metric.IP},
				{Key: common.IndexTypeKey, Value: IndexSparseWand},
				{Key: "M", Value: "16"},
			}
		}
		res = append(res, index)
	}
	return res
}
// GenTestIndexMeta converts the generated index info list for schema into a
// segcore CollectionIndexMeta. The max indexed row count per segment is
// estimated from the configured segment max size and seal proportion; it
// stays 0 when the per-record size cannot be estimated.
func GenTestIndexMeta(collectionID int64, schema *schemapb.CollectionSchema) *segcorepb.CollectionIndexMeta {
	indexInfos := GenTestIndexInfoList(collectionID, schema)
	fieldIndexMetas := make([]*segcorepb.FieldIndexMeta, 0, len(indexInfos))
	for _, info := range indexInfos {
		fieldIndexMetas = append(fieldIndexMetas, &segcorepb.FieldIndexMeta{
			CollectionID:    info.GetCollectionID(),
			FieldID:         info.GetFieldID(),
			IndexName:       info.GetIndexName(),
			TypeParams:      info.GetTypeParams(),
			IndexParams:     info.GetIndexParams(),
			IsAutoIndex:     info.GetIsAutoIndex(),
			UserIndexParams: info.GetUserIndexParams(),
		})
	}

	var maxIndexRecordPerSegment int64
	sizePerRecord, err := typeutil.EstimateSizePerRecord(schema)
	if err != nil || sizePerRecord == 0 {
		log.Warn("failed to transfer segment size to collection, because failed to estimate size per record", zap.Error(err))
	} else {
		threshold := paramtable.Get().DataCoordCfg.SegmentMaxSize.GetAsFloat() * 1024 * 1024
		proportion := paramtable.Get().DataCoordCfg.SegmentSealProportion.GetAsFloat()
		maxIndexRecordPerSegment = int64(threshold * proportion / float64(sizePerRecord))
	}

	return &segcorepb.CollectionIndexMeta{
		MaxIndexRowCount: maxIndexRecordPerSegment,
		IndexMetas:       fieldIndexMetas,
	}
}
// ---------- unittest util functions ----------
// gen field data
// generateBoolArray returns numRows random booleans.
func generateBoolArray(numRows int) []bool {
	ret := make([]bool, numRows)
	for i := range ret {
		ret[i] = rand.Int()%2 == 0
	}
	return ret
}
// generateInt8Array returns numRows random int8 values.
func generateInt8Array(numRows int) []int8 {
	ret := make([]int8, numRows)
	for i := range ret {
		ret[i] = int8(rand.Int())
	}
	return ret
}
// generateInt16Array returns numRows random int16 values.
func generateInt16Array(numRows int) []int16 {
	ret := make([]int16, numRows)
	for i := range ret {
		ret[i] = int16(rand.Int())
	}
	return ret
}
// generateInt32Array returns numRows random non-negative int32 values.
func generateInt32Array(numRows int) []int32 {
	ret := make([]int32, numRows)
	for i := range ret {
		ret[i] = rand.Int31()
	}
	return ret
}
// generateInt64Array returns the deterministic sequence 0..numRows-1.
// (Unlike its siblings it is not random, so it can serve as a stable PK column.)
func generateInt64Array(numRows int) []int64 {
	ret := make([]int64, numRows)
	for i := range ret {
		ret[i] = int64(i)
	}
	return ret
}
// generateFloat32Array returns numRows random float32 values in [0, 1).
func generateFloat32Array(numRows int) []float32 {
	ret := make([]float32, numRows)
	for i := range ret {
		ret[i] = rand.Float32()
	}
	return ret
}
func generateStringArray(numRows int) []string {
ret := make([]string, 0, numRows)
for i := 0; i < numRows; i++ {
ret = append(ret, strconv.Itoa(rand.Int()))
}
return ret
}
// generateArrayArray returns numRows array-typed scalar values, each holding
// ten random int32 elements.
func generateArrayArray(numRows int) []*schemapb.ScalarField {
	ret := make([]*schemapb.ScalarField, numRows)
	for i := range ret {
		ret[i] = &schemapb.ScalarField{
			Data: &schemapb.ScalarField_IntData{
				IntData: &schemapb.IntArray{
					Data: generateInt32Array(10),
				},
			},
		}
	}
	return ret
}
// generateJSONArray returns numRows deterministic JSON documents of the form
// {"key":i} with i running from 1 to numRows.
func generateJSONArray(numRows int) [][]byte {
	ret := make([][]byte, numRows)
	for i := range ret {
		ret[i] = []byte(fmt.Sprintf(`{"key":%d}`, i+1))
	}
	return ret
}
// generateFloat64Array returns numRows random float64 values in [0, 1).
func generateFloat64Array(numRows int) []float64 {
	ret := make([]float64, numRows)
	for i := range ret {
		ret[i] = rand.Float64()
	}
	return ret
}
// generateFloatVectors returns numRows random dense float vectors of the
// given dim, flattened row-major into a single slice of numRows*dim floats.
func generateFloatVectors(numRows, dim int) []float32 {
	ret := make([]float32, numRows*dim)
	for i := range ret {
		ret[i] = rand.Float32()
	}
	return ret
}
// generateBinaryVectors returns numRows random binary vectors, packed as
// dim/8 bytes per row (dim is assumed to be a multiple of 8).
func generateBinaryVectors(numRows, dim int) []byte {
	ret := make([]byte, numRows*dim/8)
	if _, err := rand.Read(ret); err != nil {
		panic(err)
	}
	return ret
}
// generateFloat16Vectors returns numRows random float16 vectors as raw bytes,
// two bytes per element (numRows*dim*2 bytes total).
func generateFloat16Vectors(numRows, dim int) []byte {
	ret := make([]byte, numRows*dim*2)
	if _, err := rand.Read(ret); err != nil {
		panic(err)
	}
	return ret
}
// generateBFloat16Vectors returns numRows random bfloat16 vectors as raw
// bytes, two bytes per element (numRows*dim*2 bytes total).
func generateBFloat16Vectors(numRows, dim int) []byte {
	ret := make([]byte, numRows*dim*2)
	if _, err := rand.Read(ret); err != nil {
		panic(err)
	}
	return ret
}
// GenTestScalarFieldData generates a FieldData column holding numRows random
// values of the given scalar type. It panics for unsupported (non-scalar)
// data types.
//
// Improvements over the previous version: the FieldId assignment, previously
// duplicated in every case, is hoisted out of the switch, and the Int8/Int16/
// Int32 cases (which all produce int32 wire data) are merged.
func GenTestScalarFieldData(dType schemapb.DataType, fieldName string, fieldID int64, numRows int) *schemapb.FieldData {
	ret := &schemapb.FieldData{
		Type:      dType,
		FieldName: fieldName,
		FieldId:   fieldID,
	}
	switch dType {
	case schemapb.DataType_Bool:
		ret.Field = &schemapb.FieldData_Scalars{
			Scalars: &schemapb.ScalarField{
				Data: &schemapb.ScalarField_BoolData{
					BoolData: &schemapb.BoolArray{
						Data: generateBoolArray(numRows),
					},
				},
			},
		}
	case schemapb.DataType_Int8, schemapb.DataType_Int16, schemapb.DataType_Int32:
		// All sub-int64 integer types are transported as int32 arrays.
		ret.Field = &schemapb.FieldData_Scalars{
			Scalars: &schemapb.ScalarField{
				Data: &schemapb.ScalarField_IntData{
					IntData: &schemapb.IntArray{
						Data: generateInt32Array(numRows),
					},
				},
			},
		}
	case schemapb.DataType_Int64:
		ret.Field = &schemapb.FieldData_Scalars{
			Scalars: &schemapb.ScalarField{
				Data: &schemapb.ScalarField_LongData{
					LongData: &schemapb.LongArray{
						Data: generateInt64Array(numRows),
					},
				},
			},
		}
	case schemapb.DataType_Float:
		ret.Field = &schemapb.FieldData_Scalars{
			Scalars: &schemapb.ScalarField{
				Data: &schemapb.ScalarField_FloatData{
					FloatData: &schemapb.FloatArray{
						Data: generateFloat32Array(numRows),
					},
				},
			},
		}
	case schemapb.DataType_Double:
		ret.Field = &schemapb.FieldData_Scalars{
			Scalars: &schemapb.ScalarField{
				Data: &schemapb.ScalarField_DoubleData{
					DoubleData: &schemapb.DoubleArray{
						Data: generateFloat64Array(numRows),
					},
				},
			},
		}
	case schemapb.DataType_VarChar:
		ret.Field = &schemapb.FieldData_Scalars{
			Scalars: &schemapb.ScalarField{
				Data: &schemapb.ScalarField_StringData{
					StringData: &schemapb.StringArray{
						Data: generateStringArray(numRows),
					},
				},
			},
		}
	case schemapb.DataType_Array:
		ret.Field = &schemapb.FieldData_Scalars{
			Scalars: &schemapb.ScalarField{
				Data: &schemapb.ScalarField_ArrayData{
					ArrayData: &schemapb.ArrayArray{
						Data: generateArrayArray(numRows),
					},
				},
			},
		}
	case schemapb.DataType_JSON:
		ret.Field = &schemapb.FieldData_Scalars{
			Scalars: &schemapb.ScalarField{
				Data: &schemapb.ScalarField_JsonData{
					JsonData: &schemapb.JSONArray{
						Data: generateJSONArray(numRows),
					},
				},
			},
		}
	default:
		panic("data type not supported")
	}
	return ret
}
// GenTestVectorFiledData generates a FieldData column holding numRows random
// vectors of the given vector type. dim is ignored for sparse float vectors,
// whose dimension is taken from the generated data itself. It panics for
// unsupported (non-vector) data types.
//
// The FieldId assignment, previously duplicated in every case, is hoisted out
// of the switch.
func GenTestVectorFiledData(dType schemapb.DataType, fieldName string, fieldID int64, numRows int, dim int) *schemapb.FieldData {
	ret := &schemapb.FieldData{
		Type:      dType,
		FieldName: fieldName,
		FieldId:   fieldID,
	}
	switch dType {
	case schemapb.DataType_BinaryVector:
		ret.Field = &schemapb.FieldData_Vectors{
			Vectors: &schemapb.VectorField{
				Dim: int64(dim),
				Data: &schemapb.VectorField_BinaryVector{
					BinaryVector: generateBinaryVectors(numRows, dim),
				},
			},
		}
	case schemapb.DataType_FloatVector:
		ret.Field = &schemapb.FieldData_Vectors{
			Vectors: &schemapb.VectorField{
				Dim: int64(dim),
				Data: &schemapb.VectorField_FloatVector{
					FloatVector: &schemapb.FloatArray{
						Data: generateFloatVectors(numRows, dim),
					},
				},
			},
		}
	case schemapb.DataType_Float16Vector:
		ret.Field = &schemapb.FieldData_Vectors{
			Vectors: &schemapb.VectorField{
				Dim: int64(dim),
				Data: &schemapb.VectorField_Float16Vector{
					Float16Vector: generateFloat16Vectors(numRows, dim),
				},
			},
		}
	case schemapb.DataType_BFloat16Vector:
		ret.Field = &schemapb.FieldData_Vectors{
			Vectors: &schemapb.VectorField{
				Dim: int64(dim),
				Data: &schemapb.VectorField_Bfloat16Vector{
					Bfloat16Vector: generateBFloat16Vectors(numRows, dim),
				},
			},
		}
	case schemapb.DataType_SparseFloatVector:
		// Sparse data reports its own dim (max dimension across rows).
		sparseData := testutils.GenerateSparseFloatVectors(numRows)
		ret.Field = &schemapb.FieldData_Vectors{
			Vectors: &schemapb.VectorField{
				Dim: sparseData.Dim,
				Data: &schemapb.VectorField_SparseFloatVector{
					SparseFloatVector: &schemapb.SparseFloatArray{
						Dim:      sparseData.Dim,
						Contents: sparseData.Contents,
					},
				},
			},
		}
	default:
		panic("data type not supported")
	}
	return ret
}
// NewTestChunkManagerFactory builds a MinIO-backed chunk manager factory for
// unit tests, rooted at rootPath and configured from the given component
// params. CreateBucket(true) makes the factory create the test bucket if it
// does not exist yet.
func NewTestChunkManagerFactory(params *paramtable.ComponentParam, rootPath string) *storage.ChunkManagerFactory {
	return storage.NewChunkManagerFactory("minio",
		storage.RootPath(rootPath),
		storage.Address(params.MinioCfg.Address.GetValue()),
		storage.AccessKeyID(params.MinioCfg.AccessKeyID.GetValue()),
		storage.SecretAccessKeyID(params.MinioCfg.SecretAccessKey.GetValue()),
		storage.UseSSL(params.MinioCfg.UseSSL.GetAsBool()),
		storage.BucketName(params.MinioCfg.BucketName.GetValue()),
		storage.UseIAM(params.MinioCfg.UseIAM.GetAsBool()),
		storage.CloudProvider(params.MinioCfg.CloudProvider.GetValue()),
		storage.IAMEndpoint(params.MinioCfg.IAMEndpoint.GetValue()),
		storage.CreateBucket(true))
}
// SaveBinLog serializes msgLength rows of randomly generated data for schema
// into insert binlogs plus a PK stats binlog, uploads all of them through
// chunkManager in one batch, and returns the binlog descriptors for both.
//
// Fix: the field ID parsed from each blob key is now validated before it is
// used (previously it was logged before the ParseInt error check, so a parse
// failure logged a garbage field ID). The stats loop comment also wrongly
// said "insert binlog".
func SaveBinLog(ctx context.Context,
	collectionID int64,
	partitionID int64,
	segmentID int64,
	msgLength int,
	schema *schemapb.CollectionSchema,
	chunkManager storage.ChunkManager,
) ([]*datapb.FieldBinlog, []*datapb.FieldBinlog, error) {
	binLogs, statsLogs, err := genStorageBlob(collectionID,
		partitionID,
		segmentID,
		msgLength,
		schema)
	if err != nil {
		log.Warn("getStorageBlob return error", zap.Error(err))
		return nil, nil, err
	}
	log.Debug(".. [query node unittest] Saving bin logs to MinIO ..", zap.Int("number", len(binLogs)))
	kvs := make(map[string][]byte, len(binLogs))
	// write insert binlog
	fieldBinlog := make([]*datapb.FieldBinlog, 0)
	for _, blob := range binLogs {
		// The blob key is the field ID encoded as a decimal string.
		fieldID, err := strconv.ParseInt(blob.GetKey(), 10, 64)
		if err != nil {
			return nil, nil, err
		}
		log.Debug("[query node unittest] save binlog", zap.Int64("fieldID", fieldID))
		k := JoinIDPath(collectionID, partitionID, segmentID, fieldID)
		key := path.Join(chunkManager.RootPath(), "insert-log", k)
		kvs[key] = blob.Value
		fieldBinlog = append(fieldBinlog, &datapb.FieldBinlog{
			FieldID: fieldID,
			Binlogs: []*datapb.Binlog{{
				LogPath:    key,
				EntriesNum: blob.RowNum,
			}},
		})
	}
	log.Debug("[query node unittest] save binlog file to MinIO/S3")
	// write stats binlog
	statsBinlog := make([]*datapb.FieldBinlog, 0)
	for _, blob := range statsLogs {
		fieldID, err := strconv.ParseInt(blob.GetKey(), 10, 64)
		if err != nil {
			return nil, nil, err
		}
		log.Debug("[query node unittest] save statLog", zap.Int64("fieldID", fieldID))
		k := JoinIDPath(collectionID, partitionID, segmentID, fieldID)
		key := path.Join(chunkManager.RootPath(), "stats-log", k)
		kvs[key] = blob.Value
		statsBinlog = append(statsBinlog, &datapb.FieldBinlog{
			FieldID: fieldID,
			Binlogs: []*datapb.Binlog{{LogPath: key}},
		})
	}
	log.Debug("[query node unittest] save statsLog file to MinIO/S3")
	// Upload all generated binlog files in one batch.
	err = chunkManager.MultiWrite(ctx, kvs)
	return fieldBinlog, statsBinlog, err
}
// genStorageBlob serializes msgLength rows of generated insert data into
// insert binlog blobs plus a single PK stats blob.
func genStorageBlob(collectionID int64,
	partitionID int64,
	segmentID int64,
	msgLength int,
	schema *schemapb.CollectionSchema,
) ([]*storage.Blob, []*storage.Blob, error) {
	insertData, err := genInsertData(msgLength, schema)
	if err != nil {
		return nil, nil, err
	}
	inCodec := storage.NewInsertCodecWithSchema(genCollectionMeta(collectionID, partitionID, schema))
	binLogs, err := inCodec.Serialize(partitionID, segmentID, insertData)
	if err != nil {
		return nil, nil, err
	}
	statsLog, err := inCodec.SerializePkStatsByData(insertData)
	if err != nil {
		return nil, nil, err
	}
	return binLogs, []*storage.Blob{statsLog}, nil
}
// genCollectionMeta wraps schema into an etcd collection meta containing a
// single partition.
func genCollectionMeta(collectionID int64, partitionID int64, schema *schemapb.CollectionSchema) *etcdpb.CollectionMeta {
	return &etcdpb.CollectionMeta{
		ID:           collectionID,
		Schema:       schema,
		PartitionIDs: []int64{partitionID},
	}
}
// genInsertData generates msgLength rows of random column data for every
// field in schema and fills the reserved RowID/Timestamp system columns.
// Vector dims come from the package-level simple*Field definitions (all
// defaultDim), not from the schema's type params. Returns an error for
// unsupported field types.
func genInsertData(msgLength int, schema *schemapb.CollectionSchema) (*storage.InsertData, error) {
	insertData := &storage.InsertData{
		Data: make(map[int64]storage.FieldData),
	}
	for _, f := range schema.Fields {
		switch f.DataType {
		case schemapb.DataType_Bool:
			insertData.Data[f.FieldID] = &storage.BoolFieldData{
				Data: generateBoolArray(msgLength),
			}
		case schemapb.DataType_Int8:
			insertData.Data[f.FieldID] = &storage.Int8FieldData{
				Data: generateInt8Array(msgLength),
			}
		case schemapb.DataType_Int16:
			insertData.Data[f.FieldID] = &storage.Int16FieldData{
				Data: generateInt16Array(msgLength),
			}
		case schemapb.DataType_Int32:
			insertData.Data[f.FieldID] = &storage.Int32FieldData{
				Data: generateInt32Array(msgLength),
			}
		case schemapb.DataType_Int64:
			// Deterministic 0..n-1 sequence, suitable as a PK column.
			insertData.Data[f.FieldID] = &storage.Int64FieldData{
				Data: generateInt64Array(msgLength),
			}
		case schemapb.DataType_Float:
			insertData.Data[f.FieldID] = &storage.FloatFieldData{
				Data: generateFloat32Array(msgLength),
			}
		case schemapb.DataType_Double:
			insertData.Data[f.FieldID] = &storage.DoubleFieldData{
				Data: generateFloat64Array(msgLength),
			}
		case schemapb.DataType_String, schemapb.DataType_VarChar:
			insertData.Data[f.FieldID] = &storage.StringFieldData{
				Data: generateStringArray(msgLength),
			}
		case schemapb.DataType_Array:
			insertData.Data[f.FieldID] = &storage.ArrayFieldData{
				ElementType: schemapb.DataType_Int32,
				Data:        generateArrayArray(msgLength),
			}
		case schemapb.DataType_JSON:
			insertData.Data[f.FieldID] = &storage.JSONFieldData{
				Data: generateJSONArray(msgLength),
			}
		case schemapb.DataType_FloatVector:
			dim := simpleFloatVecField.dim // if no dim specified, use simpleFloatVecField's dim
			insertData.Data[f.FieldID] = &storage.FloatVectorFieldData{
				Data: generateFloatVectors(msgLength, dim),
				Dim:  dim,
			}
		case schemapb.DataType_Float16Vector:
			dim := simpleFloat16VecField.dim
			insertData.Data[f.FieldID] = &storage.Float16VectorFieldData{
				Data: generateFloat16Vectors(msgLength, dim),
				Dim:  dim,
			}
		case schemapb.DataType_BFloat16Vector:
			// NOTE(review): reuses simpleFloat16VecField.dim rather than
			// simpleBFloat16VecField.dim; both equal defaultDim today.
			dim := simpleFloat16VecField.dim
			insertData.Data[f.FieldID] = &storage.BFloat16VectorFieldData{
				Data: generateBFloat16Vectors(msgLength, dim),
				Dim:  dim,
			}
		case schemapb.DataType_BinaryVector:
			dim := simpleBinVecField.dim
			insertData.Data[f.FieldID] = &storage.BinaryVectorFieldData{
				Data: generateBinaryVectors(msgLength, dim),
				Dim:  dim,
			}
		case schemapb.DataType_SparseFloatVector:
			sparseData := testutils.GenerateSparseFloatVectors(msgLength)
			insertData.Data[f.FieldID] = &storage.SparseFloatVectorFieldData{
				SparseFloatArray: *sparseData,
			}
		default:
			err := errors.New("data type not supported")
			return nil, err
		}
	}
	// set data for rowID field
	insertData.Data[rowIDFieldID] = &storage.Int64FieldData{
		Data: generateInt64Array(msgLength),
	}
	// set data for ts field
	insertData.Data[timestampFieldID] = &storage.Int64FieldData{
		Data: genTimestampFieldData(msgLength),
	}
	return insertData, nil
}
// genTimestampFieldData returns numRows monotonically non-decreasing
// timestamps. Timestamp 0 is not allowed, so the first entry is bumped to 1
// (for numRows >= 2 the first two entries are both 1).
//
// Fix: guard against numRows <= 0, which previously panicked with an
// index-out-of-range on times[0].
func genTimestampFieldData(numRows int) []int64 {
	if numRows <= 0 {
		return nil
	}
	times := make([]int64, numRows)
	for i := range times {
		times[i] = int64(i)
	}
	// timestamp 0 is not allowed
	times[0] = 1
	return times
}
// SaveDeltaLog writes a small fixed delete log (two int64 PKs, 1 and 2, with
// timestamps 100 and 200) for the given segment through cm and returns its
// binlog descriptor. Unit-test fixture only.
func SaveDeltaLog(collectionID int64,
	partitionID int64,
	segmentID int64,
	cm storage.ChunkManager,
) ([]*datapb.FieldBinlog, error) {
	binlogWriter := storage.NewDeleteBinlogWriter(schemapb.DataType_String, collectionID, partitionID, segmentID)
	// NOTE(review): writer/serialization errors below are silently ignored;
	// tolerable for a test fixture, but a failure would only surface later as
	// an unreadable binlog.
	eventWriter, _ := binlogWriter.NextDeleteEventWriter()
	dData := &storage.DeleteData{
		Pks:      []storage.PrimaryKey{storage.NewInt64PrimaryKey(1), storage.NewInt64PrimaryKey(2)},
		Tss:      []typeutil.Timestamp{100, 200},
		RowCount: 2,
	}
	sizeTotal := 0
	for i := int64(0); i < dData.RowCount; i++ {
		int64PkValue := dData.Pks[i].(*storage.Int64PrimaryKey).Value
		ts := dData.Tss[i]
		// Each delete entry is serialized as "<pk>,<ts>".
		eventWriter.AddOneStringToPayload(fmt.Sprintf("%d,%d", int64PkValue, ts))
		sizeTotal += binary.Size(int64PkValue)
		sizeTotal += binary.Size(ts)
	}
	eventWriter.SetEventTimestamp(100, 200)
	binlogWriter.SetEventTimeStamp(100, 200)
	// original_size records the raw payload size for binlog readers.
	binlogWriter.AddExtra("original_size", fmt.Sprintf("%v", sizeTotal))
	binlogWriter.Finish()
	buffer, _ := binlogWriter.GetBuffer()
	blob := &storage.Blob{Key: "deltaLogPath1", Value: buffer}
	kvs := make(map[string][]byte, 1)
	// write delta log
	// 106 matches simpleInt64Field.id; presumably intended as the PK field ID —
	// note GenTestCollectionSchema renumbers fields, so confirm against callers.
	pkFieldID := int64(106)
	fieldBinlog := make([]*datapb.FieldBinlog, 0)
	log.Debug("[query node unittest] save delta log", zap.Int64("fieldID", pkFieldID))
	key := JoinIDPath(collectionID, partitionID, segmentID, pkFieldID)
	// keyPath := path.Join(defaultLocalStorage, "delta-log", key)
	keyPath := path.Join(cm.RootPath(), "delta-log", key)
	kvs[keyPath] = blob.Value[:]
	fieldBinlog = append(fieldBinlog, &datapb.FieldBinlog{
		FieldID: pkFieldID,
		Binlogs: []*datapb.Binlog{{
			LogPath:       keyPath,
			TimestampFrom: 100,
			TimestampTo:   200,
		}},
	})
	log.Debug("[query node unittest] save delta log file to MinIO/S3")
	return fieldBinlog, cm.MultiWrite(context.Background(), kvs)
}
// GenAndSaveIndexV2 builds an index for the given vector field from randomly
// generated data, serializes it, uploads the index files through cm, and
// returns the FieldIndexInfo describing them.
//
// Fix: the data-type switch previously had no default case, so unsupported
// vector types (e.g. float16/bfloat16) fell through with a nil dataset and
// crashed inside index.Build; it now returns an explicit error.
func GenAndSaveIndexV2(collectionID, partitionID, segmentID, buildID int64,
	fieldSchema *schemapb.FieldSchema,
	indexInfo *indexpb.IndexInfo,
	cm storage.ChunkManager,
	msgLength int,
) (*querypb.FieldIndexInfo, error) {
	typeParams := funcutil.KeyValuePair2Map(indexInfo.GetTypeParams())
	indexParams := funcutil.KeyValuePair2Map(indexInfo.GetIndexParams())
	index, err := indexcgowrapper.NewCgoIndex(fieldSchema.GetDataType(), typeParams, indexParams)
	if err != nil {
		return nil, err
	}
	defer index.Delete()

	// Generate random build data matching the field's vector type.
	var dataset *indexcgowrapper.Dataset
	switch fieldSchema.DataType {
	case schemapb.DataType_BinaryVector:
		dataset = indexcgowrapper.GenBinaryVecDataset(generateBinaryVectors(msgLength, defaultDim))
	case schemapb.DataType_FloatVector:
		dataset = indexcgowrapper.GenFloatVecDataset(generateFloatVectors(msgLength, defaultDim))
	case schemapb.DataType_SparseFloatVector:
		data := testutils.GenerateSparseFloatVectors(msgLength)
		dataset = indexcgowrapper.GenSparseFloatVecDataset(&storage.SparseFloatVectorFieldData{
			SparseFloatArray: *data,
		})
	default:
		return nil, fmt.Errorf("unsupported data type %s for test index build", fieldSchema.DataType.String())
	}
	err = index.Build(dataset)
	if err != nil {
		return nil, err
	}
	// save index to minio
	binarySet, err := index.Serialize()
	if err != nil {
		return nil, err
	}
	// serialize index params
	indexCodec := storage.NewIndexFileBinlogCodec()
	serializedIndexBlobs, err := indexCodec.Serialize(
		buildID,
		0,
		collectionID,
		partitionID,
		segmentID,
		fieldSchema.GetFieldID(),
		indexParams,
		indexInfo.GetIndexName(),
		indexInfo.GetIndexID(),
		binarySet,
	)
	if err != nil {
		return nil, err
	}
	// Upload every serialized index blob under index_files/<segmentID>/.
	indexPaths := make([]string, 0, len(serializedIndexBlobs))
	for _, blob := range serializedIndexBlobs {
		indexPath := filepath.Join(cm.RootPath(), "index_files",
			strconv.Itoa(int(segmentID)), blob.Key)
		indexPaths = append(indexPaths, indexPath)
		if err := cm.Write(context.Background(), indexPath, blob.Value); err != nil {
			return nil, err
		}
	}
	_, cCurrentIndexVersion := getIndexEngineVersion()
	return &querypb.FieldIndexInfo{
		FieldID:             fieldSchema.GetFieldID(),
		EnableIndex:         true,
		IndexName:           indexInfo.GetIndexName(),
		IndexParams:         indexInfo.GetIndexParams(),
		IndexFilePaths:      indexPaths,
		CurrentIndexVersion: cCurrentIndexVersion,
	}, nil
}
// GenAndSaveIndex builds a float-vector index of the given type from random
// data, uploads its files through cm, and returns the FieldIndexInfo.
//
// Fix: the binlog codec was serialized with the hard-coded
// simpleFloatVecField.id while the returned FieldIndexInfo used the fieldID
// parameter; both now consistently use fieldID.
func GenAndSaveIndex(collectionID, partitionID, segmentID, fieldID int64, msgLength int, indexType, metricType string, cm storage.ChunkManager) (*querypb.FieldIndexInfo, error) {
	typeParams, indexParams := genIndexParams(indexType, metricType)
	index, err := indexcgowrapper.NewCgoIndex(schemapb.DataType_FloatVector, typeParams, indexParams)
	if err != nil {
		return nil, err
	}
	defer index.Delete()
	err = index.Build(indexcgowrapper.GenFloatVecDataset(generateFloatVectors(msgLength, defaultDim)))
	if err != nil {
		return nil, err
	}
	// save index to minio
	binarySet, err := index.Serialize()
	if err != nil {
		return nil, err
	}
	// serialize index params
	indexCodec := storage.NewIndexFileBinlogCodec()
	serializedIndexBlobs, err := indexCodec.Serialize(
		0,
		0,
		collectionID,
		partitionID,
		segmentID,
		fieldID,
		indexParams,
		"querynode-test",
		0,
		binarySet,
	)
	if err != nil {
		return nil, err
	}
	// Upload every serialized index blob under index_files/<segmentID>/.
	indexPaths := make([]string, 0, len(serializedIndexBlobs))
	for _, blob := range serializedIndexBlobs {
		indexPath := filepath.Join(cm.RootPath(), "index_files",
			strconv.Itoa(int(segmentID)), blob.Key)
		indexPaths = append(indexPaths, indexPath)
		if err := cm.Write(context.Background(), indexPath, blob.Value); err != nil {
			return nil, err
		}
	}
	_, cCurrentIndexVersion := getIndexEngineVersion()
	return &querypb.FieldIndexInfo{
		FieldID:             fieldID,
		EnableIndex:         true,
		IndexName:           "querynode-test",
		IndexParams:         funcutil.Map2KeyValuePair(indexParams),
		IndexFilePaths:      indexPaths,
		CurrentIndexVersion: cCurrentIndexVersion,
	}, nil
}
func genIndexParams(indexType, metricType string) (map[string]string, map[string]string) {
typeParams := make(map[string]string)
typeParams[common.DimKey] = strconv.Itoa(defaultDim)
indexParams := make(map[string]string)
indexParams[common.IndexTypeKey] = indexType
indexParams[common.MetricTypeKey] = metricType
indexParams["index_mode"] = "cpu"
if indexType == IndexFaissIDMap { // float vector
} else if indexType == IndexFaissIVFFlat {
indexParams["nlist"] = strconv.Itoa(nlist)
} else if indexType == IndexFaissIVFPQ {
indexParams["nlist"] = strconv.Itoa(nlist)
indexParams["m"] = strconv.Itoa(m)
indexParams["nbits"] = strconv.Itoa(nbits)
} else if indexType == IndexFaissIVFSQ8 {
indexParams["nlist"] = strconv.Itoa(nlist)
indexParams["nbits"] = strconv.Itoa(nbits)
} else if indexType == IndexHNSW {
indexParams["M"] = strconv.Itoa(16)
indexParams["efConstruction"] = strconv.Itoa(efConstruction)
// indexParams["ef"] = strconv.Itoa(ef)
} else if indexType == IndexFaissBinIVFFlat { // binary vector
indexParams["nlist"] = strconv.Itoa(nlist)
indexParams["m"] = strconv.Itoa(m)
indexParams["nbits"] = strconv.Itoa(nbits)
} else if indexType == IndexFaissBinIDMap {
// indexParams[common.DimKey] = strconv.Itoa(defaultDim)
} else {
panic("")
}
return typeParams, indexParams
}
// genStorageConfig snapshots the current MinIO/storage configuration into an
// indexpb.StorageConfig suitable for index build requests.
func genStorageConfig() *indexpb.StorageConfig {
	params := paramtable.Get()
	return &indexpb.StorageConfig{
		Address:         params.MinioCfg.Address.GetValue(),
		AccessKeyID:     params.MinioCfg.AccessKeyID.GetValue(),
		SecretAccessKey: params.MinioCfg.SecretAccessKey.GetValue(),
		BucketName:      params.MinioCfg.BucketName.GetValue(),
		RootPath:        params.MinioCfg.RootPath.GetValue(),
		IAMEndpoint:     params.MinioCfg.IAMEndpoint.GetValue(),
		UseSSL:          params.MinioCfg.UseSSL.GetAsBool(),
		UseIAM:          params.MinioCfg.UseIAM.GetAsBool(),
		StorageType:     params.CommonCfg.StorageType.GetValue(),
	}
}
// genSearchRequest builds an internalpb.SearchRequest with nq random query
// vectors and a serialized plan derived from the collection schema and index
// type.
func genSearchRequest(nq int64, indexType string, collection *Collection) (*internalpb.SearchRequest, error) {
	placeHolder, err := genPlaceHolderGroup(nq)
	if err != nil {
		return nil, err
	}
	planStr, err := genDSLByIndexType(collection.Schema(), indexType)
	if err != nil {
		return nil, err
	}
	// The local variable used to be named "planpb", shadowing the imported
	// package of the same name; renamed to avoid confusion.
	var plan planpb.PlanNode
	// The parse error was previously ignored, which would silently marshal
	// an empty plan on malformed DSL text.
	if err := proto.UnmarshalText(planStr, &plan); err != nil {
		return nil, err
	}
	serializedPlan, err := proto.Marshal(&plan)
	if err != nil {
		return nil, err
	}
	return &internalpb.SearchRequest{
		Base:               genCommonMsgBase(commonpb.MsgType_Search, 0),
		CollectionID:       collection.ID(),
		PartitionIDs:       collection.GetPartitions(),
		PlaceholderGroup:   placeHolder,
		SerializedExprPlan: serializedPlan,
		DslType:            commonpb.DslType_BoolExprV1,
		Nq:                 nq,
	}, nil
}
// genCommonMsgBase returns a MsgBase of the given type addressed to targetID,
// tagged with a random message ID.
func genCommonMsgBase(msgType commonpb.MsgType, targetID int64) *commonpb.MsgBase {
	base := &commonpb.MsgBase{MsgID: rand.Int63()}
	base.MsgType = msgType
	base.TargetID = targetID
	return base
}
// genPlaceHolderGroup builds and marshals a PlaceholderGroup containing nq
// random float vectors of defaultDim dimensions for search requests.
func genPlaceHolderGroup(nq int64) ([]byte, error) {
	placeholderValue := &commonpb.PlaceholderValue{
		Tag:  "$0",
		Type: commonpb.PlaceholderType_FloatVector,
		// Preallocate: the number of vectors is known up front.
		Values: make([][]byte, 0, nq),
	}
	// Reusable 4-byte scratch buffer instead of one allocation per element.
	buf := make([]byte, 4)
	for i := int64(0); i < nq; i++ {
		vec := make([]float32, defaultDim)
		for j := 0; j < defaultDim; j++ {
			vec[j] = rand.Float32()
		}
		rawData := make([]byte, 0, defaultDim*4)
		for k, ele := range vec {
			// Offset each element by 2*k, as in the original fixture.
			common.Endian.PutUint32(buf, math.Float32bits(ele+float32(k*2)))
			rawData = append(rawData, buf...)
		}
		placeholderValue.Values = append(placeholderValue.Values, rawData)
	}
	placeholderGroup := commonpb.PlaceholderGroup{
		Placeholders: []*commonpb.PlaceholderValue{placeholderValue},
	}
	return proto.Marshal(&placeholderGroup)
}
// genDSLByIndexType returns the text-format search plan for the given index
// type; only brute-force (IDMap) and HNSW are supported by this helper.
func genDSLByIndexType(schema *schemapb.CollectionSchema, indexType string) (string, error) {
	switch indexType {
	case IndexFaissIDMap: // float vector, brute-force search
		return genBruteForceDSL(schema, defaultTopK, defaultRoundDecimal)
	case IndexHNSW:
		return genHNSWDSL(schema, ef, defaultTopK, defaultRoundDecimal)
	default:
		// Error strings are lowercase per Go convention; include the
		// offending index type for easier debugging.
		return "", fmt.Errorf("invalid index type: %s", indexType)
	}
}
// genBruteForceDSL renders a text-format vector_anns plan for brute-force
// search, using the last dense float-family vector field in the schema and
// its metric type from the field's index params.
func genBruteForceDSL(schema *schemapb.CollectionSchema, topK int64, roundDecimal int64) (string, error) {
	var (
		vecFieldName string
		metricType   string
		fieldID      int64
	)
	for _, field := range schema.Fields {
		switch field.DataType {
		case schemapb.DataType_FloatVector, schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector:
			vecFieldName = field.Name
			fieldID = field.FieldID
			for _, kv := range field.IndexParams {
				if kv.Key == metricTypeKey {
					metricType = kv.Value
				}
			}
		}
	}
	if vecFieldName == "" || metricType == "" {
		return "", errors.New("invalid vector field name or metric type")
	}
	topKStr := strconv.FormatInt(topK, 10)
	nProbStr := strconv.Itoa(defaultNProb)
	roundDecimalStr := strconv.FormatInt(roundDecimal, 10)
	return `vector_anns: <
  field_id: ` + fmt.Sprintf("%d", fieldID) + `
  query_info: <
    topk: ` + topKStr + `
    round_decimal: ` + roundDecimalStr + `
    metric_type: "` + metricType + `"
    search_params: "{\"nprobe\": ` + nProbStr + `}"
  >
  placeholder_tag: "$0"
>`, nil
}
// genHNSWDSL renders a text-format vector_anns plan for HNSW search, using
// the last FloatVector field in the schema and its metric type from the
// field's index params.
func genHNSWDSL(schema *schemapb.CollectionSchema, ef int, topK int64, roundDecimal int64) (string, error) {
	var (
		vecFieldName string
		metricType   string
		fieldID      int64
	)
	for _, field := range schema.Fields {
		if field.DataType != schemapb.DataType_FloatVector {
			continue
		}
		vecFieldName = field.Name
		fieldID = field.FieldID
		for _, kv := range field.IndexParams {
			if kv.Key == metricTypeKey {
				metricType = kv.Value
			}
		}
	}
	if vecFieldName == "" || metricType == "" {
		return "", errors.New("invalid vector field name or metric type")
	}
	efStr := strconv.Itoa(ef)
	topKStr := strconv.FormatInt(topK, 10)
	roundDecimalStr := strconv.FormatInt(roundDecimal, 10)
	return `vector_anns: <
  field_id: ` + fmt.Sprintf("%d", fieldID) + `
  query_info: <
    topk: ` + topKStr + `
    round_decimal: ` + roundDecimalStr + `
    metric_type: "` + metricType + `"
    search_params: "{\"ef\": ` + efStr + `}"
  >
  placeholder_tag: "$0"
>`, nil
}
// checkSearchResult reduces a single segcore search result into five NQ
// slices and validates the topK, nq, IDs and scores of every slice.
func checkSearchResult(ctx context.Context, nq int64, plan *SearchPlan, searchResult *SearchResult) error {
	searchResults := []*SearchResult{searchResult}
	// Previously the C-side results were only freed on the success path,
	// leaking them on every early error return; defer guarantees cleanup.
	defer DeleteSearchResults(searchResults)

	topK := plan.getTopK()
	sliceNQs := []int64{nq / 5, nq / 5, nq / 5, nq / 5, nq / 5}
	sliceTopKs := []int64{topK, topK / 2, topK, topK, topK / 2}
	sInfo := ParseSliceInfo(sliceNQs, sliceTopKs, nq)

	res, err := ReduceSearchResultsAndFillData(ctx, plan, searchResults, 1, sInfo.SliceNQs, sInfo.SliceTopKs)
	if err != nil {
		return err
	}
	// Same leak fix for the reduced data blobs.
	defer DeleteSearchResultDataBlobs(res)

	for i := 0; i < len(sInfo.SliceNQs); i++ {
		blob, err := GetSearchResultDataBlob(ctx, res, i)
		if err != nil {
			return err
		}
		if len(blob) == 0 {
			return fmt.Errorf("wrong search result data blobs when checkSearchResult")
		}
		result := &schemapb.SearchResultData{}
		if err := proto.Unmarshal(blob, result); err != nil {
			return err
		}
		if result.TopK != sliceTopKs[i] {
			return fmt.Errorf("unexpected topK when checkSearchResult")
		}
		if result.NumQueries != sInfo.SliceNQs[i] {
			return fmt.Errorf("unexpected nq when checkSearchResult")
		}
		// search empty segment, return empty result.IDs
		if len(result.Ids.IdField.(*schemapb.IDs_IntId).IntId.Data) <= 0 {
			return fmt.Errorf("unexpected Ids when checkSearchResult")
		}
		if len(result.Scores) <= 0 {
			return fmt.Errorf("unexpected Scores when checkSearchResult")
		}
	}
	return nil
}
// genSearchPlanAndRequests builds a querypb.SearchRequest over the given
// segments and wraps it into a segcore SearchRequest.
func genSearchPlanAndRequests(collection *Collection, segments []int64, indexType string, nq int64) (*SearchRequest, error) {
	// The error was previously discarded with "_"; a failed generation
	// left iReq nil and crashed on iReq.GetPlaceholderGroup() below.
	iReq, err := genSearchRequest(nq, indexType, collection)
	if err != nil {
		return nil, err
	}
	queryReq := &querypb.SearchRequest{
		Req:             iReq,
		DmlChannels:     []string{"dml"},
		SegmentIDs:      segments,
		FromShardLeader: true,
		Scope:           querypb.DataScope_Historical,
	}
	return NewSearchRequest(context.Background(), collection, queryReq, queryReq.Req.GetPlaceholderGroup())
}
// genInsertMsg builds a column-based InsertMsg with numRows of generated test
// data for every field in the collection schema. Scalar fields use the
// simpleXxxField fixture names; vector fields use the schema field's own name
// and the matching fixture's dimension. Returns an error for unsupported
// data types.
func genInsertMsg(collection *Collection, partitionID, segment int64, numRows int) (*msgstream.InsertMsg, error) {
	fieldsData := make([]*schemapb.FieldData, 0)
	for _, f := range collection.Schema().Fields {
		switch f.DataType {
		case schemapb.DataType_Bool:
			fieldsData = append(fieldsData, GenTestScalarFieldData(f.DataType, simpleBoolField.fieldName, f.GetFieldID(), numRows))
		case schemapb.DataType_Int8:
			fieldsData = append(fieldsData, GenTestScalarFieldData(f.DataType, simpleInt8Field.fieldName, f.GetFieldID(), numRows))
		case schemapb.DataType_Int16:
			fieldsData = append(fieldsData, GenTestScalarFieldData(f.DataType, simpleInt16Field.fieldName, f.GetFieldID(), numRows))
		case schemapb.DataType_Int32:
			fieldsData = append(fieldsData, GenTestScalarFieldData(f.DataType, simpleInt32Field.fieldName, f.GetFieldID(), numRows))
		case schemapb.DataType_Int64:
			fieldsData = append(fieldsData, GenTestScalarFieldData(f.DataType, simpleInt64Field.fieldName, f.GetFieldID(), numRows))
		case schemapb.DataType_Float:
			fieldsData = append(fieldsData, GenTestScalarFieldData(f.DataType, simpleFloatField.fieldName, f.GetFieldID(), numRows))
		case schemapb.DataType_Double:
			fieldsData = append(fieldsData, GenTestScalarFieldData(f.DataType, simpleDoubleField.fieldName, f.GetFieldID(), numRows))
		case schemapb.DataType_VarChar:
			fieldsData = append(fieldsData, GenTestScalarFieldData(f.DataType, simpleVarCharField.fieldName, f.GetFieldID(), numRows))
		case schemapb.DataType_Array:
			fieldsData = append(fieldsData, GenTestScalarFieldData(f.DataType, simpleArrayField.fieldName, f.GetFieldID(), numRows))
		case schemapb.DataType_JSON:
			fieldsData = append(fieldsData, GenTestScalarFieldData(f.DataType, simpleJSONField.fieldName, f.GetFieldID(), numRows))
		case schemapb.DataType_FloatVector:
			dim := simpleFloatVecField.dim // schema does not carry dim here; use the float-vector fixture's dim
			fieldsData = append(fieldsData, GenTestVectorFiledData(f.DataType, f.Name, f.FieldID, numRows, dim))
		case schemapb.DataType_BinaryVector:
			dim := simpleBinVecField.dim // use the binary-vector fixture's dim
			fieldsData = append(fieldsData, GenTestVectorFiledData(f.DataType, f.Name, f.FieldID, numRows, dim))
		case schemapb.DataType_Float16Vector:
			dim := simpleFloat16VecField.dim // use the float16-vector fixture's dim
			fieldsData = append(fieldsData, GenTestVectorFiledData(f.DataType, f.Name, f.FieldID, numRows, dim))
		case schemapb.DataType_BFloat16Vector:
			dim := simpleBFloat16VecField.dim // use the bfloat16-vector fixture's dim
			fieldsData = append(fieldsData, GenTestVectorFiledData(f.DataType, f.Name, f.FieldID, numRows, dim))
		case schemapb.DataType_SparseFloatVector:
			// sparse vectors have no fixed dim; 0 is passed as a placeholder
			fieldsData = append(fieldsData, GenTestVectorFiledData(f.DataType, f.Name, f.FieldID, numRows, 0))
		default:
			err := errors.New("data type not supported")
			return nil, err
		}
	}
	return &msgstream.InsertMsg{
		BaseMsg: genMsgStreamBaseMsg(),
		InsertRequest: msgpb.InsertRequest{
			Base:           genCommonMsgBase(commonpb.MsgType_Insert, 0),
			CollectionName: "test-collection",
			PartitionName:  "test-partition",
			CollectionID:   collection.ID(),
			PartitionID:    partitionID,
			SegmentID:      segment,
			ShardName:      "dml",
			Timestamps:     genSimpleTimestampFieldData(numRows),
			RowIDs:         genSimpleRowIDField(numRows),
			FieldsData:     fieldsData,
			NumRows:        uint64(numRows),
			Version:        msgpb.InsertDataVersion_ColumnBased,
		},
	}, nil
}
// genMsgStreamBaseMsg returns a minimal BaseMsg fixture: zero begin/end
// timestamps, a single hash value, and a position stamped at timestamp 10.
func genMsgStreamBaseMsg() msgstream.BaseMsg {
	position := &msgpb.MsgPosition{
		ChannelName: "",
		MsgID:       []byte{},
		MsgGroup:    "",
		Timestamp:   10,
	}
	return msgstream.BaseMsg{
		Ctx:            context.Background(),
		BeginTimestamp: 0,
		EndTimestamp:   0,
		HashValues:     []uint32{0},
		MsgPosition:    position,
	}
}
// genSimpleTimestampFieldData returns numRows ascending timestamps
// [1, 1, 2, 3, ...]; index 0 is forced to 1 because timestamp 0 is not
// allowed. Returns nil for numRows <= 0 (the original panicked on 0).
func genSimpleTimestampFieldData(numRows int) []uint64 {
	if numRows <= 0 {
		return nil
	}
	times := make([]uint64, numRows)
	for i := range times {
		times[i] = uint64(i)
	}
	// timestamp 0 is not allowed
	times[0] = 1
	return times
}
// genSimpleRowIDField returns numRows sequential row IDs 0..numRows-1.
func genSimpleRowIDField(numRows int) []int64 {
	ids := make([]int64, 0, numRows)
	for i := 0; i < numRows; i++ {
		ids = append(ids, int64(i))
	}
	return ids
}
// genSimpleRetrievePlan builds a RetrievePlan from the simple term-expression
// retrieve plan over the collection's primary key, at timestamp 1000 with a
// message ID of 100.
func genSimpleRetrievePlan(collection *Collection) (*RetrievePlan, error) {
	timestamp := storage.Timestamp(1000)
	planBytes, err := genSimpleRetrievePlanExpr(collection.schema.Load())
	if err != nil {
		return nil, err
	}
	// Return the constructor's results directly instead of staging them in
	// the non-idiomatic (plan, err2) pair.
	return NewRetrievePlan(context.Background(), collection, planBytes, timestamp, 100)
}
// genSimpleRetrievePlanExpr marshals a plan whose predicate is
// "pk in [1, 2, 3]" and whose output field is the primary key itself.
func genSimpleRetrievePlanExpr(schema *schemapb.CollectionSchema) ([]byte, error) {
	pkField, err := typeutil.GetPrimaryFieldSchema(schema)
	if err != nil {
		return nil, err
	}
	// Build the three int64 term values 1, 2, 3.
	termValues := make([]*planpb.GenericValue, 0, 3)
	for _, v := range []int64{1, 2, 3} {
		termValues = append(termValues, &planpb.GenericValue{
			Val: &planpb.GenericValue_Int64Val{Int64Val: v},
		})
	}
	planNode := &planpb.PlanNode{
		Node: &planpb.PlanNode_Predicates{
			Predicates: &planpb.Expr{
				Expr: &planpb.Expr_TermExpr{
					TermExpr: &planpb.TermExpr{
						ColumnInfo: &planpb.ColumnInfo{
							FieldId:  pkField.FieldID,
							DataType: pkField.DataType,
						},
						Values: termValues,
					},
				},
			},
		},
		OutputFieldIds: []int64{pkField.FieldID},
	}
	return proto.Marshal(planNode)
}
// genFieldData wraps a raw Go value (e.g. []bool, []float32, []byte) into a
// schemapb.FieldData of the given type. dim is only meaningful for vector
// types. An unsupported type is logged and yields a nil result, matching the
// original behavior. The value must already have the Go type matching
// fieldType; the type assertions panic otherwise.
func genFieldData(fieldName string, fieldID int64, fieldType schemapb.DataType, fieldValue interface{}, dim int64) *schemapb.FieldData {
	// scalar and vector build the shared FieldData envelope once, so each
	// case below only has to supply its typed payload (the original
	// repeated the envelope in all twelve branches).
	scalar := func(data *schemapb.ScalarField) *schemapb.FieldData {
		return &schemapb.FieldData{
			Type:      fieldType,
			FieldName: fieldName,
			Field:     &schemapb.FieldData_Scalars{Scalars: data},
			FieldId:   fieldID,
		}
	}
	vector := func(data *schemapb.VectorField) *schemapb.FieldData {
		data.Dim = dim
		return &schemapb.FieldData{
			Type:      fieldType,
			FieldName: fieldName,
			Field:     &schemapb.FieldData_Vectors{Vectors: data},
			FieldId:   fieldID,
		}
	}

	switch fieldType {
	case schemapb.DataType_Bool:
		return scalar(&schemapb.ScalarField{Data: &schemapb.ScalarField_BoolData{
			BoolData: &schemapb.BoolArray{Data: fieldValue.([]bool)},
		}})
	case schemapb.DataType_Int32:
		return scalar(&schemapb.ScalarField{Data: &schemapb.ScalarField_IntData{
			IntData: &schemapb.IntArray{Data: fieldValue.([]int32)},
		}})
	case schemapb.DataType_Int64:
		return scalar(&schemapb.ScalarField{Data: &schemapb.ScalarField_LongData{
			LongData: &schemapb.LongArray{Data: fieldValue.([]int64)},
		}})
	case schemapb.DataType_Float:
		return scalar(&schemapb.ScalarField{Data: &schemapb.ScalarField_FloatData{
			FloatData: &schemapb.FloatArray{Data: fieldValue.([]float32)},
		}})
	case schemapb.DataType_Double:
		return scalar(&schemapb.ScalarField{Data: &schemapb.ScalarField_DoubleData{
			DoubleData: &schemapb.DoubleArray{Data: fieldValue.([]float64)},
		}})
	case schemapb.DataType_VarChar:
		return scalar(&schemapb.ScalarField{Data: &schemapb.ScalarField_StringData{
			StringData: &schemapb.StringArray{Data: fieldValue.([]string)},
		}})
	case schemapb.DataType_JSON:
		return scalar(&schemapb.ScalarField{Data: &schemapb.ScalarField_JsonData{
			JsonData: &schemapb.JSONArray{Data: fieldValue.([][]byte)},
		}})
	case schemapb.DataType_Array:
		return scalar(&schemapb.ScalarField{Data: &schemapb.ScalarField_ArrayData{
			ArrayData: &schemapb.ArrayArray{Data: fieldValue.([]*schemapb.ScalarField)},
		}})
	case schemapb.DataType_BinaryVector:
		return vector(&schemapb.VectorField{Data: &schemapb.VectorField_BinaryVector{
			BinaryVector: fieldValue.([]byte),
		}})
	case schemapb.DataType_FloatVector:
		return vector(&schemapb.VectorField{Data: &schemapb.VectorField_FloatVector{
			FloatVector: &schemapb.FloatArray{Data: fieldValue.([]float32)},
		}})
	case schemapb.DataType_Float16Vector:
		return vector(&schemapb.VectorField{Data: &schemapb.VectorField_Float16Vector{
			Float16Vector: fieldValue.([]byte),
		}})
	case schemapb.DataType_BFloat16Vector:
		return vector(&schemapb.VectorField{Data: &schemapb.VectorField_Bfloat16Vector{
			Bfloat16Vector: fieldValue.([]byte),
		}})
	default:
		log.Error("not supported field type", zap.String("field type", fieldType.String()))
		return nil
	}
}
// genSearchResultData assembles a SearchResultData fixture with int64 IDs.
// NOTE(review): nq is accepted but unused and NumQueries is hard-coded to 1 —
// presumably intentional for single-query fixtures; confirm with callers.
func genSearchResultData(nq int64, topk int64, ids []int64, scores []float32, topks []int64) *schemapb.SearchResultData {
	result := &schemapb.SearchResultData{
		NumQueries: 1,
		TopK:       topk,
		FieldsData: nil,
		Scores:     scores,
		Topks:      topks,
	}
	result.Ids = &schemapb.IDs{
		IdField: &schemapb.IDs_IntId{
			IntId: &schemapb.LongArray{Data: ids},
		},
	}
	return result
}