mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 09:38:39 +08:00
enhance: bulkinsert handles nullable/default (#42127)
issue: https://github.com/milvus-io/milvus/issues/42096, https://github.com/milvus-io/milvus/issues/42130 Signed-off-by: yhmo <yihua.mo@zilliz.com>
This commit is contained in:
parent
79b51cbb73
commit
14563ad2b3
@ -104,6 +104,30 @@ func HashDeleteData(task Task, delData *storage.DeleteData) ([]*storage.DeleteDa
|
|||||||
return res, nil
|
return res, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// this method is only for GetRowsStats() to get a row from storage.InsertData
|
||||||
|
// the GetRowsStats() is called by PreImportTask, some of nullable/default_value fields in the storage.InsertData could be zero row
|
||||||
|
func getRowFromInsertData(rows *storage.InsertData, i int) map[int64]interface{} {
|
||||||
|
res := make(map[int64]interface{})
|
||||||
|
for field, data := range rows.Data {
|
||||||
|
if data.RowNum() > i {
|
||||||
|
res[field] = data.GetRow(i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
// this method is only for GetRowsStats() to get a row from storage.InsertData
|
||||||
|
// the GetRowsStats() is called by PreImportTask, some of nullable/default_value fields in the storage.InsertData could be zero row
|
||||||
|
func getRowSizeFromInsertData(rows *storage.InsertData, i int) int {
|
||||||
|
size := 0
|
||||||
|
for _, data := range rows.Data {
|
||||||
|
if data.RowNum() > i {
|
||||||
|
size += data.GetRowSize(i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return size
|
||||||
|
}
|
||||||
|
|
||||||
func GetRowsStats(task Task, rows *storage.InsertData) (map[string]*datapb.PartitionImportStats, error) {
|
func GetRowsStats(task Task, rows *storage.InsertData) (map[string]*datapb.PartitionImportStats, error) {
|
||||||
var (
|
var (
|
||||||
schema = task.GetSchema()
|
schema = task.GetSchema()
|
||||||
@ -127,7 +151,7 @@ func GetRowsStats(task Task, rows *storage.InsertData) (map[string]*datapb.Parti
|
|||||||
hashDataSize[i] = make([]int, partitionNum)
|
hashDataSize[i] = make([]int, partitionNum)
|
||||||
}
|
}
|
||||||
|
|
||||||
rowNum := GetInsertDataRowCount(rows, schema)
|
rowNum, _ := GetInsertDataRowCount(rows, schema)
|
||||||
if pkField.GetAutoID() {
|
if pkField.GetAutoID() {
|
||||||
fn := hashByPartition(int64(partitionNum), partKeyField)
|
fn := hashByPartition(int64(partitionNum), partKeyField)
|
||||||
rows.Data = lo.PickBy(rows.Data, func(fieldID int64, _ storage.FieldData) bool {
|
rows.Data = lo.PickBy(rows.Data, func(fieldID int64, _ storage.FieldData) bool {
|
||||||
@ -136,9 +160,10 @@ func GetRowsStats(task Task, rows *storage.InsertData) (map[string]*datapb.Parti
|
|||||||
hashByPartRowsCount := make([]int, partitionNum)
|
hashByPartRowsCount := make([]int, partitionNum)
|
||||||
hashByPartDataSize := make([]int, partitionNum)
|
hashByPartDataSize := make([]int, partitionNum)
|
||||||
for i := 0; i < rowNum; i++ {
|
for i := 0; i < rowNum; i++ {
|
||||||
p := fn(rows.GetRow(i)[id2])
|
row := getRowFromInsertData(rows, i)
|
||||||
|
p := fn(row[id2])
|
||||||
hashByPartRowsCount[p]++
|
hashByPartRowsCount[p]++
|
||||||
hashByPartDataSize[p] += rows.GetRowSize(i)
|
hashByPartDataSize[p] += getRowSizeFromInsertData(rows, i)
|
||||||
}
|
}
|
||||||
// When autoID is enabled, the generated IDs will be evenly hashed across all channels.
|
// When autoID is enabled, the generated IDs will be evenly hashed across all channels.
|
||||||
// Therefore, here we just assign an average number of rows to each channel.
|
// Therefore, here we just assign an average number of rows to each channel.
|
||||||
@ -152,10 +177,10 @@ func GetRowsStats(task Task, rows *storage.InsertData) (map[string]*datapb.Parti
|
|||||||
f1 := hashByVChannel(int64(channelNum), pkField)
|
f1 := hashByVChannel(int64(channelNum), pkField)
|
||||||
f2 := hashByPartition(int64(partitionNum), partKeyField)
|
f2 := hashByPartition(int64(partitionNum), partKeyField)
|
||||||
for i := 0; i < rowNum; i++ {
|
for i := 0; i < rowNum; i++ {
|
||||||
row := rows.GetRow(i)
|
row := getRowFromInsertData(rows, i)
|
||||||
p1, p2 := f1(row[id1]), f2(row[id2])
|
p1, p2 := f1(row[id1]), f2(row[id2])
|
||||||
hashRowsCount[p1][p2]++
|
hashRowsCount[p1][p2]++
|
||||||
hashDataSize[p1][p2] += rows.GetRowSize(i)
|
hashDataSize[p1][p2] += getRowSizeFromInsertData(rows, i)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -191,7 +191,7 @@ func (t *ImportTask) importFile(reader importutilv2.Reader) error {
|
|||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
rowNum := GetInsertDataRowCount(data, t.GetSchema())
|
rowNum, _ := GetInsertDataRowCount(data, t.GetSchema())
|
||||||
if rowNum == 0 {
|
if rowNum == 0 {
|
||||||
log.Info("0 row was imported, the data may have been deleted", WrapLogFields(t)...)
|
log.Info("0 row was imported, the data may have been deleted", WrapLogFields(t)...)
|
||||||
continue
|
continue
|
||||||
@ -200,6 +200,10 @@ func (t *ImportTask) importFile(reader importutilv2.Reader) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
err = AppendNullableDefaultFieldsData(t.GetSchema(), data, rowNum)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
if !importutilv2.IsBackup(t.req.GetOptions()) {
|
if !importutilv2.IsBackup(t.req.GetOptions()) {
|
||||||
err = RunEmbeddingFunction(t, data)
|
err = RunEmbeddingFunction(t, data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@ -141,17 +141,9 @@ func CheckRowsEqual(schema *schemapb.CollectionSchema, data *storage.InsertData)
|
|||||||
return field.GetFieldID()
|
return field.GetFieldID()
|
||||||
})
|
})
|
||||||
|
|
||||||
var field int64
|
rows, field := GetInsertDataRowCount(data, schema)
|
||||||
var rows int
|
|
||||||
for fieldID, d := range data.Data {
|
for fieldID, d := range data.Data {
|
||||||
if idToField[fieldID].GetIsPrimaryKey() && idToField[fieldID].GetAutoID() {
|
if d.RowNum() == 0 && (CanBeZeroRowField(idToField[fieldID])) {
|
||||||
continue
|
|
||||||
}
|
|
||||||
field, rows = fieldID, d.RowNum()
|
|
||||||
break
|
|
||||||
}
|
|
||||||
for fieldID, d := range data.Data {
|
|
||||||
if idToField[fieldID].GetIsPrimaryKey() && idToField[fieldID].GetAutoID() {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if d.RowNum() != rows {
|
if d.RowNum() != rows {
|
||||||
@ -201,6 +193,156 @@ func AppendSystemFieldsData(task *ImportTask, data *storage.InsertData, rowNum i
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type nullDefaultAppender[T any] struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *nullDefaultAppender[T]) AppendDefault(fieldData storage.FieldData, defaultVal T, rowNum int) error {
|
||||||
|
values := make([]T, rowNum)
|
||||||
|
if fieldData.GetNullable() {
|
||||||
|
validData := make([]bool, rowNum)
|
||||||
|
for i := 0; i < rowNum; i++ {
|
||||||
|
validData[i] = true // all true
|
||||||
|
values[i] = defaultVal // fill with default value
|
||||||
|
}
|
||||||
|
return fieldData.AppendRows(values, validData)
|
||||||
|
} else {
|
||||||
|
for i := 0; i < rowNum; i++ {
|
||||||
|
values[i] = defaultVal // fill with default value
|
||||||
|
}
|
||||||
|
return fieldData.AppendDataRows(values)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *nullDefaultAppender[T]) AppendNull(fieldData storage.FieldData, rowNum int) error {
|
||||||
|
if fieldData.GetNullable() {
|
||||||
|
values := make([]T, rowNum)
|
||||||
|
validData := make([]bool, rowNum)
|
||||||
|
for i := 0; i < rowNum; i++ {
|
||||||
|
validData[i] = false
|
||||||
|
}
|
||||||
|
return fieldData.AppendRows(values, validData)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func IsFillableField(field *schemapb.FieldSchema) bool {
|
||||||
|
nullable := field.GetNullable()
|
||||||
|
defaultVal := field.GetDefaultValue()
|
||||||
|
return nullable || defaultVal != nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func AppendNullableDefaultFieldsData(schema *schemapb.CollectionSchema, data *storage.InsertData, rowNum int) error {
|
||||||
|
for _, field := range schema.GetFields() {
|
||||||
|
if !IsFillableField(field) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if tempData, ok := data.Data[field.GetFieldID()]; ok {
|
||||||
|
if tempData.RowNum() > 0 {
|
||||||
|
continue // values have been read from data file
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// add a new column and fill with null or default
|
||||||
|
dataType := field.GetDataType()
|
||||||
|
fieldData, err := storage.NewFieldData(dataType, field, rowNum)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data.Data[field.GetFieldID()] = fieldData
|
||||||
|
|
||||||
|
nullable := field.GetNullable()
|
||||||
|
defaultVal := field.GetDefaultValue()
|
||||||
|
|
||||||
|
// bool/int8/int16/int32/int64/float/double/varchar/json/array can be null value
|
||||||
|
// bool/int8/int16/int32/int64/float/double/varchar can be default value
|
||||||
|
switch dataType {
|
||||||
|
case schemapb.DataType_Bool:
|
||||||
|
appender := &nullDefaultAppender[bool]{}
|
||||||
|
if defaultVal != nil {
|
||||||
|
v := defaultVal.GetBoolData()
|
||||||
|
err = appender.AppendDefault(fieldData, v, rowNum)
|
||||||
|
} else if nullable {
|
||||||
|
err = appender.AppendNull(fieldData, rowNum)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Int8:
|
||||||
|
appender := &nullDefaultAppender[int8]{}
|
||||||
|
if defaultVal != nil {
|
||||||
|
v := defaultVal.GetIntData()
|
||||||
|
err = appender.AppendDefault(fieldData, int8(v), rowNum)
|
||||||
|
} else if nullable {
|
||||||
|
err = appender.AppendNull(fieldData, rowNum)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Int16:
|
||||||
|
appender := &nullDefaultAppender[int16]{}
|
||||||
|
if defaultVal != nil {
|
||||||
|
v := defaultVal.GetIntData()
|
||||||
|
err = appender.AppendDefault(fieldData, int16(v), rowNum)
|
||||||
|
} else if nullable {
|
||||||
|
err = appender.AppendNull(fieldData, rowNum)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Int32:
|
||||||
|
appender := &nullDefaultAppender[int32]{}
|
||||||
|
if defaultVal != nil {
|
||||||
|
v := defaultVal.GetIntData()
|
||||||
|
err = appender.AppendDefault(fieldData, int32(v), rowNum)
|
||||||
|
} else if nullable {
|
||||||
|
err = appender.AppendNull(fieldData, rowNum)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Int64:
|
||||||
|
appender := &nullDefaultAppender[int64]{}
|
||||||
|
if defaultVal != nil {
|
||||||
|
v := defaultVal.GetLongData()
|
||||||
|
err = appender.AppendDefault(fieldData, v, rowNum)
|
||||||
|
} else if nullable {
|
||||||
|
err = appender.AppendNull(fieldData, rowNum)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Float:
|
||||||
|
appender := &nullDefaultAppender[float32]{}
|
||||||
|
if defaultVal != nil {
|
||||||
|
v := defaultVal.GetFloatData()
|
||||||
|
err = appender.AppendDefault(fieldData, v, rowNum)
|
||||||
|
} else if nullable {
|
||||||
|
err = appender.AppendNull(fieldData, rowNum)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Double:
|
||||||
|
appender := &nullDefaultAppender[float64]{}
|
||||||
|
if defaultVal != nil {
|
||||||
|
v := defaultVal.GetDoubleData()
|
||||||
|
err = appender.AppendDefault(fieldData, v, rowNum)
|
||||||
|
} else if nullable {
|
||||||
|
err = appender.AppendNull(fieldData, rowNum)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_VarChar:
|
||||||
|
appender := &nullDefaultAppender[string]{}
|
||||||
|
if defaultVal != nil {
|
||||||
|
v := defaultVal.GetStringData()
|
||||||
|
err = appender.AppendDefault(fieldData, v, rowNum)
|
||||||
|
} else if nullable {
|
||||||
|
err = appender.AppendNull(fieldData, rowNum)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_JSON:
|
||||||
|
if nullable {
|
||||||
|
appender := &nullDefaultAppender[[]byte]{}
|
||||||
|
err = appender.AppendNull(fieldData, rowNum)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Array:
|
||||||
|
if nullable {
|
||||||
|
appender := &nullDefaultAppender[*schemapb.ScalarField]{}
|
||||||
|
err = appender.AppendNull(fieldData, rowNum)
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return fmt.Errorf("Unexpected data type: %d, cannot be filled with default value", dataType)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func RunEmbeddingFunction(task *ImportTask, data *storage.InsertData) error {
|
func RunEmbeddingFunction(task *ImportTask, data *storage.InsertData) error {
|
||||||
if err := RunBm25Function(task, data); err != nil {
|
if err := RunBm25Function(task, data); err != nil {
|
||||||
return err
|
return err
|
||||||
@ -275,19 +417,34 @@ func RunBm25Function(task *ImportTask, data *storage.InsertData) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetInsertDataRowCount(data *storage.InsertData, schema *schemapb.CollectionSchema) int {
|
func CanBeZeroRowField(field *schemapb.FieldSchema) bool {
|
||||||
|
if field.GetIsPrimaryKey() && field.GetAutoID() {
|
||||||
|
return true // auto-generated primary key, the row count must be 0
|
||||||
|
}
|
||||||
|
if field.GetIsDynamic() {
|
||||||
|
return true // dyanmic field, row count could be 0
|
||||||
|
}
|
||||||
|
if IsFillableField(field) {
|
||||||
|
return true // nullable/default_value field can be automatically filled if the file doesn't contain this column
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetInsertDataRowCount(data *storage.InsertData, schema *schemapb.CollectionSchema) (int, int64) {
|
||||||
fields := lo.KeyBy(schema.GetFields(), func(field *schemapb.FieldSchema) int64 {
|
fields := lo.KeyBy(schema.GetFields(), func(field *schemapb.FieldSchema) int64 {
|
||||||
return field.GetFieldID()
|
return field.GetFieldID()
|
||||||
})
|
})
|
||||||
for fieldID, fd := range data.Data {
|
for fieldID, fd := range data.Data {
|
||||||
if fields[fieldID].GetIsDynamic() {
|
if fd.RowNum() == 0 && CanBeZeroRowField(fields[fieldID]) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// each collection must contain at least one vector field, so there must be one field whose row number is not 0
|
||||||
if fd.RowNum() != 0 {
|
if fd.RowNum() != 0 {
|
||||||
return fd.RowNum()
|
return fd.RowNum(), fieldID
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0
|
return 0, 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func LogStats(manager TaskManager) {
|
func LogStats(manager TaskManager) {
|
||||||
|
|||||||
@ -17,6 +17,7 @@
|
|||||||
package importv2
|
package importv2
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
@ -24,6 +25,7 @@ import (
|
|||||||
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
||||||
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
||||||
"github.com/milvus-io/milvus/internal/allocator"
|
"github.com/milvus-io/milvus/internal/allocator"
|
||||||
|
"github.com/milvus-io/milvus/internal/storage"
|
||||||
"github.com/milvus-io/milvus/internal/util/testutil"
|
"github.com/milvus-io/milvus/internal/util/testutil"
|
||||||
"github.com/milvus-io/milvus/pkg/v2/common"
|
"github.com/milvus-io/milvus/pkg/v2/common"
|
||||||
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
|
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
|
||||||
@ -71,7 +73,7 @@ func Test_AppendSystemFieldsData(t *testing.T) {
|
|||||||
assert.Equal(t, 0, insertData.Data[pkField.GetFieldID()].RowNum())
|
assert.Equal(t, 0, insertData.Data[pkField.GetFieldID()].RowNum())
|
||||||
assert.Nil(t, insertData.Data[common.RowIDField])
|
assert.Nil(t, insertData.Data[common.RowIDField])
|
||||||
assert.Nil(t, insertData.Data[common.TimeStampField])
|
assert.Nil(t, insertData.Data[common.TimeStampField])
|
||||||
rowNum := GetInsertDataRowCount(insertData, task.GetSchema())
|
rowNum, _ := GetInsertDataRowCount(insertData, task.GetSchema())
|
||||||
err = AppendSystemFieldsData(task, insertData, rowNum)
|
err = AppendSystemFieldsData(task, insertData, rowNum)
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
assert.Equal(t, count, insertData.Data[pkField.GetFieldID()].RowNum())
|
assert.Equal(t, count, insertData.Data[pkField.GetFieldID()].RowNum())
|
||||||
@ -85,7 +87,7 @@ func Test_AppendSystemFieldsData(t *testing.T) {
|
|||||||
assert.Equal(t, 0, insertData.Data[pkField.GetFieldID()].RowNum())
|
assert.Equal(t, 0, insertData.Data[pkField.GetFieldID()].RowNum())
|
||||||
assert.Nil(t, insertData.Data[common.RowIDField])
|
assert.Nil(t, insertData.Data[common.RowIDField])
|
||||||
assert.Nil(t, insertData.Data[common.TimeStampField])
|
assert.Nil(t, insertData.Data[common.TimeStampField])
|
||||||
rowNum = GetInsertDataRowCount(insertData, task.GetSchema())
|
rowNum, _ = GetInsertDataRowCount(insertData, task.GetSchema())
|
||||||
err = AppendSystemFieldsData(task, insertData, rowNum)
|
err = AppendSystemFieldsData(task, insertData, rowNum)
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
assert.Equal(t, count, insertData.Data[pkField.GetFieldID()].RowNum())
|
assert.Equal(t, count, insertData.Data[pkField.GetFieldID()].RowNum())
|
||||||
@ -175,3 +177,283 @@ func Test_PickSegment(t *testing.T) {
|
|||||||
_, err := PickSegment(task.req.GetRequestSegments(), "ch-2", 20)
|
_, err := PickSegment(task.req.GetRequestSegments(), "ch-2", 20)
|
||||||
assert.Error(t, err)
|
assert.Error(t, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func Test_AppendNullableDefaultFieldsData(t *testing.T) {
|
||||||
|
buildSchemaFn := func() *schemapb.CollectionSchema {
|
||||||
|
fields := make([]*schemapb.FieldSchema, 0)
|
||||||
|
fields = append(fields, &schemapb.FieldSchema{
|
||||||
|
FieldID: 100,
|
||||||
|
Name: "pk",
|
||||||
|
DataType: schemapb.DataType_Int64,
|
||||||
|
IsPrimaryKey: true,
|
||||||
|
AutoID: false,
|
||||||
|
})
|
||||||
|
fields = append(fields, &schemapb.FieldSchema{
|
||||||
|
FieldID: 101,
|
||||||
|
Name: "vec",
|
||||||
|
DataType: schemapb.DataType_FloatVector,
|
||||||
|
TypeParams: []*commonpb.KeyValuePair{
|
||||||
|
{
|
||||||
|
Key: common.DimKey,
|
||||||
|
Value: "4",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
fields = append(fields, &schemapb.FieldSchema{
|
||||||
|
FieldID: 102,
|
||||||
|
Name: "dummy",
|
||||||
|
DataType: schemapb.DataType_Int32,
|
||||||
|
Nullable: true,
|
||||||
|
})
|
||||||
|
|
||||||
|
return &schemapb.CollectionSchema{
|
||||||
|
Fields: fields,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const count = 10
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
fieldID int64
|
||||||
|
dataType schemapb.DataType
|
||||||
|
nullable bool
|
||||||
|
defaultVal *schemapb.ValueField
|
||||||
|
}{
|
||||||
|
// nullable tests
|
||||||
|
{
|
||||||
|
name: "bool is nullable",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Bool,
|
||||||
|
nullable: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "int8 is nullable",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Int8,
|
||||||
|
nullable: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "int16 is nullable",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Int16,
|
||||||
|
nullable: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "int32 is nullable",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Int32,
|
||||||
|
nullable: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "int64 is nullable",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Int64,
|
||||||
|
nullable: true,
|
||||||
|
defaultVal: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "float is nullable",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Float,
|
||||||
|
nullable: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "double is nullable",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Double,
|
||||||
|
nullable: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "varchar is nullable",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_VarChar,
|
||||||
|
nullable: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "json is nullable",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_JSON,
|
||||||
|
nullable: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "array is nullable",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Array,
|
||||||
|
nullable: true,
|
||||||
|
},
|
||||||
|
|
||||||
|
// default value tests
|
||||||
|
{
|
||||||
|
name: "bool is default",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Bool,
|
||||||
|
defaultVal: &schemapb.ValueField{
|
||||||
|
Data: &schemapb.ValueField_BoolData{
|
||||||
|
BoolData: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "int8 is default",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Int8,
|
||||||
|
defaultVal: &schemapb.ValueField{
|
||||||
|
Data: &schemapb.ValueField_IntData{
|
||||||
|
IntData: 99,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "int16 is default",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Int16,
|
||||||
|
defaultVal: &schemapb.ValueField{
|
||||||
|
Data: &schemapb.ValueField_IntData{
|
||||||
|
IntData: 99,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "int32 is default",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Int32,
|
||||||
|
defaultVal: &schemapb.ValueField{
|
||||||
|
Data: &schemapb.ValueField_IntData{
|
||||||
|
IntData: 99,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "int64 is default",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Int64,
|
||||||
|
nullable: true,
|
||||||
|
defaultVal: &schemapb.ValueField{
|
||||||
|
Data: &schemapb.ValueField_LongData{
|
||||||
|
LongData: 99,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "float is default",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Float,
|
||||||
|
defaultVal: &schemapb.ValueField{
|
||||||
|
Data: &schemapb.ValueField_FloatData{
|
||||||
|
FloatData: 99.99,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "double is default",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_Double,
|
||||||
|
defaultVal: &schemapb.ValueField{
|
||||||
|
Data: &schemapb.ValueField_DoubleData{
|
||||||
|
DoubleData: 99.99,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "varchar is default",
|
||||||
|
fieldID: 200,
|
||||||
|
dataType: schemapb.DataType_VarChar,
|
||||||
|
defaultVal: &schemapb.ValueField{
|
||||||
|
Data: &schemapb.ValueField_StringData{
|
||||||
|
StringData: "hello world",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
schema := buildSchemaFn()
|
||||||
|
fieldSchema := &schemapb.FieldSchema{
|
||||||
|
FieldID: tt.fieldID,
|
||||||
|
Name: fmt.Sprintf("field_%d", tt.fieldID),
|
||||||
|
DataType: tt.dataType,
|
||||||
|
Nullable: tt.nullable,
|
||||||
|
DefaultValue: tt.defaultVal,
|
||||||
|
}
|
||||||
|
if tt.dataType == schemapb.DataType_Array {
|
||||||
|
fieldSchema.ElementType = schemapb.DataType_Int64
|
||||||
|
fieldSchema.TypeParams = append(fieldSchema.TypeParams, &commonpb.KeyValuePair{Key: common.MaxCapacityKey, Value: "100"})
|
||||||
|
} else if tt.dataType == schemapb.DataType_VarChar {
|
||||||
|
fieldSchema.TypeParams = append(fieldSchema.TypeParams, &commonpb.KeyValuePair{Key: common.MaxLengthKey, Value: "100"})
|
||||||
|
}
|
||||||
|
|
||||||
|
insertData, err := testutil.CreateInsertData(schema, count)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
schema.Fields = append(schema.Fields, fieldSchema)
|
||||||
|
|
||||||
|
fieldData, err := storage.NewFieldData(fieldSchema.GetDataType(), fieldSchema, 0)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
insertData.Data[fieldSchema.GetFieldID()] = fieldData
|
||||||
|
|
||||||
|
err = AppendNullableDefaultFieldsData(schema, insertData, count)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
for fieldID, fieldData := range insertData.Data {
|
||||||
|
if fieldID < int64(200) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
assert.Equal(t, count, fieldData.RowNum())
|
||||||
|
|
||||||
|
if tt.nullable {
|
||||||
|
assert.True(t, fieldData.GetNullable())
|
||||||
|
}
|
||||||
|
|
||||||
|
if tt.defaultVal != nil {
|
||||||
|
switch tt.dataType {
|
||||||
|
case schemapb.DataType_Bool:
|
||||||
|
tempFieldData := fieldData.(*storage.BoolFieldData)
|
||||||
|
for _, v := range tempFieldData.Data {
|
||||||
|
assert.True(t, v)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Int8:
|
||||||
|
tempFieldData := fieldData.(*storage.Int8FieldData)
|
||||||
|
for _, v := range tempFieldData.Data {
|
||||||
|
assert.Equal(t, int8(99), v)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Int16:
|
||||||
|
tempFieldData := fieldData.(*storage.Int16FieldData)
|
||||||
|
for _, v := range tempFieldData.Data {
|
||||||
|
assert.Equal(t, int16(99), v)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Int32:
|
||||||
|
tempFieldData := fieldData.(*storage.Int32FieldData)
|
||||||
|
for _, v := range tempFieldData.Data {
|
||||||
|
assert.Equal(t, int32(99), v)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Int64:
|
||||||
|
tempFieldData := fieldData.(*storage.Int64FieldData)
|
||||||
|
for _, v := range tempFieldData.Data {
|
||||||
|
assert.Equal(t, int64(99), v)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Float:
|
||||||
|
tempFieldData := fieldData.(*storage.FloatFieldData)
|
||||||
|
for _, v := range tempFieldData.Data {
|
||||||
|
assert.Equal(t, float32(99.99), v)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_Double:
|
||||||
|
tempFieldData := fieldData.(*storage.DoubleFieldData)
|
||||||
|
for _, v := range tempFieldData.Data {
|
||||||
|
assert.Equal(t, float64(99.99), v)
|
||||||
|
}
|
||||||
|
case schemapb.DataType_VarChar:
|
||||||
|
tempFieldData := fieldData.(*storage.StringFieldData)
|
||||||
|
for _, v := range tempFieldData.Data {
|
||||||
|
assert.Equal(t, "hello world", v)
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
} else if tt.nullable {
|
||||||
|
for i := 0; i < count; i++ {
|
||||||
|
assert.Nil(t, fieldData.GetRow(i))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -67,6 +67,10 @@ func CreateFieldReaders(ctx context.Context, fileReader *pqarrow.FileReader, sch
|
|||||||
return nil, merr.WrapErrImportFailed(
|
return nil, merr.WrapErrImportFailed(
|
||||||
fmt.Sprintf("the primary key '%s' is auto-generated, no need to provide", field.GetName()))
|
fmt.Sprintf("the primary key '%s' is auto-generated, no need to provide", field.GetName()))
|
||||||
}
|
}
|
||||||
|
if field.GetIsFunctionOutput() {
|
||||||
|
return nil, merr.WrapErrImportFailed(
|
||||||
|
fmt.Sprintf("the field '%s' is output by function, no need to provide", field.GetName()))
|
||||||
|
}
|
||||||
|
|
||||||
cr, err := NewFieldReader(ctx, fileReader, i, field)
|
cr, err := NewFieldReader(ctx, fileReader, i, field)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -80,7 +84,8 @@ func CreateFieldReaders(ctx context.Context, fileReader *pqarrow.FileReader, sch
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, field := range nameToField {
|
for _, field := range nameToField {
|
||||||
if typeutil.IsAutoPKField(field) || field.GetIsDynamic() || field.GetIsFunctionOutput() {
|
if typeutil.IsAutoPKField(field) || field.GetIsDynamic() || field.GetIsFunctionOutput() ||
|
||||||
|
field.GetNullable() || field.GetDefaultValue() != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, ok := crs[field.GetFieldID()]; !ok {
|
if _, ok := crs[field.GetFieldID()]; !ok {
|
||||||
@ -285,12 +290,17 @@ func isSchemaEqual(schema *schemapb.CollectionSchema, arrSchema *arrow.Schema) e
|
|||||||
return field.Name
|
return field.Name
|
||||||
})
|
})
|
||||||
for _, field := range schema.GetFields() {
|
for _, field := range schema.GetFields() {
|
||||||
|
// ignore autoPKField and functionOutputField
|
||||||
if typeutil.IsAutoPKField(field) || field.GetIsFunctionOutput() {
|
if typeutil.IsAutoPKField(field) || field.GetIsFunctionOutput() {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
arrField, ok := arrNameToField[field.GetName()]
|
arrField, ok := arrNameToField[field.GetName()]
|
||||||
if !ok {
|
if !ok {
|
||||||
if field.GetIsDynamic() {
|
// Special fields no need to provide in data files, the parquet file doesn't contain this field, no need to compare
|
||||||
|
// 1. dynamic field(name is "$meta"), ignore
|
||||||
|
// 2. nullable field, filled with null values
|
||||||
|
// 3. default value field, filled with default value
|
||||||
|
if field.GetIsDynamic() || field.GetNullable() || field.GetDefaultValue() != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
return merr.WrapErrImportFailed(fmt.Sprintf("field '%s' not in arrow schema", field.GetName()))
|
return merr.WrapErrImportFailed(fmt.Sprintf("field '%s' not in arrow schema", field.GetName()))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user