yihao.dai dabbae0386
fix: Prevent retry when importing invalid UTF-8 strings (#45067)
Convert invalid UTF-8 string the hex in failure reason.

issue: https://github.com/milvus-io/milvus/issues/45066

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
2025-10-27 12:30:06 +08:00

264 lines
7.2 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package common
import (
"strings"
"testing"
"unicode/utf8"
"github.com/stretchr/testify/assert"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/v2/common"
)
func TestUtil_EstimateReadCountPerBatch(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{
FieldID: 100,
Name: "pk",
IsPrimaryKey: true,
DataType: schemapb.DataType_Int64,
},
{
FieldID: 101,
Name: "vec",
DataType: schemapb.DataType_FloatVector,
TypeParams: []*commonpb.KeyValuePair{
{
Key: common.DimKey,
Value: "128",
},
},
},
},
}
count, err := EstimateReadCountPerBatch(16*1024*1024, schema)
assert.NoError(t, err)
assert.Equal(t, int64(1000), count)
schema.Fields = append(schema.Fields, &schemapb.FieldSchema{
FieldID: 102,
Name: "vec2",
DataType: schemapb.DataType_FloatVector,
TypeParams: []*commonpb.KeyValuePair{
{
Key: common.DimKey,
Value: "invalidDim",
},
},
})
_, err = EstimateReadCountPerBatch(16*1024*1024, schema)
assert.Error(t, err)
}
func TestUtil_EstimateReadCountPerBatch_InvalidBufferSize(t *testing.T) {
schema := &schemapb.CollectionSchema{}
count, err := EstimateReadCountPerBatch(16*1024*1024, schema)
assert.Error(t, err)
assert.Equal(t, int64(0), count)
t.Logf("err=%v", err)
schema = &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{
FieldID: 100,
DataType: schemapb.DataType_Int64,
},
},
}
count, err = EstimateReadCountPerBatch(0, schema)
assert.Error(t, err)
assert.Equal(t, int64(0), count)
t.Logf("err=%v", err)
}
func TestUtil_EstimateReadCountPerBatch_LargeSchema(t *testing.T) {
schema := &schemapb.CollectionSchema{}
for i := 0; i < 100; i++ {
schema.Fields = append(schema.Fields, &schemapb.FieldSchema{
FieldID: int64(i),
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: common.MaxLengthKey,
Value: "10000000",
},
},
})
}
count, err := EstimateReadCountPerBatch(16*1024*1024, schema)
assert.NoError(t, err)
assert.Equal(t, int64(1), count)
}
func TestUtil_CheckVarcharLength(t *testing.T) {
fieldSchema := &schemapb.FieldSchema{
FieldID: 1,
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: common.MaxLengthKey,
Value: "5",
},
},
}
err := CheckVarcharLength("aaaaaaaa", 5, fieldSchema)
assert.Error(t, err)
err = CheckVarcharLength("aaaaa", 5, fieldSchema)
assert.NoError(t, err)
}
func TestUtil_CheckArrayCapacity(t *testing.T) {
fieldSchema := &schemapb.FieldSchema{
FieldID: 1,
DataType: schemapb.DataType_Array,
TypeParams: []*commonpb.KeyValuePair{
{
Key: common.MaxCapacityKey,
Value: "5",
},
},
}
err := CheckArrayCapacity(6, 5, fieldSchema)
assert.Error(t, err)
err = CheckArrayCapacity(5, 5, fieldSchema)
assert.NoError(t, err)
}
func TestUtil_CheckValidUTF8(t *testing.T) {
fieldSchema := &schemapb.FieldSchema{
FieldID: 1,
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: common.MaxLengthKey,
Value: "1000",
},
},
}
err := CheckValidUTF8(string([]byte{0xC0, 0xAF}), fieldSchema)
assert.Error(t, err)
err = CheckValidUTF8("abc", fieldSchema)
assert.NoError(t, err)
}
func TestUtil_CheckValidString(t *testing.T) {
fieldSchema := &schemapb.FieldSchema{
FieldID: 1,
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: common.MaxLengthKey,
Value: "5",
},
},
}
err := CheckValidString("aaaaaaaa", 5, fieldSchema)
assert.Error(t, err)
err = CheckValidString(string([]byte{0xC0, 0xAF}), 5, fieldSchema)
assert.Error(t, err)
err = CheckValidString("aaaaa", 5, fieldSchema)
assert.NoError(t, err)
}
func TestUtil_SafeStringForError(t *testing.T) {
// Test valid UTF-8 string
validStr := "Hello, 世界!"
result := SafeStringForError(validStr)
assert.Equal(t, validStr, result)
// Test invalid UTF-8 string
invalidStr := string([]byte{0xC0, 0xAF, 'a', 'b', 'c'})
result = SafeStringForError(invalidStr)
assert.Contains(t, result, "\\xc0")
assert.Contains(t, result, "\\xaf")
assert.Contains(t, result, "abc")
// Test empty string
result = SafeStringForError("")
assert.Equal(t, "", result)
// Test string with mixed valid and invalid UTF-8
mixedStr := "valid" + string([]byte{0xFF, 0xFE}) + "text"
result = SafeStringForError(mixedStr)
assert.Contains(t, result, "valid")
assert.Contains(t, result, "\\xff")
assert.Contains(t, result, "\\xfe")
assert.Contains(t, result, "text")
}
func TestUtil_SafeStringForErrorWithLimit(t *testing.T) {
// Test string within limit
shortStr := "short"
result := SafeStringForErrorWithLimit(shortStr, 10)
assert.Equal(t, shortStr, result)
// Test string exceeding limit
longStr := "this is a very long string that exceeds the limit"
result = SafeStringForErrorWithLimit(longStr, 20)
assert.Equal(t, 23, len(result)) // 20 chars + "..."
assert.True(t, strings.HasSuffix(result, "..."))
// Test invalid UTF-8 string with limit
invalidStr := string([]byte{0xC0, 0xAF, 0xFF, 0xFE, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'})
result = SafeStringForErrorWithLimit(invalidStr, 15)
assert.True(t, len(result) <= 18) // 15 chars + "..."
assert.True(t, strings.HasSuffix(result, "..."))
}
func TestUtil_CheckValidUTF8_WithSafeError(t *testing.T) {
fieldSchema := &schemapb.FieldSchema{
FieldID: 1,
Name: "test_field",
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: common.MaxLengthKey,
Value: "1000",
},
},
}
// Test with invalid UTF-8 - should not cause gRPC serialization error
invalidStr := string([]byte{0xC0, 0xAF, 0xFF, 0xFE})
err := CheckValidUTF8(invalidStr, fieldSchema)
assert.Error(t, err)
// Verify the error message contains safe representation
errMsg := err.Error()
assert.Contains(t, errMsg, "test_field")
assert.Contains(t, errMsg, "invalid UTF-8 data")
assert.Contains(t, errMsg, "\\xc0") // Should contain hex representation
assert.Contains(t, errMsg, "\\xaf")
// Verify the error message is valid UTF-8 itself
assert.True(t, utf8.ValidString(errMsg), "Error message should be valid UTF-8")
// Test with valid UTF-8
err = CheckValidUTF8("valid string", fieldSchema)
assert.NoError(t, err)
}