fix: Prevent retry when importing invalid UTF-8 strings (#45067)

Convert invalid UTF-8 string the hex in failure reason.

issue: https://github.com/milvus-io/milvus/issues/45066

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
yihao.dai 2025-10-27 12:30:06 +08:00 committed by GitHub
parent 8d11373376
commit dabbae0386
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 115 additions and 1 deletions

View File

@ -18,6 +18,8 @@ package common
import (
"fmt"
"strings"
"unicode/utf8"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
@ -55,9 +57,41 @@ func EstimateReadCountPerBatch(bufferSize int, schema *schemapb.CollectionSchema
return ret, nil
}
// SafeStringForError safely converts a string for use in error messages.
// It replaces invalid UTF-8 sequences with their hex representation to avoid
// gRPC serialization errors while still providing useful debugging information.
func SafeStringForError(s string) string {
if utf8.ValidString(s) {
return s
}
var result strings.Builder
for i, r := range s {
if r == utf8.RuneError {
// Invalid UTF-8 sequence, encode as hex
result.WriteString(fmt.Sprintf("\\x%02x", s[i]))
} else {
result.WriteRune(r)
}
}
return result.String()
}
// SafeStringForErrorWithLimit safely converts a string for use in error messages
// with a length limit to prevent extremely long error messages.
func SafeStringForErrorWithLimit(s string, maxLen int) string {
safe := SafeStringForError(s)
if len(safe) <= maxLen {
return safe
}
return safe[:maxLen] + "..."
}
func CheckValidUTF8(s string, field *schemapb.FieldSchema) error {
if !typeutil.IsUTF8(s) {
return fmt.Errorf("field %s contains invalid UTF-8 data, value=%s", field.GetName(), s)
// Use safe string representation to avoid gRPC serialization errors
safeValue := SafeStringForErrorWithLimit(s, 100)
return fmt.Errorf("field '%s' contains invalid UTF-8 data, value=%s", field.GetName(), safeValue)
}
return nil
}

View File

@ -17,7 +17,9 @@
package common
import (
"strings"
"testing"
"unicode/utf8"
"github.com/stretchr/testify/assert"
@ -181,3 +183,81 @@ func TestUtil_CheckValidString(t *testing.T) {
err = CheckValidString("aaaaa", 5, fieldSchema)
assert.NoError(t, err)
}
func TestUtil_SafeStringForError(t *testing.T) {
// Test valid UTF-8 string
validStr := "Hello, 世界!"
result := SafeStringForError(validStr)
assert.Equal(t, validStr, result)
// Test invalid UTF-8 string
invalidStr := string([]byte{0xC0, 0xAF, 'a', 'b', 'c'})
result = SafeStringForError(invalidStr)
assert.Contains(t, result, "\\xc0")
assert.Contains(t, result, "\\xaf")
assert.Contains(t, result, "abc")
// Test empty string
result = SafeStringForError("")
assert.Equal(t, "", result)
// Test string with mixed valid and invalid UTF-8
mixedStr := "valid" + string([]byte{0xFF, 0xFE}) + "text"
result = SafeStringForError(mixedStr)
assert.Contains(t, result, "valid")
assert.Contains(t, result, "\\xff")
assert.Contains(t, result, "\\xfe")
assert.Contains(t, result, "text")
}
func TestUtil_SafeStringForErrorWithLimit(t *testing.T) {
// Test string within limit
shortStr := "short"
result := SafeStringForErrorWithLimit(shortStr, 10)
assert.Equal(t, shortStr, result)
// Test string exceeding limit
longStr := "this is a very long string that exceeds the limit"
result = SafeStringForErrorWithLimit(longStr, 20)
assert.Equal(t, 23, len(result)) // 20 chars + "..."
assert.True(t, strings.HasSuffix(result, "..."))
// Test invalid UTF-8 string with limit
invalidStr := string([]byte{0xC0, 0xAF, 0xFF, 0xFE, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'})
result = SafeStringForErrorWithLimit(invalidStr, 15)
assert.True(t, len(result) <= 18) // 15 chars + "..."
assert.True(t, strings.HasSuffix(result, "..."))
}
func TestUtil_CheckValidUTF8_WithSafeError(t *testing.T) {
fieldSchema := &schemapb.FieldSchema{
FieldID: 1,
Name: "test_field",
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: common.MaxLengthKey,
Value: "1000",
},
},
}
// Test with invalid UTF-8 - should not cause gRPC serialization error
invalidStr := string([]byte{0xC0, 0xAF, 0xFF, 0xFE})
err := CheckValidUTF8(invalidStr, fieldSchema)
assert.Error(t, err)
// Verify the error message contains safe representation
errMsg := err.Error()
assert.Contains(t, errMsg, "test_field")
assert.Contains(t, errMsg, "invalid UTF-8 data")
assert.Contains(t, errMsg, "\\xc0") // Should contain hex representation
assert.Contains(t, errMsg, "\\xaf")
// Verify the error message is valid UTF-8 itself
assert.True(t, utf8.ValidString(errMsg), "Error message should be valid UTF-8")
// Test with valid UTF-8
err = CheckValidUTF8("valid string", fieldSchema)
assert.NoError(t, err)
}