From dabbae0386a1434c895ebe27b4b2385fe0dec982 Mon Sep 17 00:00:00 2001 From: "yihao.dai" Date: Mon, 27 Oct 2025 12:30:06 +0800 Subject: [PATCH] fix: Prevent retry when importing invalid UTF-8 strings (#45067) Convert invalid UTF-8 string the hex in failure reason. issue: https://github.com/milvus-io/milvus/issues/45066 Signed-off-by: bigsheeper --- internal/util/importutilv2/common/util.go | 36 ++++++++- .../util/importutilv2/common/util_test.go | 80 +++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) diff --git a/internal/util/importutilv2/common/util.go b/internal/util/importutilv2/common/util.go index a29a4f372b..5edac6aa2c 100644 --- a/internal/util/importutilv2/common/util.go +++ b/internal/util/importutilv2/common/util.go @@ -18,6 +18,8 @@ package common import ( "fmt" + "strings" + "unicode/utf8" "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus/pkg/v2/util/typeutil" @@ -55,9 +57,41 @@ func EstimateReadCountPerBatch(bufferSize int, schema *schemapb.CollectionSchema return ret, nil } +// SafeStringForError safely converts a string for use in error messages. +// It replaces invalid UTF-8 sequences with their hex representation to avoid +// gRPC serialization errors while still providing useful debugging information. +func SafeStringForError(s string) string { + if utf8.ValidString(s) { + return s + } + + var result strings.Builder + for i, r := range s { + if r == utf8.RuneError { + // Invalid UTF-8 sequence, encode as hex + result.WriteString(fmt.Sprintf("\\x%02x", s[i])) + } else { + result.WriteRune(r) + } + } + return result.String() +} + +// SafeStringForErrorWithLimit safely converts a string for use in error messages +// with a length limit to prevent extremely long error messages. +func SafeStringForErrorWithLimit(s string, maxLen int) string { + safe := SafeStringForError(s) + if len(safe) <= maxLen { + return safe + } + return safe[:maxLen] + "..." +} + func CheckValidUTF8(s string, field *schemapb.FieldSchema) error { if !typeutil.IsUTF8(s) { - return fmt.Errorf("field %s contains invalid UTF-8 data, value=%s", field.GetName(), s) + // Use safe string representation to avoid gRPC serialization errors + safeValue := SafeStringForErrorWithLimit(s, 100) + return fmt.Errorf("field '%s' contains invalid UTF-8 data, value=%s", field.GetName(), safeValue) } return nil } diff --git a/internal/util/importutilv2/common/util_test.go b/internal/util/importutilv2/common/util_test.go index c9febeb113..53dfc278f4 100644 --- a/internal/util/importutilv2/common/util_test.go +++ b/internal/util/importutilv2/common/util_test.go @@ -17,7 +17,9 @@ package common import ( + "strings" "testing" + "unicode/utf8" "github.com/stretchr/testify/assert" @@ -181,3 +183,81 @@ func TestUtil_CheckValidString(t *testing.T) { err = CheckValidString("aaaaa", 5, fieldSchema) assert.NoError(t, err) } + +func TestUtil_SafeStringForError(t *testing.T) { + // Test valid UTF-8 string + validStr := "Hello, 世界!" + result := SafeStringForError(validStr) + assert.Equal(t, validStr, result) + + // Test invalid UTF-8 string + invalidStr := string([]byte{0xC0, 0xAF, 'a', 'b', 'c'}) + result = SafeStringForError(invalidStr) + assert.Contains(t, result, "\\xc0") + assert.Contains(t, result, "\\xaf") + assert.Contains(t, result, "abc") + + // Test empty string + result = SafeStringForError("") + assert.Equal(t, "", result) + + // Test string with mixed valid and invalid UTF-8 + mixedStr := "valid" + string([]byte{0xFF, 0xFE}) + "text" + result = SafeStringForError(mixedStr) + assert.Contains(t, result, "valid") + assert.Contains(t, result, "\\xff") + assert.Contains(t, result, "\\xfe") + assert.Contains(t, result, "text") +} + +func TestUtil_SafeStringForErrorWithLimit(t *testing.T) { + // Test string within limit + shortStr := "short" + result := SafeStringForErrorWithLimit(shortStr, 10) + assert.Equal(t, shortStr, result) + + // Test string exceeding limit + longStr := "this is a very long string that exceeds the limit" + result = SafeStringForErrorWithLimit(longStr, 20) + assert.Equal(t, 23, len(result)) // 20 chars + "..." + assert.True(t, strings.HasSuffix(result, "...")) + + // Test invalid UTF-8 string with limit + invalidStr := string([]byte{0xC0, 0xAF, 0xFF, 0xFE, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}) + result = SafeStringForErrorWithLimit(invalidStr, 15) + assert.True(t, len(result) <= 18) // 15 chars + "..." + assert.True(t, strings.HasSuffix(result, "...")) +} + +func TestUtil_CheckValidUTF8_WithSafeError(t *testing.T) { + fieldSchema := &schemapb.FieldSchema{ + FieldID: 1, + Name: "test_field", + DataType: schemapb.DataType_VarChar, + TypeParams: []*commonpb.KeyValuePair{ + { + Key: common.MaxLengthKey, + Value: "1000", + }, + }, + } + + // Test with invalid UTF-8 - should not cause gRPC serialization error + invalidStr := string([]byte{0xC0, 0xAF, 0xFF, 0xFE}) + err := CheckValidUTF8(invalidStr, fieldSchema) + assert.Error(t, err) + + // Verify the error message contains safe representation + errMsg := err.Error() + assert.Contains(t, errMsg, "test_field") + assert.Contains(t, errMsg, "invalid UTF-8 data") + assert.Contains(t, errMsg, "\\xc0") // Should contain hex representation + assert.Contains(t, errMsg, "\\xaf") + + // Verify the error message is valid UTF-8 itself + assert.True(t, utf8.ValidString(errMsg), "Error message should be valid UTF-8") + + // Test with valid UTF-8 + err = CheckValidUTF8("valid string", fieldSchema) + assert.NoError(t, err) +}