mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
fix: Prevent retry when importing invalid UTF-8 strings (#45067)
Convert invalid UTF-8 string the hex in failure reason. issue: https://github.com/milvus-io/milvus/issues/45066 Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
parent
8d11373376
commit
dabbae0386
@ -18,6 +18,8 @@ package common
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
||||
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
|
||||
@ -55,9 +57,41 @@ func EstimateReadCountPerBatch(bufferSize int, schema *schemapb.CollectionSchema
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
// SafeStringForError safely converts a string for use in error messages.
|
||||
// It replaces invalid UTF-8 sequences with their hex representation to avoid
|
||||
// gRPC serialization errors while still providing useful debugging information.
|
||||
func SafeStringForError(s string) string {
|
||||
if utf8.ValidString(s) {
|
||||
return s
|
||||
}
|
||||
|
||||
var result strings.Builder
|
||||
for i, r := range s {
|
||||
if r == utf8.RuneError {
|
||||
// Invalid UTF-8 sequence, encode as hex
|
||||
result.WriteString(fmt.Sprintf("\\x%02x", s[i]))
|
||||
} else {
|
||||
result.WriteRune(r)
|
||||
}
|
||||
}
|
||||
return result.String()
|
||||
}
|
||||
|
||||
// SafeStringForErrorWithLimit safely converts a string for use in error messages
|
||||
// with a length limit to prevent extremely long error messages.
|
||||
func SafeStringForErrorWithLimit(s string, maxLen int) string {
|
||||
safe := SafeStringForError(s)
|
||||
if len(safe) <= maxLen {
|
||||
return safe
|
||||
}
|
||||
return safe[:maxLen] + "..."
|
||||
}
|
||||
|
||||
func CheckValidUTF8(s string, field *schemapb.FieldSchema) error {
|
||||
if !typeutil.IsUTF8(s) {
|
||||
return fmt.Errorf("field %s contains invalid UTF-8 data, value=%s", field.GetName(), s)
|
||||
// Use safe string representation to avoid gRPC serialization errors
|
||||
safeValue := SafeStringForErrorWithLimit(s, 100)
|
||||
return fmt.Errorf("field '%s' contains invalid UTF-8 data, value=%s", field.GetName(), safeValue)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -17,7 +17,9 @@
|
||||
package common
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
@ -181,3 +183,81 @@ func TestUtil_CheckValidString(t *testing.T) {
|
||||
err = CheckValidString("aaaaa", 5, fieldSchema)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
|
||||
func TestUtil_SafeStringForError(t *testing.T) {
|
||||
// Test valid UTF-8 string
|
||||
validStr := "Hello, 世界!"
|
||||
result := SafeStringForError(validStr)
|
||||
assert.Equal(t, validStr, result)
|
||||
|
||||
// Test invalid UTF-8 string
|
||||
invalidStr := string([]byte{0xC0, 0xAF, 'a', 'b', 'c'})
|
||||
result = SafeStringForError(invalidStr)
|
||||
assert.Contains(t, result, "\\xc0")
|
||||
assert.Contains(t, result, "\\xaf")
|
||||
assert.Contains(t, result, "abc")
|
||||
|
||||
// Test empty string
|
||||
result = SafeStringForError("")
|
||||
assert.Equal(t, "", result)
|
||||
|
||||
// Test string with mixed valid and invalid UTF-8
|
||||
mixedStr := "valid" + string([]byte{0xFF, 0xFE}) + "text"
|
||||
result = SafeStringForError(mixedStr)
|
||||
assert.Contains(t, result, "valid")
|
||||
assert.Contains(t, result, "\\xff")
|
||||
assert.Contains(t, result, "\\xfe")
|
||||
assert.Contains(t, result, "text")
|
||||
}
|
||||
|
||||
func TestUtil_SafeStringForErrorWithLimit(t *testing.T) {
|
||||
// Test string within limit
|
||||
shortStr := "short"
|
||||
result := SafeStringForErrorWithLimit(shortStr, 10)
|
||||
assert.Equal(t, shortStr, result)
|
||||
|
||||
// Test string exceeding limit
|
||||
longStr := "this is a very long string that exceeds the limit"
|
||||
result = SafeStringForErrorWithLimit(longStr, 20)
|
||||
assert.Equal(t, 23, len(result)) // 20 chars + "..."
|
||||
assert.True(t, strings.HasSuffix(result, "..."))
|
||||
|
||||
// Test invalid UTF-8 string with limit
|
||||
invalidStr := string([]byte{0xC0, 0xAF, 0xFF, 0xFE, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'})
|
||||
result = SafeStringForErrorWithLimit(invalidStr, 15)
|
||||
assert.True(t, len(result) <= 18) // 15 chars + "..."
|
||||
assert.True(t, strings.HasSuffix(result, "..."))
|
||||
}
|
||||
|
||||
func TestUtil_CheckValidUTF8_WithSafeError(t *testing.T) {
|
||||
fieldSchema := &schemapb.FieldSchema{
|
||||
FieldID: 1,
|
||||
Name: "test_field",
|
||||
DataType: schemapb.DataType_VarChar,
|
||||
TypeParams: []*commonpb.KeyValuePair{
|
||||
{
|
||||
Key: common.MaxLengthKey,
|
||||
Value: "1000",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// Test with invalid UTF-8 - should not cause gRPC serialization error
|
||||
invalidStr := string([]byte{0xC0, 0xAF, 0xFF, 0xFE})
|
||||
err := CheckValidUTF8(invalidStr, fieldSchema)
|
||||
assert.Error(t, err)
|
||||
|
||||
// Verify the error message contains safe representation
|
||||
errMsg := err.Error()
|
||||
assert.Contains(t, errMsg, "test_field")
|
||||
assert.Contains(t, errMsg, "invalid UTF-8 data")
|
||||
assert.Contains(t, errMsg, "\\xc0") // Should contain hex representation
|
||||
assert.Contains(t, errMsg, "\\xaf")
|
||||
|
||||
// Verify the error message is valid UTF-8 itself
|
||||
assert.True(t, utf8.ValidString(errMsg), "Error message should be valid UTF-8")
|
||||
|
||||
// Test with valid UTF-8
|
||||
err = CheckValidUTF8("valid string", fieldSchema)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user