mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 17:48:29 +08:00
fix: Prevent retry when importing invalid UTF-8 strings (#45067)
Convert invalid UTF-8 strings to hex in the failure reason. issue: https://github.com/milvus-io/milvus/issues/45066 Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
parent
8d11373376
commit
dabbae0386
@ -18,6 +18,8 @@ package common
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
||||||
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
|
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
|
||||||
@ -55,9 +57,41 @@ func EstimateReadCountPerBatch(bufferSize int, schema *schemapb.CollectionSchema
|
|||||||
return ret, nil
|
return ret, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SafeStringForError returns a copy of s that is guaranteed to be valid UTF-8
// and is therefore safe to embed in error messages.
//
// A string that is already valid UTF-8 is returned unchanged. Otherwise every
// byte that does not belong to a well-formed UTF-8 sequence is replaced by its
// lowercase hex escape (for example `\xff`), and every well-formed rune is
// copied through as-is.
func SafeStringForError(s string) string {
	if utf8.ValidString(s) {
		return s
	}

	var result strings.Builder
	result.Grow(len(s))
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		// A decoding failure is reported as (RuneError, 1). A correctly
		// encoded U+FFFD also decodes to RuneError but with size 3, and must
		// be kept as a regular rune rather than being hex-escaped (which
		// would also drop its trailing bytes).
		if r == utf8.RuneError && size == 1 {
			fmt.Fprintf(&result, "\\x%02x", s[i])
		} else {
			result.WriteRune(r)
		}
		i += size
	}
	return result.String()
}
|
||||||
|
|
||||||
|
// SafeStringForErrorWithLimit safely converts a string for use in error messages
|
||||||
|
// with a length limit to prevent extremely long error messages.
|
||||||
|
func SafeStringForErrorWithLimit(s string, maxLen int) string {
|
||||||
|
safe := SafeStringForError(s)
|
||||||
|
if len(safe) <= maxLen {
|
||||||
|
return safe
|
||||||
|
}
|
||||||
|
return safe[:maxLen] + "..."
|
||||||
|
}
|
||||||
|
|
||||||
func CheckValidUTF8(s string, field *schemapb.FieldSchema) error {
|
func CheckValidUTF8(s string, field *schemapb.FieldSchema) error {
|
||||||
if !typeutil.IsUTF8(s) {
|
if !typeutil.IsUTF8(s) {
|
||||||
return fmt.Errorf("field %s contains invalid UTF-8 data, value=%s", field.GetName(), s)
|
// Use safe string representation to avoid gRPC serialization errors
|
||||||
|
safeValue := SafeStringForErrorWithLimit(s, 100)
|
||||||
|
return fmt.Errorf("field '%s' contains invalid UTF-8 data, value=%s", field.GetName(), safeValue)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@ -17,7 +17,9 @@
|
|||||||
package common
|
package common
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
|
||||||
@ -181,3 +183,81 @@ func TestUtil_CheckValidString(t *testing.T) {
|
|||||||
err = CheckValidString("aaaaa", 5, fieldSchema)
|
err = CheckValidString("aaaaa", 5, fieldSchema)
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestUtil_SafeStringForError(t *testing.T) {
|
||||||
|
// Test valid UTF-8 string
|
||||||
|
validStr := "Hello, 世界!"
|
||||||
|
result := SafeStringForError(validStr)
|
||||||
|
assert.Equal(t, validStr, result)
|
||||||
|
|
||||||
|
// Test invalid UTF-8 string
|
||||||
|
invalidStr := string([]byte{0xC0, 0xAF, 'a', 'b', 'c'})
|
||||||
|
result = SafeStringForError(invalidStr)
|
||||||
|
assert.Contains(t, result, "\\xc0")
|
||||||
|
assert.Contains(t, result, "\\xaf")
|
||||||
|
assert.Contains(t, result, "abc")
|
||||||
|
|
||||||
|
// Test empty string
|
||||||
|
result = SafeStringForError("")
|
||||||
|
assert.Equal(t, "", result)
|
||||||
|
|
||||||
|
// Test string with mixed valid and invalid UTF-8
|
||||||
|
mixedStr := "valid" + string([]byte{0xFF, 0xFE}) + "text"
|
||||||
|
result = SafeStringForError(mixedStr)
|
||||||
|
assert.Contains(t, result, "valid")
|
||||||
|
assert.Contains(t, result, "\\xff")
|
||||||
|
assert.Contains(t, result, "\\xfe")
|
||||||
|
assert.Contains(t, result, "text")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUtil_SafeStringForErrorWithLimit(t *testing.T) {
|
||||||
|
// Test string within limit
|
||||||
|
shortStr := "short"
|
||||||
|
result := SafeStringForErrorWithLimit(shortStr, 10)
|
||||||
|
assert.Equal(t, shortStr, result)
|
||||||
|
|
||||||
|
// Test string exceeding limit
|
||||||
|
longStr := "this is a very long string that exceeds the limit"
|
||||||
|
result = SafeStringForErrorWithLimit(longStr, 20)
|
||||||
|
assert.Equal(t, 23, len(result)) // 20 chars + "..."
|
||||||
|
assert.True(t, strings.HasSuffix(result, "..."))
|
||||||
|
|
||||||
|
// Test invalid UTF-8 string with limit
|
||||||
|
invalidStr := string([]byte{0xC0, 0xAF, 0xFF, 0xFE, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'})
|
||||||
|
result = SafeStringForErrorWithLimit(invalidStr, 15)
|
||||||
|
assert.True(t, len(result) <= 18) // 15 chars + "..."
|
||||||
|
assert.True(t, strings.HasSuffix(result, "..."))
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUtil_CheckValidUTF8_WithSafeError(t *testing.T) {
|
||||||
|
fieldSchema := &schemapb.FieldSchema{
|
||||||
|
FieldID: 1,
|
||||||
|
Name: "test_field",
|
||||||
|
DataType: schemapb.DataType_VarChar,
|
||||||
|
TypeParams: []*commonpb.KeyValuePair{
|
||||||
|
{
|
||||||
|
Key: common.MaxLengthKey,
|
||||||
|
Value: "1000",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test with invalid UTF-8 - should not cause gRPC serialization error
|
||||||
|
invalidStr := string([]byte{0xC0, 0xAF, 0xFF, 0xFE})
|
||||||
|
err := CheckValidUTF8(invalidStr, fieldSchema)
|
||||||
|
assert.Error(t, err)
|
||||||
|
|
||||||
|
// Verify the error message contains safe representation
|
||||||
|
errMsg := err.Error()
|
||||||
|
assert.Contains(t, errMsg, "test_field")
|
||||||
|
assert.Contains(t, errMsg, "invalid UTF-8 data")
|
||||||
|
assert.Contains(t, errMsg, "\\xc0") // Should contain hex representation
|
||||||
|
assert.Contains(t, errMsg, "\\xaf")
|
||||||
|
|
||||||
|
// Verify the error message is valid UTF-8 itself
|
||||||
|
assert.True(t, utf8.ValidString(errMsg), "Error message should be valid UTF-8")
|
||||||
|
|
||||||
|
// Test with valid UTF-8
|
||||||
|
err = CheckValidUTF8("valid string", fieldSchema)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user