milvus/pkg/util/timestamptz/timestamptz.go
zhenshan.cao a3b8bcb198
fix: correct default value backfill during AddField (#45634)
issue: https://github.com/milvus-io/milvus/issues/44585

Signed-off-by: zhenshan.cao <zhenshan.cao@zilliz.com>
2025-11-18 23:05:42 +08:00

386 lines
14 KiB
Go

package timestamptz
import (
"bytes"
"fmt"
"strings"
"time"
"github.com/cockroachdb/errors"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/util/funcutil"
)
// Bounds, in seconds, on the UTC offset accepted for TZ-aware timestamps.
// They are exported so external callers can apply the same range check.
const (
	MaxOffsetSeconds = 14 * 3600  // +14:00
	MinOffsetSeconds = -12 * 3600 // -12:00
)

// NaiveTzLayouts is the list of accepted timestamp layouts that carry no
// timezone information. ParseTimeTz tries them in order.
var NaiveTzLayouts = []string{
	"2006-01-02T15:04:05.999999999",
	"2006-01-02T15:04:05",
	"2006-01-02 15:04:05.999999999",
	"2006-01-02 15:04:05",
}

// formatUTCOffset renders an offset expressed in seconds as a signed "±hh:mm" string.
func formatUTCOffset(offsetSeconds int) string {
	sign := '+'
	if offsetSeconds < 0 {
		sign = '-'
		offsetSeconds = -offsetSeconds
	}
	return fmt.Sprintf("%c%02d:%02d", sign, offsetSeconds/3600, offsetSeconds%3600/60)
}

// ParseTimeTz parses inputStr as either a TZ-aware or a naive timestamp.
//
// The input is first parsed as RFC3339Nano. If that succeeds, the explicit UTC
// offset must lie within [MinOffsetSeconds, MaxOffsetSeconds]; otherwise an
// error naming the offending offset is returned.
//
// If the input is not RFC3339Nano, defaultTimezoneStr is loaded with
// time.LoadLocation and the input is matched against NaiveTzLayouts, with the
// result interpreted in that location.
//
// Returns:
//
//	the parsed time, or a zero time.Time and an error when the offset is out
//	of range, the default timezone cannot be loaded, or no layout matches.
func ParseTimeTz(inputStr string, defaultTimezoneStr string) (time.Time, error) {
	// 1. Primary parsing: a TZ-aware string (RFC3339Nano).
	if t, err := time.Parse(time.RFC3339Nano, inputStr); err == nil {
		_, offsetSeconds := t.Zone()
		if offsetSeconds > MaxOffsetSeconds || offsetSeconds < MinOffsetSeconds {
			// Report the complete offset, minutes included. Truncating to whole
			// hours would describe "+14:30" as hour 14, which looks in range.
			return time.Time{}, fmt.Errorf("UTC offset %s is out of the valid range [%s, %s]",
				formatUTCOffset(offsetSeconds), formatUTCOffset(MinOffsetSeconds), formatUTCOffset(MaxOffsetSeconds))
		}
		return t, nil
	}

	// The default timezone is only needed (and only validated) for naive input.
	loc, err := time.LoadLocation(defaultTimezoneStr)
	if err != nil {
		return time.Time{}, fmt.Errorf("invalid default timezone string '%s': %w", defaultTimezoneStr, err)
	}

	// 2. Fallback parsing: a naive string, interpreted in the default location.
	for _, layout := range NaiveTzLayouts {
		if t, parseErr := time.ParseInLocation(layout, inputStr, loc); parseErr == nil {
			// No offset range check here: the offset comes from loc, which
			// time.LoadLocation has already accepted.
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("invalid timestamp string: '%s'. Does not match any known format", inputStr)
}
// ValidateTimestampTz reports whether inputStr is an acceptable timestamp,
// either TZ-aware or naive combined with defaultTimezoneStr. It returns nil on
// success and the error produced by ParseTimeTz otherwise.
func ValidateTimestampTz(inputStr string, defaultTimezoneStr string) error {
	if _, parseErr := ParseTimeTz(inputStr, defaultTimezoneStr); parseErr != nil {
		return parseErr
	}
	return nil
}
// ValidateAndNormalizeTimestampTz parses inputStr with ParseTimeTz and renders
// the result as an RFC3339Nano string, so the returned text always carries an
// explicit timezone designator. On a parse failure it returns an empty string
// together with the parse error.
func ValidateAndNormalizeTimestampTz(inputStr string, defaultTimezoneStr string) (string, error) {
	parsed, parseErr := ParseTimeTz(inputStr, defaultTimezoneStr)
	if parseErr != nil {
		return "", parseErr
	}
	normalized := parsed.Format(time.RFC3339Nano)
	return normalized, nil
}
// ValidateAndReturnUnixMicroTz parses inputStr with ParseTimeTz and returns the
// instant as microseconds elapsed since 1970-01-01T00:00:00Z. On a parse
// failure it returns 0 together with the parse error.
func ValidateAndReturnUnixMicroTz(inputStr string, defaultTimezoneStr string) (int64, error) {
	parsed, parseErr := ParseTimeTz(inputStr, defaultTimezoneStr)
	if parseErr != nil {
		return 0, parseErr
	}
	micros := parsed.UnixMicro()
	return micros, nil
}
// CompareUnixMicroTz reports whether ts1 and ts2 denote the same instant at
// microsecond precision. Both strings are parsed with ParseTimeTz, using
// defaultTimezoneStr for naive input, and compared by their Unix microsecond
// values, so differing offsets that describe the same moment compare equal.
//
// If either string fails to parse, it returns false and an error identifying
// which timestamp was invalid. ts2 is not parsed when ts1 is invalid.
func CompareUnixMicroTz(ts1 string, ts2 string, defaultTimezoneStr string) (bool, error) {
	first, firstErr := ParseTimeTz(ts1, defaultTimezoneStr)
	if firstErr != nil {
		return false, fmt.Errorf("error parsing first timestamp '%s': %w", ts1, firstErr)
	}

	second, secondErr := ParseTimeTz(ts2, defaultTimezoneStr)
	if secondErr != nil {
		return false, fmt.Errorf("error parsing second timestamp '%s': %w", ts2, secondErr)
	}

	// Epoch-based values are independent of each time's location.
	firstMicro, secondMicro := first.UnixMicro(), second.UnixMicro()
	return firstMicro == secondMicro, nil
}
// ConvertUnixMicroToTimezoneString renders ts, a count of microseconds since
// the Unix epoch, as an RFC3339Nano string expressed in the timezone named by
// targetTimezoneStr.
//
// It returns an empty string and an error when targetTimezoneStr cannot be
// loaded by time.LoadLocation.
func ConvertUnixMicroToTimezoneString(ts int64, targetTimezoneStr string) (string, error) {
	target, loadErr := time.LoadLocation(targetTimezoneStr)
	if loadErr != nil {
		return "", fmt.Errorf("invalid target timezone string '%s': %w", targetTimezoneStr, loadErr)
	}

	// Build the instant from the epoch value, view it in the target location,
	// then format it with its offset.
	return time.UnixMicro(ts).In(target).Format(time.RFC3339Nano), nil
}
// FormatTimeMicroWithoutTrailingZeros formats t as an RFC3339 timestamp with at
// most microsecond precision and no trailing zeros in the fractional seconds.
// When the fraction is zero, the '.' separator is dropped as well.
//
// Example 1: 10:30:00.123456789 UTC -> 2025-03-20T10:30:00.123456Z
// Example 2: 10:30:00.123000000 UTC -> 2025-03-20T10:30:00.123Z
// Example 3: 10:30:00.000000000 UTC -> 2025-03-20T10:30:00Z
func FormatTimeMicroWithoutTrailingZeros(t time.Time) string {
	// Drop anything finer than a microsecond before formatting.
	formatted := t.Truncate(time.Microsecond).Format(time.RFC3339Nano)

	// Locate the fraction: it sits between the last '.' and the timezone
	// designator, which is the last 'Z', '+' or '-' in the string.
	dot := strings.LastIndexByte(formatted, '.')
	zone := strings.LastIndexAny(formatted, "Z+-")
	if dot < 0 || zone < 0 {
		// No fractional part (or an unexpected shape): nothing to trim.
		return formatted
	}

	head, tail := formatted[:dot], formatted[zone:]

	// NOTE(review): the RFC3339Nano layout is expected to omit trailing zeros
	// already, so this trim looks purely defensive; confirm before relying on it.
	digits := bytes.TrimRight([]byte(formatted[dot+1:zone]), "0")
	if len(digits) == 0 {
		// The fraction was all zeros: drop the '.' too.
		return head + tail
	}
	return head + "." + string(digits) + tail
}
// IsTimezoneValid reports whether tz names a timezone that time.LoadLocation
// can resolve (e.g. "Asia/Shanghai" or "UTC"). The empty string is always
// rejected, without consulting time.LoadLocation.
func IsTimezoneValid(tz string) bool {
	if len(tz) == 0 {
		return false
	}
	if _, loadErr := time.LoadLocation(tz); loadErr != nil {
		return false
	}
	return true
}
// CheckAndRewriteTimestampTzDefaultValue validates the string default values of
// all TIMESTAMPTZ fields in schema and rewrites them, in place, to their
// Unix-microsecond representation.
//
// The collection timezone is read from the schema properties under
// common.TimezoneKey, falling back to common.DefaultTimezone when absent. It is
// passed to ValidateAndReturnUnixMicroTz as the default timezone.
//
// For every field of type DataType_Timestamptz that has a non-empty string
// default value, the string is converted with ValidateAndReturnUnixMicroTz and
// the ValueField's Data is replaced by a ValueField_TimestamptzData holding the
// resulting int64. Fields of other types, fields without a default value, and
// fields whose string default is empty are left untouched.
//
// The first validation error is returned immediately; fields processed before
// it keep their rewritten values.
func CheckAndRewriteTimestampTzDefaultValue(schema *schemapb.CollectionSchema) error {
	collectionTz, found := funcutil.TryGetAttrByKeyFromRepeatedKV(common.TimezoneKey, schema.GetProperties())
	if !found {
		collectionTz = common.DefaultTimezone
	}

	for _, field := range schema.GetFields() {
		if field.GetDataType() != schemapb.DataType_Timestamptz {
			continue
		}

		dv := field.GetDefaultValue()
		if dv == nil {
			continue
		}

		// The user-supplied default is expected to arrive as string data.
		raw := dv.GetStringData()
		if raw == "" {
			continue
		}

		micros, convErr := ValidateAndReturnUnixMicroTz(raw, collectionTz)
		if convErr != nil {
			return convErr
		}

		// Assigning the oneof replaces the previous string representation.
		dv.Data = &schemapb.ValueField_TimestamptzData{TimestamptzData: micros}
	}
	return nil
}
// CheckAndRewriteTimestampTzDefaultValueForFieldSchema validates the string
// default value of a single TIMESTAMPTZ field and rewrites it, in place, to its
// Unix-microsecond (int64) representation.
//
// The function is a no-op (returns nil) when the field is not of type
// DataType_Timestamptz, has no default value, or its default value carries no
// non-empty string data.
//
// Parameters:
//
//	fieldSchema: The specific FieldSchema object to be processed.
//	collectionTimezone: The collection-level default timezone string (e.g., "UTC", "Asia/Shanghai")
//	                    used to parse timestamps without an explicit offset.
//
// Returns:
//
//	error: The error from ValidateAndReturnUnixMicroTz if validation fails
//	       (e.g., invalid timestamp format or illegal offset range), otherwise nil.
func CheckAndRewriteTimestampTzDefaultValueForFieldSchema(
	fieldSchema *schemapb.FieldSchema,
	collectionTimezone string,
) error {
	// 1. Only TIMESTAMPTZ fields are rewritten. Without this guard a string
	// default on another field type (e.g. VarChar) would be parsed as a
	// timestamp and either rejected or replaced by an int64.
	if fieldSchema.GetDataType() != schemapb.DataType_Timestamptz {
		return nil
	}

	defaultValue := fieldSchema.GetDefaultValue()
	if defaultValue == nil {
		return nil
	}

	// 2. Read the default value as a string (the initial user-provided format).
	stringTz := defaultValue.GetStringData()
	if stringTz == "" {
		return nil
	}

	// 3. Validate the string and convert it to Unix microseconds.
	// collectionTimezone is applied when stringTz carries no explicit offset.
	utcMicro, err := ValidateAndReturnUnixMicroTz(stringTz, collectionTimezone)
	if err != nil {
		return err
	}

	// 4. Replace the oneof payload: string_data becomes timestamptz_data.
	// defaultValue is the field's own ValueField, so mutating it is sufficient.
	defaultValue.Data = &schemapb.ValueField_TimestamptzData{
		TimestamptzData: utcMicro,
	}
	return nil
}
// RewriteTimestampTzDefaultValueToString converts the default value of every
// TIMESTAMPTZ field in schema from its internal int64 (Unix microsecond) form
// back to a timezone-aware RFC3339Nano string, rewriting the schema in place.
//
// The display timezone is read from the schema properties under
// common.TimezoneKey, falling back to common.DefaultTimezone when absent.
//
// A nil schema is accepted and yields nil. Fields that are not TIMESTAMPTZ,
// have no default value, or whose default value is not stored as
// ValueField_TimestamptzData are left untouched.
//
// Returns an error, annotated with the field name, if the conversion fails.
// Fields processed before the failing one keep their rewritten values.
func RewriteTimestampTzDefaultValueToString(schema *schemapb.CollectionSchema) error {
	if schema == nil {
		return nil
	}

	// 1. Determine the target timezone for display.
	timezone, exist := funcutil.TryGetAttrByKeyFromRepeatedKV(common.TimezoneKey, schema.GetProperties())
	if !exist {
		timezone = common.DefaultTimezone
	}

	// 2. Iterate through all fields in the schema.
	for _, fieldSchema := range schema.GetFields() {
		// Only process TIMESTAMPTZ fields.
		if fieldSchema.GetDataType() != schemapb.DataType_Timestamptz {
			continue
		}
		defaultValue := fieldSchema.GetDefaultValue()
		if defaultValue == nil {
			continue
		}

		// 3. Only values held in the internal timestamptz_data form are converted;
		// anything else (unset, or already a string) is skipped.
		tsData, ok := defaultValue.GetData().(*schemapb.ValueField_TimestamptzData)
		if !ok {
			continue
		}

		// 4. Convert the int64 microsecond value back to a timezone-aware string.
		tzString, err := ConvertUnixMicroToTimezoneString(tsData.TimestamptzData, timezone)
		if err != nil {
			// Wrap once: the cause is appended by Wrapf, so it must not also be
			// embedded in the message.
			return errors.Wrapf(err, "error converting timestamp for field %q", fieldSchema.GetName())
		}

		// 5. Replace the oneof payload with the string representation.
		defaultValue.Data = &schemapb.ValueField_StringData{
			StringData: tzString,
		}
	}
	return nil
}