milvus/pkg/common/common.go
Tianx 2c0c5ef41e
feat: timestamptz expression & index & timezone (#44080)
issue: https://github.com/milvus-io/milvus/issues/27467

>My plan is as follows.
>- [x] M1 Create collection with timestamptz field
>- [x] M2 Insert timestamptz field data
>- [x] M3 Retrieve timestamptz field data
>- [x] M4 Implement handoff
>- [x] M5 Implement compare operator
>- [x] M6 Implement extract operator
>- [x] M8 Support database/collection level default timezone
>- [x] M7 Support STL-SORT index for datatype timestamptz

---

This is the third PR for issue https://github.com/milvus-io/milvus/issues/27467; it completes M5, M6, M7, and M8 described above.

## M8: Default Timezone

A future Python SDK release will expose alter_collection() and alter_database() for modifying the default timezone at the collection or database level.

For insert requests, the timezone is resolved with the following precedence: String Literal -> Collection Default -> Database Default.
For retrieval requests, the timezone is resolved in this order: Query Parameters -> Collection Default -> Database Default.
In both cases, the final fallback timezone is UTC.
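
A minimal Go sketch of this precedence, using a hypothetical `resolveTimezone` helper; only the resolution order and the UTC fallback come from this PR:

```go
// Hypothetical helper illustrating the resolution order described above:
// request-level value -> collection default -> database default -> UTC.
// The properties backing the defaults are database.timezone and
// collection.timezone (see the constants in pkg/common below).
package main

import (
    "fmt"
    "time"
)

func resolveTimezone(requestTZ, collectionTZ, databaseTZ string) *time.Location {
    for _, tz := range []string{requestTZ, collectionTZ, databaseTZ} {
        if tz == "" {
            continue
        }
        if loc, err := time.LoadLocation(tz); err == nil {
            return loc
        }
    }
    return time.UTC // final fallback
}

func main() {
    // No request-level timezone, so the collection default wins.
    fmt.Println(resolveTimezone("", "Asia/Shanghai", "America/New_York")) // Asia/Shanghai
}
```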


## M5: Comparison Operators

We can now use the following expression format to filter on the timestamptz field (a small Go sketch follows the list):

- `timestamptz_field [+/- INTERVAL 'interval_string'] {comparison_op} ISO 'iso_string'`
- The `interval_string` follows the ISO 8601 duration format, for example `P1Y2M3DT1H2M3S`.
- The `iso_string` follows the ISO 8601 timestamp format, for example `2025-01-03T00:00:00+08:00`.
- Example expressions: `tsz + INTERVAL 'P0D' != ISO '2025-01-03T00:00:00+08:00'` or `tsz != ISO '2025-01-03T00:00:00+08:00'`.

## M6: Extract

A future Python SDK release will support extracting specific time fields via kwargs.
The key is `time_fields`, and the value should be one or more of `year`, `month`, `day`, `hour`, `minute`, `second`, `microsecond`, separated by commas or spaces. The result for each record is then an array of int64 values.
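
A minimal sketch of the output shape, using Go's time package; the `extractFields` helper is hypothetical, and only the field names and the int64-array result come from this PR:

```go
package main

import (
    "fmt"
    "time"
)

// extractFields mirrors the documented behaviour: for one record, each
// requested time field becomes one int64 in the result array.
func extractFields(ts time.Time, fields []string) []int64 {
    out := make([]int64, 0, len(fields))
    for _, f := range fields {
        switch f {
        case "year":
            out = append(out, int64(ts.Year()))
        case "month":
            out = append(out, int64(ts.Month()))
        case "day":
            out = append(out, int64(ts.Day()))
        case "hour":
            out = append(out, int64(ts.Hour()))
        case "minute":
            out = append(out, int64(ts.Minute()))
        case "second":
            out = append(out, int64(ts.Second()))
        case "microsecond":
            out = append(out, int64(ts.Nanosecond()/1000))
        }
    }
    return out
}

func main() {
    ts, _ := time.Parse(time.RFC3339, "2025-01-03T08:30:15+08:00")
    fmt.Println(extractFields(ts, []string{"year", "month", "day", "hour"})) // [2025 1 3 8]
}
```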



## M7: Indexing Support

Expressions without interval arithmetic can be accelerated using an
STL-SORT index. However, expressions that include interval arithmetic
cannot be indexed. This is because the result of an interval calculation
depends on the specific timestamp value. For example, adding one month
to a date in February results in a different number of added days than
adding one month to a date in March.
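
A quick illustration with Go's standard library of why the shifted ordering cannot be precomputed once for the column: adding one month spans a different number of days depending on the base value:

```go
package main

import (
    "fmt"
    "time"
)

func main() {
    feb := time.Date(2025, 2, 1, 0, 0, 0, 0, time.UTC)
    mar := time.Date(2025, 3, 1, 0, 0, 0, 0, time.UTC)

    // "+ one month" adds 28 days starting in February 2025...
    fmt.Println(feb.AddDate(0, 1, 0).Sub(feb).Hours() / 24) // 28
    // ...but 31 days starting in March 2025.
    fmt.Println(mar.AddDate(0, 1, 0).Sub(mar).Hours() / 24) // 31
}
```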

--- 

After this PR, the input/output type of a timestamptz field is an ISO 8601 string. Internally, timestamptz values are stored as int64.

> for more information, see https://en.wikipedia.org/wiki/ISO_8601
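
A minimal sketch of that ISO-string-in / int64-storage shape; the exact epoch and precision of the stored int64 are assumptions here (shown as Unix microseconds), not taken from this PR:

```go
package main

import (
    "fmt"
    "time"
)

func main() {
    // Input: ISO 8601 string (time.RFC3339 is a compatible layout).
    t, err := time.Parse(time.RFC3339, "2025-01-03T00:00:00+08:00")
    if err != nil {
        panic(err)
    }

    stored := t.UnixMicro() // int64 stand-in for the on-disk representation

    // Output: rendered back as an ISO string in the resolved timezone (UTC fallback).
    fmt.Println(stored, time.UnixMicro(stored).UTC().Format(time.RFC3339))
}
```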

---------

Signed-off-by: xtx <xtianx@smail.nju.edu.cn>
2025-09-23 10:24:12 +08:00


// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package common

import (
    "encoding/binary"
    "fmt"
    "math/bits"
    "strconv"
    "strings"

    "github.com/cockroachdb/errors"
    "github.com/samber/lo"
    "go.uber.org/zap"

    "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
    "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
    "github.com/milvus-io/milvus/pkg/v2/log"
)

// system field id:
// 0: unique row id
// 1: timestamp
// 100: first user field id
// 101: second user field id
// 102: ...
const (
    // StartOfUserFieldID represents the starting ID of the user-defined field
    StartOfUserFieldID = 100
    // StartOfUserFunctionID represents the starting ID of the user-defined function
    StartOfUserFunctionID = 100
    // RowIDField is the ID of the RowID field reserved by the system
    RowIDField = 0
    // TimeStampField is the ID of the Timestamp field reserved by the system
    TimeStampField = 1
    // RowIDFieldName defines the name of the RowID field
    RowIDFieldName = "RowID"
    // TimeStampFieldName defines the name of the Timestamp field
    TimeStampFieldName = "Timestamp"
    // NamespaceFieldName defines the name of the Namespace field
    NamespaceFieldName = "$namespace_id"
    // MetaFieldName is the field name of the dynamic schema
    MetaFieldName = "$meta"
    // DefaultShardsNum defines the default number of shards when creating a collection
    DefaultShardsNum = int32(1)
    // DefaultPartitionsWithPartitionKey defines the default number of partitions when using a partition key
    DefaultPartitionsWithPartitionKey = int64(16)
    // InvalidPartitionID indicates that the partition is not specified. It will be set when the partitionName is empty
    InvalidPartitionID = int64(-1)
    // AllPartitionsID indicates data applies to all partitions.
    AllPartitionsID = int64(-1)
    // InvalidFieldID indicates that the field does not exist. It will be set when the field is not found.
    InvalidFieldID = int64(-1)
    // NotRegisteredID means the node is not registered into etcd.
    NotRegisteredID = int64(-1)
    // InvalidNodeID indicates that the node is not valid in a querycoord replica or shard cluster.
    InvalidNodeID = int64(-1)
    SystemFieldsNum = int64(2)
)

const (
    MinimalScalarIndexEngineVersion = int32(0)
    CurrentScalarIndexEngineVersion = int32(2)
)

// Endian is an alias of binary.LittleEndian.
// Milvus uses little endian by default.
var Endian = binary.LittleEndian

const (
    // SegmentInsertLogPath storage path const for segment insert binlog.
    SegmentInsertLogPath = `insert_log`
    // SegmentDeltaLogPath storage path const for segment delta log.
    SegmentDeltaLogPath = `delta_log`
    // SegmentStatslogPath storage path const for segment stats log.
    SegmentStatslogPath = `stats_log`
    // SegmentIndexPath storage path const for segment index files.
    SegmentIndexPath = `index_files`
    // SegmentBm25LogPath storage path const for bm25 statistic
    SegmentBm25LogPath = `bm25_stats`
    // PartitionStatsPath storage path const for partition stats files
    PartitionStatsPath = `part_stats`
    // AnalyzeStatsPath storage path const for analyze.
    AnalyzeStatsPath = `analyze_stats`
    OffsetMapping = `offset_mapping`
    Centroids = "centroids"
    // TextIndexPath storage path const for text index
    TextIndexPath = "text_log"
    // JSONIndexPath storage path const for json index
    JSONIndexPath = "json_key_index_log"
    // JSONStatsPath storage path const for json stats
    JSONStatsPath = "json_stats"
)

const (
    JSONStatsDataFormatVersion = 2
)

// Search, Index parameter keys
const (
    TopKKey = "topk"
    SearchParamKey = "search_param"
    SegmentNumKey = "segment_num"
    WithFilterKey = "with_filter"
    DataTypeKey = "data_type"
    ChannelNumKey = "channel_num"
    WithOptimizeKey = "with_optimize"
    CollectionKey = "collection"
    RecallEvalKey = "recall_eval"
    ParamsKey = "params"
    IndexTypeKey = "index_type"
    MetricTypeKey = "metric_type"
    DimKey = "dim"
    MaxLengthKey = "max_length"
    MaxCapacityKey = "max_capacity"
    DropRatioBuildKey = "drop_ratio_build"
    IsSparseKey = "is_sparse"
    AutoIndexName = "AUTOINDEX"
    BitmapCardinalityLimitKey = "bitmap_cardinality_limit"
    IgnoreGrowing = "ignore_growing"
    ConsistencyLevel = "consistency_level"
    HintsKey = "hints"
    JSONCastTypeKey = "json_cast_type"
    JSONPathKey = "json_path"
    JSONCastFunctionKey = "json_cast_function"
)

// expr query params
const (
    ExprUseJSONStatsKey = "expr_use_json_stats"
)

// Doc-in-doc-out
const (
    EnableAnalyzerKey = `enable_analyzer`
    AnalyzerParamKey = `analyzer_params`
)

// Collection properties key
const (
    CollectionTTLConfigKey = "collection.ttl.seconds"
    CollectionAutoCompactionKey = "collection.autocompaction.enabled"
    CollectionDescription = "collection.description"
    // rate limit
    CollectionInsertRateMaxKey = "collection.insertRate.max.mb"
    CollectionInsertRateMinKey = "collection.insertRate.min.mb"
    CollectionUpsertRateMaxKey = "collection.upsertRate.max.mb"
    CollectionUpsertRateMinKey = "collection.upsertRate.min.mb"
    CollectionDeleteRateMaxKey = "collection.deleteRate.max.mb"
    CollectionDeleteRateMinKey = "collection.deleteRate.min.mb"
    CollectionBulkLoadRateMaxKey = "collection.bulkLoadRate.max.mb"
    CollectionBulkLoadRateMinKey = "collection.bulkLoadRate.min.mb"
    CollectionQueryRateMaxKey = "collection.queryRate.max.qps"
    CollectionQueryRateMinKey = "collection.queryRate.min.qps"
    CollectionSearchRateMaxKey = "collection.searchRate.max.vps"
    CollectionSearchRateMinKey = "collection.searchRate.min.vps"
    CollectionDiskQuotaKey = "collection.diskProtection.diskQuota.mb"
    PartitionDiskQuotaKey = "partition.diskProtection.diskQuota.mb"
    // database level properties
    DatabaseReplicaNumber = "database.replica.number"
    DatabaseResourceGroups = "database.resource_groups"
    DatabaseDiskQuotaKey = "database.diskQuota.mb"
    DatabaseMaxCollectionsKey = "database.max.collections"
    DatabaseForceDenyWritingKey = "database.force.deny.writing"
    DatabaseForceDenyReadingKey = "database.force.deny.reading"
    DatabaseForceDenyDDLKey = "database.force.deny.ddl" // all ddl
    DatabaseForceDenyCollectionDDLKey = "database.force.deny.collectionDDL"
    DatabaseForceDenyPartitionDDLKey = "database.force.deny.partitionDDL"
    DatabaseForceDenyIndexDDLKey = "database.force.deny.index"
    DatabaseForceDenyFlushDDLKey = "database.force.deny.flush"
    DatabaseForceDenyCompactionDDLKey = "database.force.deny.compaction"
    // collection level load properties
    CollectionReplicaNumber = "collection.replica.number"
    CollectionResourceGroups = "collection.resource_groups"
)

// common properties
const (
    MmapEnabledKey = "mmap.enabled"
    LazyLoadEnableKey = "lazyload.enabled"
    LoadPriorityKey = "load_priority"
    PartitionKeyIsolationKey = "partitionkey.isolation"
    FieldSkipLoadKey = "field.skipLoad"
    IndexOffsetCacheEnabledKey = "indexoffsetcache.enabled"
    ReplicateIDKey = "replicate.id"
    ReplicateEndTSKey = "replicate.endTS"
    IndexNonEncoding = "index.nonEncoding"
    EnableDynamicSchemaKey = `dynamicfield.enabled`
    NamespaceEnabledKey = "namespace.enabled"
    // timezone related
    DatabaseDefaultTimezone = "database.timezone"
    CollectionDefaultTimezone = "collection.timezone"
)

const (
    PropertiesKey string = "properties"
    TraceIDKey string = "uber-trace-id"
    ClientRequestMsecKey string = "client-request-unixmsec"
)

// Timestamptz field: valid values for the time_fields extraction parameter.
const (
    TszYear string = "year"
    TszMonth string = "month"
    TszDay string = "day"
    TszHour string = "hour"
    TszMinute string = "minute"
    TszSecond string = "second"
    TszMicrosecond string = "microsecond"
)

func IsSystemField(fieldID int64) bool {
    return fieldID < StartOfUserFieldID
}

func IsMmapDataEnabled(kvs ...*commonpb.KeyValuePair) (bool, bool) {
    for _, kv := range kvs {
        if kv.Key == MmapEnabledKey {
            enable, _ := strconv.ParseBool(kv.Value)
            return enable, true
        }
    }
    return false, false
}

func IsMmapIndexEnabled(kvs ...*commonpb.KeyValuePair) (bool, bool) {
    for _, kv := range kvs {
        if kv.Key == MmapEnabledKey {
            enable, _ := strconv.ParseBool(kv.Value)
            return enable, true
        }
    }
    return false, false
}

func GetIndexType(indexParams []*commonpb.KeyValuePair) string {
    for _, param := range indexParams {
        if param.Key == IndexTypeKey {
            return param.Value
        }
    }
    log.Warn("IndexType not found in indexParams")
    return ""
}

func FieldHasMmapKey(schema *schemapb.CollectionSchema, fieldID int64) bool {
    for _, field := range schema.GetFields() {
        if field.GetFieldID() == fieldID {
            for _, kv := range field.GetTypeParams() {
                if kv.Key == MmapEnabledKey {
                    return true
                }
            }
            return false
        }
    }
    return false
}

func HasLazyload(props []*commonpb.KeyValuePair) bool {
    for _, kv := range props {
        if kv.Key == LazyLoadEnableKey {
            return true
        }
    }
    return false
}

func IsCollectionLazyLoadEnabled(kvs ...*commonpb.KeyValuePair) bool {
    for _, kv := range kvs {
        if kv.Key == LazyLoadEnableKey && strings.ToLower(kv.Value) == "true" {
            return true
        }
    }
    return false
}

func IsPartitionKeyIsolationKvEnabled(kvs ...*commonpb.KeyValuePair) (bool, error) {
    for _, kv := range kvs {
        if kv.Key == PartitionKeyIsolationKey {
            val, err := strconv.ParseBool(strings.ToLower(kv.Value))
            if err != nil {
                return false, errors.Wrap(err, "failed to parse partition key isolation")
            }
            return val, nil
        }
    }
    return false, nil
}

func IsPartitionKeyIsolationPropEnabled(props map[string]string) (bool, error) {
    val, ok := props[PartitionKeyIsolationKey]
    if !ok {
        return false, nil
    }
    iso, parseErr := strconv.ParseBool(val)
    if parseErr != nil {
        return false, errors.Wrap(parseErr, "failed to parse partition key isolation property")
    }
    return iso, nil
}

const (
    // LatestRevision is the magic number for watching the latest revision
    LatestRevision = int64(-1)
)

func DatabaseLevelReplicaNumber(kvs []*commonpb.KeyValuePair) (int64, error) {
    for _, kv := range kvs {
        if kv.Key == DatabaseReplicaNumber {
            replicaNum, err := strconv.ParseInt(kv.Value, 10, 64)
            if err != nil {
                return 0, fmt.Errorf("invalid database property: [key=%s] [value=%s]", kv.Key, kv.Value)
            }
            return replicaNum, nil
        }
    }
    return 0, fmt.Errorf("database property not found: %s", DatabaseReplicaNumber)
}

func DatabaseLevelResourceGroups(kvs []*commonpb.KeyValuePair) ([]string, error) {
    for _, kv := range kvs {
        if kv.Key == DatabaseResourceGroups {
            invalidPropValue := fmt.Errorf("invalid database property: [key=%s] [value=%s]", kv.Key, kv.Value)
            if len(kv.Value) == 0 {
                return nil, invalidPropValue
            }
            rgs := strings.Split(kv.Value, ",")
            if len(rgs) == 0 {
                return nil, invalidPropValue
            }
            return lo.Map(rgs, func(rg string, _ int) string { return strings.TrimSpace(rg) }), nil
        }
    }
    return nil, fmt.Errorf("database property not found: %s", DatabaseResourceGroups)
}

func CollectionLevelReplicaNumber(kvs []*commonpb.KeyValuePair) (int64, error) {
    for _, kv := range kvs {
        if kv.Key == CollectionReplicaNumber {
            replicaNum, err := strconv.ParseInt(kv.Value, 10, 64)
            if err != nil {
                return 0, fmt.Errorf("invalid collection property: [key=%s] [value=%s]", kv.Key, kv.Value)
            }
            return replicaNum, nil
        }
    }
    return 0, fmt.Errorf("collection property not found: %s", CollectionReplicaNumber)
}

func CollectionLevelResourceGroups(kvs []*commonpb.KeyValuePair) ([]string, error) {
    for _, kv := range kvs {
        if kv.Key == CollectionResourceGroups {
            invalidPropValue := fmt.Errorf("invalid collection property: [key=%s] [value=%s]", kv.Key, kv.Value)
            if len(kv.Value) == 0 {
                return nil, invalidPropValue
            }
            rgs := strings.Split(kv.Value, ",")
            if len(rgs) == 0 {
                return nil, invalidPropValue
            }
            return lo.Map(rgs, func(rg string, _ int) string { return strings.TrimSpace(rg) }), nil
        }
    }
    return nil, fmt.Errorf("collection property not found: %s", CollectionResourceGroups)
}

// GetCollectionLoadFields returns the load field ids according to the type params.
func GetCollectionLoadFields(schema *schemapb.CollectionSchema, skipDynamicField bool) []int64 {
    filter := func(field *schemapb.FieldSchema, _ int) (int64, bool) {
        // skip system field
        if IsSystemField(field.GetFieldID()) {
            return field.GetFieldID(), false
        }
        // skip dynamic field if specified
        if field.IsDynamic && skipDynamicField {
            return field.GetFieldID(), false
        }
        v, err := ShouldFieldBeLoaded(field.GetTypeParams())
        if err != nil {
            log.Warn("type param parse skip load failed", zap.Error(err))
            // if configuration cannot be parsed, ignore it and load field
            return field.GetFieldID(), true
        }
        return field.GetFieldID(), v
    }
    fields := lo.FilterMap(schema.GetFields(), filter)
    fieldsNum := len(schema.GetFields())
    for _, structField := range schema.GetStructArrayFields() {
        fields = append(fields, lo.FilterMap(structField.GetFields(), filter)...)
        fieldsNum += len(structField.GetFields())
    }
    // empty fields list means all fields will be loaded
    if len(fields) == fieldsNum-int(SystemFieldsNum) {
        return []int64{}
    }
    return fields
}

func ShouldFieldBeLoaded(kvs []*commonpb.KeyValuePair) (bool, error) {
    for _, kv := range kvs {
        if kv.GetKey() == FieldSkipLoadKey {
            val, err := strconv.ParseBool(kv.GetValue())
            return !val, err
        }
    }
    return true, nil
}

func IsReplicateEnabled(kvs []*commonpb.KeyValuePair) (bool, bool) {
    replicateID, ok := GetReplicateID(kvs)
    return replicateID != "", ok
}

func GetReplicateID(kvs []*commonpb.KeyValuePair) (string, bool) {
    for _, kv := range kvs {
        if kv.GetKey() == ReplicateIDKey {
            return kv.GetValue(), true
        }
    }
    return "", false
}

func GetReplicateEndTS(kvs []*commonpb.KeyValuePair) (uint64, bool) {
    for _, kv := range kvs {
        if kv.GetKey() == ReplicateEndTSKey {
            ts, err := strconv.ParseUint(kv.GetValue(), 10, 64)
            if err != nil {
                log.Warn("parse replicate end ts failed", zap.Error(err), zap.Stack("stack"))
                return 0, false
            }
            return ts, true
        }
    }
    return 0, false
}

func IsEnableDynamicSchema(kvs []*commonpb.KeyValuePair) (found bool, value bool, err error) {
    for _, kv := range kvs {
        if kv.GetKey() == EnableDynamicSchemaKey {
            value, err = strconv.ParseBool(kv.GetValue())
            return true, value, err
        }
    }
    return false, false, nil
}

func ValidateAutoIndexMmapConfig(autoIndexConfigEnable, isVectorField bool, indexParams map[string]string) error {
    if !autoIndexConfigEnable {
        return nil
    }
    _, ok := indexParams[MmapEnabledKey]
    if ok && isVectorField {
        return errors.New("mmap index is not supported to config for the collection in auto index mode")
    }
    return nil
}

func ParseNamespaceProp(props ...*commonpb.KeyValuePair) (value bool, has bool, err error) {
    for _, p := range props {
        if p.GetKey() == NamespaceEnabledKey {
            value, err := strconv.ParseBool(p.GetValue())
            if err != nil {
                return false, false, fmt.Errorf("invalid namespace prop value: %s", p.GetValue())
            }
            return value, true, nil
        }
    }
    return false, false, nil
}

// AllocAutoID allocates rowNum IDs via allocFunc and embeds the bit-reversed
// clusterID into the high bits of each ID, keeping auto IDs distinct across clusters.
func AllocAutoID(allocFunc func(uint32) (int64, int64, error), rowNum uint32, clusterID uint64) (int64, int64, error) {
    idStart, idEnd, err := allocFunc(rowNum)
    if err != nil {
        return 0, 0, err
    }
    reversed := bits.Reverse64(clusterID)
    // right shift by 1 to preserve sign bit
    reversed = reversed >> 1
    return idStart | int64(reversed), idEnd | int64(reversed), nil
}