enhance: Add json key inverted index in stats for optimization (#38039)

Add json key inverted index in stats for optimization
https://github.com/milvus-io/milvus/issues/36995

---------

Signed-off-by: Xianhui.Lin <xianhui.lin@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
Xianhui Lin 2025-04-10 15:20:28 +08:00 committed by GitHub
parent a308d2c886
commit 3bc24c264f
133 changed files with 10969 additions and 4913 deletions

View File

@ -414,6 +414,7 @@ queryNode:
buildParallelRate: 0.5 # the ratio of building interim index parallel matched with cpu num
multipleChunkedEnable: true # Enable multiple chunked search
knowhereScoreConsistency: false # Enable knowhere strong consistency score computation logic
jsonKeyStatsCommitInterval: 200 # the commit interval for JSON key stats
loadMemoryUsageFactor: 1 # The multiply factor of calculating the memory usage while loading segments
enableDisk: false # enable querynode load disk index, and search on disk index
maxDiskUsagePercentage: 95 maxDiskUsagePercentage: 95
@ -636,6 +637,10 @@ dataCoord:
indexTaskSlotUsage: 64 # slot usage of index task per 512mb
statsTaskSlotUsage: 8 # slot usage of stats task per 512mb
analyzeTaskSlotUsage: 65535 # slot usage of analyze task
jsonStatsTriggerCount: 10 # number of JSON key stats tasks per trigger
jsonStatsTriggerInterval: 10 # interval between JSON key stats task triggers
enabledJSONKeyStatsInSort: false # Indicates whether to build JSON key stats as part of the sort stats task
jsonKeyStatsMemoryBudgetInTantivy: 16777216 # the memory budget for the JSON key index in Tantivy, in bytes
ip: # TCP/IP address of dataCoord. If not specified, use the first unicastable address
port: 13333 # TCP port of dataCoord
grpc:
@ -891,6 +896,8 @@ common:
sync:
taskPoolReleaseTimeoutSeconds: 60 # The maximum time to wait for the task to finish and release resources in the pool
enabledOptimizeExpr: true # Indicates whether to enable optimize expr
enabledJSONKeyStats: false # Indicates whether to enable JSON key stats for sealed segments
enabledGrowingSegmentJSONKeyStats: false # Indicates whether to enable JSON key stats for growing segments
# QuotaConfig, configurations of Milvus quota and limits.
# By default, we enable:

View File

@ -30,6 +30,9 @@ int CPU_NUM = DEFAULT_CPU_NUM;
int64_t EXEC_EVAL_EXPR_BATCH_SIZE = DEFAULT_EXEC_EVAL_EXPR_BATCH_SIZE;
bool OPTIMIZE_EXPR_ENABLED = DEFAULT_OPTIMIZE_EXPR_ENABLED;
int64_t JSON_KEY_STATS_COMMIT_INTERVAL = DEFAULT_JSON_KEY_STATS_COMMIT_INTERVAL;
bool GROWING_JSON_KEY_STATS_ENABLED = DEFAULT_GROWING_JSON_KEY_STATS_ENABLED;
void
SetIndexSliceSize(const int64_t size) {
    FILE_SLICE_SIZE = size << 20;
@ -74,4 +77,18 @@ SetDefaultOptimizeExprEnable(bool val) {
    LOG_INFO("set default optimize expr enabled: {}", OPTIMIZE_EXPR_ENABLED);
}
void
SetDefaultJSONKeyStatsCommitInterval(int64_t val) {
JSON_KEY_STATS_COMMIT_INTERVAL = val;
LOG_INFO("set default json key Stats commit interval: {}",
JSON_KEY_STATS_COMMIT_INTERVAL);
}
void
SetDefaultGrowingJSONKeyStatsEnable(bool val) {
GROWING_JSON_KEY_STATS_ENABLED = val;
LOG_INFO("set default growing json key index enable: {}",
GROWING_JSON_KEY_STATS_ENABLED);
}
}  // namespace milvus

View File

@ -29,8 +29,9 @@ extern int64_t MIDDLE_PRIORITY_THREAD_CORE_COEFFICIENT;
extern int64_t LOW_PRIORITY_THREAD_CORE_COEFFICIENT;
extern int CPU_NUM;
extern int64_t EXEC_EVAL_EXPR_BATCH_SIZE;
extern int64_t JSON_KEY_STATS_COMMIT_INTERVAL;
extern bool OPTIMIZE_EXPR_ENABLED;
extern bool GROWING_JSON_KEY_STATS_ENABLED;
void
SetIndexSliceSize(const int64_t size);
@ -52,6 +53,12 @@ SetDefaultExecEvalExprBatchSize(int64_t val);
void
SetDefaultOptimizeExprEnable(bool val);
void
SetDefaultJSONKeyStatsCommitInterval(int64_t val);
void
SetDefaultGrowingJSONKeyStatsEnable(bool val);
struct BufferView {
    struct Element {
        const char* data_;

View File

@ -49,6 +49,7 @@ const char PAGE_RETAIN_ORDER[] = "page_retain_order";
const char TEXT_LOG_ROOT_PATH[] = "text_log";
const char ITERATIVE_FILTER[] = "iterative_filter";
const char HINTS[] = "hints";
const char JSON_KEY_INDEX_LOG_ROOT_PATH[] = "json_key_index_log";
const char DEFAULT_PLANNODE_ID[] = "0";
const char DEAFULT_QUERY_ID[] = "0";
@ -82,3 +83,6 @@ const std::string JSON_CAST_TYPE = "json_cast_type";
const std::string JSON_PATH = "json_path";
const bool DEFAULT_OPTIMIZE_EXPR_ENABLED = true;
const int64_t DEFAULT_CONVERT_OR_TO_IN_NUMERIC_LIMIT = 150;
const int64_t DEFAULT_JSON_INDEX_MEMORY_BUDGET = 16777216; // bytes, 16MB
const bool DEFAULT_GROWING_JSON_KEY_STATS_ENABLED = false;
const int64_t DEFAULT_JSON_KEY_STATS_COMMIT_INTERVAL = 200;

View File

@ -12,7 +12,7 @@
#include "common/FieldMeta.h"
#include "common/SystemProperty.h"
#include "common/protobuf_utils.h"
#include "common/Common.h"
#include <boost/lexical_cast.hpp>
#include <optional>
@ -39,6 +39,11 @@ FieldMeta::enable_match() const {
    return string_info_->enable_match;
}
bool
FieldMeta::enable_growing_jsonStats() const {
return IsJsonDataType(type_) && GROWING_JSON_KEY_STATS_ENABLED;
}
bool
FieldMeta::enable_analyzer() const {
    if (!IsStringDataType(type_)) {

View File

@ -148,6 +148,9 @@ class FieldMeta {
bool
enable_analyzer() const;
bool
enable_growing_jsonStats() const;
TokenizerParams
get_analyzer_params() const;

View File

@ -149,6 +149,25 @@ class Json {
    return doc;
}
value_result<document>
doc(uint16_t offset, uint16_t length) const {
thread_local simdjson::ondemand::parser parser;
// it's always safe to add the padding,
// as we have allocated the memory with this padding
auto doc = parser.iterate(
data_.data() + offset, length, length + simdjson::SIMDJSON_PADDING);
AssertInfo(doc.error() == simdjson::SUCCESS,
"failed to parse the json {} offset {}, length {}: {}, "
"total_json:{}",
std::string(data_.data() + offset, length),
offset,
length,
simdjson::error_message(doc.error()),
data_);
return doc;
}
value_result<simdjson::dom::element>
dom_doc() const {
    if (data_.size() == 0) {
@ -166,6 +185,20 @@ class Json {
    return doc;
}
value_result<simdjson::dom::element>
dom_doc(uint16_t offset, uint16_t length) const {
thread_local simdjson::dom::parser parser;
// it's always safe to add the padding,
// as we have allocated the memory with this padding
auto doc = parser.parse(data_.data() + offset, length);
AssertInfo(doc.error() == simdjson::SUCCESS,
"failed to parse the json {}: {}",
std::string(data_.data() + offset, length),
simdjson::error_message(doc.error()));
return doc;
}
bool
exist(std::string_view pointer) const {
    auto doc = this->doc();
@ -207,6 +240,22 @@ class Json {
    return doc().at_pointer(pointer).get<T>();
}
template <typename T>
value_result<T>
at(uint16_t offset, uint16_t length) const {
return doc(offset, length).get<T>();
}
std::string_view
at_string(uint16_t offset, uint16_t length) const {
return std::string_view(data_.data() + offset, length);
}
value_result<simdjson::dom::array>
array_at(uint16_t offset, uint16_t length) const {
return dom_doc(offset, length).get_array();
}
// get dom array by JSON pointer,
// call `size()` to get array size,
// call `at()` to get array element by index,
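These offset/length overloads let callers re-parse only the value bytes that the JSON key index recorded for a row, instead of walking the whole document. A minimal standalone sketch of the idea (hypothetical offset/length values; assumes simdjson is available). Because the full row lives in padded memory, parsing a sub-span with `length + SIMDJSON_PADDING` as the allocated size is safe, exactly as the comment in the diff notes.

#include <simdjson.h>
#include <cstdint>
#include <iostream>
#include <string>

int main() {
    // The full JSON row, stored with simdjson padding at the end.
    simdjson::padded_string row(std::string(R"({"a": {"b": 42}, "c": "xyz"})"));

    // Pretend a key index mapped pointer "/a/b" to this value span.
    uint16_t offset = 12, length = 2;  // the bytes "42"

    simdjson::ondemand::parser parser;
    // Sub-span parse: any span inside the padded row has at least
    // SIMDJSON_PADDING readable bytes past its end.
    auto doc = parser.iterate(
        row.data() + offset, length, length + simdjson::SIMDJSON_PADDING);

    int64_t v = 0;
    if (doc.get_int64().get(v) == simdjson::SUCCESS) {
        std::cout << v << "\n";  // prints 42
    }
}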

View File

@ -25,7 +25,7 @@
#include "common/Tracer.h"
#include "log/Log.h"
std::once_flag flag1, flag2, flag3, flag4, flag5, flag6, flag7, flag8, flag9;
std::once_flag traceFlag;
void
@ -86,6 +86,22 @@ InitDefaultOptimizeExprEnable(bool val) {
        val);
}
void
InitDefaultJSONKeyStatsCommitInterval(int64_t val) {
std::call_once(
flag8,
[](int val) { milvus::SetDefaultJSONKeyStatsCommitInterval(val); },
val);
}
void
InitDefaultGrowingJSONKeyStatsEnable(bool val) {
std::call_once(
flag9,
[](bool val) { milvus::SetDefaultGrowingJSONKeyStatsEnable(val); },
val);
}
void
InitTrace(CTraceConfig* config) {
    auto traceConfig = milvus::tracer::TraceConfig{config->exporter,
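The new Init* wrappers are one-shot by design: std::call_once keeps the first value handed in and turns every later call into a no-op. A minimal standalone sketch of the same pattern (the demo namespace and names are illustrative, not the Milvus API):

#include <cstdint>
#include <iostream>
#include <mutex>

namespace demo {
int64_t JSON_KEY_STATS_COMMIT_INTERVAL = 200;  // compiled-in default
void
SetDefaultJSONKeyStatsCommitInterval(int64_t val) {
    JSON_KEY_STATS_COMMIT_INTERVAL = val;
}
}  // namespace demo

std::once_flag flag8;

void
InitDefaultJSONKeyStatsCommitInterval(int64_t val) {
    // Only the first call wins; repeated initialization is ignored.
    std::call_once(
        flag8,
        [](int64_t v) { demo::SetDefaultJSONKeyStatsCommitInterval(v); },
        val);
}

int main() {
    InitDefaultJSONKeyStatsCommitInterval(500);
    InitDefaultJSONKeyStatsCommitInterval(900);  // no effect
    std::cout << demo::JSON_KEY_STATS_COMMIT_INTERVAL << "\n";  // prints 500
}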

View File

@ -51,6 +51,12 @@ SetTrace(CTraceConfig* config);
void
InitDefaultOptimizeExprEnable(bool val);
void
InitDefaultJSONKeyStatsCommitInterval(int64_t val);
void
InitDefaultGrowingJSONKeyStatsEnable(bool val);
#ifdef __cplusplus
};
#endif

View File

@ -0,0 +1,509 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
/*
* MIT License
*
* Copyright (c) 2010 Serge Zaitsev
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef JSMN_H
#define JSMN_H
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
#define JSMN_STATIC
#ifdef JSMN_STATIC
#define JSMN_API static
#else
#define JSMN_API extern
#endif
/**
* JSON type identifier. Basic types are:
* o Object
* o Array
* o String
* o Other primitive: number, boolean (true/false) or null
*/
typedef enum {
JSMN_UNDEFINED = 0,
JSMN_OBJECT = 1 << 0,
JSMN_ARRAY = 1 << 1,
JSMN_STRING = 1 << 2,
JSMN_PRIMITIVE = 1 << 3
} jsmntype_t;
enum jsmnerr {
/* Not enough tokens were provided */
JSMN_ERROR_NOMEM = -1,
/* Invalid character inside JSON string */
JSMN_ERROR_INVAL = -2,
/* The string is not a full JSON packet, more bytes expected */
JSMN_ERROR_PART = -3
};
/**
* JSON token description.
* type type (object, array, string etc.)
* start start position in JSON data string
* end end position in JSON data string
*/
typedef struct jsmntok {
jsmntype_t type;
int start;
int end;
int size;
#ifdef JSMN_PARENT_LINKS
int parent;
#endif
} jsmntok_t;
/**
* JSON parser. Contains an array of token blocks available. Also stores
* the string being parsed now and current position in that string.
*/
typedef struct jsmn_parser {
unsigned int pos; /* offset in the JSON string */
unsigned int toknext; /* next token to allocate */
int toksuper; /* superior token node, e.g. parent object or array */
} jsmn_parser;
/**
* Create JSON parser over an array of tokens
*/
JSMN_API void
jsmn_init(jsmn_parser* parser);
/**
 * Run JSON parser. It parses a JSON data string into an array of tokens,
 * each describing a single JSON object.
*/
JSMN_API int
jsmn_parse(jsmn_parser* parser,
const char* js,
const size_t len,
jsmntok_t* tokens,
const unsigned int num_tokens);
#ifndef JSMN_HEADER
/**
* Allocates a fresh unused token from the token pool.
*/
static jsmntok_t*
jsmn_alloc_token(jsmn_parser* parser,
jsmntok_t* tokens,
const size_t num_tokens) {
jsmntok_t* tok;
if (parser->toknext >= num_tokens) {
return NULL;
}
tok = &tokens[parser->toknext++];
tok->start = tok->end = -1;
tok->size = 0;
#ifdef JSMN_PARENT_LINKS
tok->parent = -1;
#endif
return tok;
}
/**
* Fills token type and boundaries.
*/
static void
jsmn_fill_token(jsmntok_t* token,
const jsmntype_t type,
const int start,
const int end) {
token->type = type;
token->start = start;
token->end = end;
token->size = 0;
}
/**
* Fills next available token with JSON primitive.
*/
static int
jsmn_parse_primitive(jsmn_parser* parser,
const char* js,
const size_t len,
jsmntok_t* tokens,
const size_t num_tokens) {
jsmntok_t* token;
int start;
start = parser->pos;
for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
switch (js[parser->pos]) {
#ifndef JSMN_STRICT
/* In strict mode primitive must be followed by "," or "}" or "]" */
case ':':
#endif
case '\t':
case '\r':
case '\n':
case ' ':
case ',':
case ']':
case '}':
goto found;
default:
/* to quiet a warning from gcc*/
break;
}
if (js[parser->pos] < 32 || js[parser->pos] >= 127) {
parser->pos = start;
return JSMN_ERROR_INVAL;
}
}
#ifdef JSMN_STRICT
/* In strict mode primitive must be followed by a comma/object/array */
parser->pos = start;
return JSMN_ERROR_PART;
#endif
found:
if (tokens == NULL) {
parser->pos--;
return 0;
}
token = jsmn_alloc_token(parser, tokens, num_tokens);
if (token == NULL) {
parser->pos = start;
return JSMN_ERROR_NOMEM;
}
jsmn_fill_token(token, JSMN_PRIMITIVE, start, parser->pos);
#ifdef JSMN_PARENT_LINKS
token->parent = parser->toksuper;
#endif
parser->pos--;
return 0;
}
/**
* Fills next token with JSON string.
*/
static int
jsmn_parse_string(jsmn_parser* parser,
const char* js,
const size_t len,
jsmntok_t* tokens,
const size_t num_tokens) {
jsmntok_t* token;
int start = parser->pos;
/* Skip starting quote */
parser->pos++;
for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
char c = js[parser->pos];
/* Quote: end of string */
if (c == '\"') {
if (tokens == NULL) {
return 0;
}
token = jsmn_alloc_token(parser, tokens, num_tokens);
if (token == NULL) {
parser->pos = start;
return JSMN_ERROR_NOMEM;
}
jsmn_fill_token(token, JSMN_STRING, start + 1, parser->pos);
#ifdef JSMN_PARENT_LINKS
token->parent = parser->toksuper;
#endif
return 0;
}
/* Backslash: Quoted symbol expected */
if (c == '\\' && parser->pos + 1 < len) {
int i;
parser->pos++;
switch (js[parser->pos]) {
/* Allowed escaped symbols */
case '\"':
case '/':
case '\\':
case 'b':
case 'f':
case 'r':
case 'n':
case 't':
break;
/* Allows escaped symbol \uXXXX */
case 'u':
parser->pos++;
for (i = 0;
i < 4 && parser->pos < len && js[parser->pos] != '\0';
i++) {
/* If it isn't a hex character we have an error */
if (!((js[parser->pos] >= 48 &&
js[parser->pos] <= 57) || /* 0-9 */
(js[parser->pos] >= 65 &&
js[parser->pos] <= 70) || /* A-F */
(js[parser->pos] >= 97 &&
js[parser->pos] <= 102))) { /* a-f */
parser->pos = start;
return JSMN_ERROR_INVAL;
}
parser->pos++;
}
parser->pos--;
break;
/* Unexpected symbol */
default:
parser->pos = start;
return JSMN_ERROR_INVAL;
}
}
}
parser->pos = start;
return JSMN_ERROR_PART;
}
/**
* Parse JSON string and fill tokens.
*/
JSMN_API int
jsmn_parse(jsmn_parser* parser,
const char* js,
const size_t len,
jsmntok_t* tokens,
const unsigned int num_tokens) {
int r;
int i;
jsmntok_t* token;
int count = parser->toknext;
for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
char c;
jsmntype_t type;
c = js[parser->pos];
switch (c) {
case '{':
case '[':
count++;
if (tokens == NULL) {
break;
}
token = jsmn_alloc_token(parser, tokens, num_tokens);
if (token == NULL) {
return JSMN_ERROR_NOMEM;
}
if (parser->toksuper != -1) {
jsmntok_t* t = &tokens[parser->toksuper];
#ifdef JSMN_STRICT
/* In strict mode an object or array can't become a key */
if (t->type == JSMN_OBJECT) {
return JSMN_ERROR_INVAL;
}
#endif
t->size++;
#ifdef JSMN_PARENT_LINKS
token->parent = parser->toksuper;
#endif
}
token->type = (c == '{' ? JSMN_OBJECT : JSMN_ARRAY);
token->start = parser->pos;
parser->toksuper = parser->toknext - 1;
break;
case '}':
case ']':
if (tokens == NULL) {
break;
}
type = (c == '}' ? JSMN_OBJECT : JSMN_ARRAY);
#ifdef JSMN_PARENT_LINKS
if (parser->toknext < 1) {
return JSMN_ERROR_INVAL;
}
token = &tokens[parser->toknext - 1];
for (;;) {
if (token->start != -1 && token->end == -1) {
if (token->type != type) {
return JSMN_ERROR_INVAL;
}
token->end = parser->pos + 1;
parser->toksuper = token->parent;
break;
}
if (token->parent == -1) {
if (token->type != type || parser->toksuper == -1) {
return JSMN_ERROR_INVAL;
}
break;
}
token = &tokens[token->parent];
}
#else
for (i = parser->toknext - 1; i >= 0; i--) {
token = &tokens[i];
if (token->start != -1 && token->end == -1) {
if (token->type != type) {
return JSMN_ERROR_INVAL;
}
parser->toksuper = -1;
token->end = parser->pos + 1;
break;
}
}
/* Error if unmatched closing bracket */
if (i == -1) {
return JSMN_ERROR_INVAL;
}
for (; i >= 0; i--) {
token = &tokens[i];
if (token->start != -1 && token->end == -1) {
parser->toksuper = i;
break;
}
}
#endif
break;
case '\"':
r = jsmn_parse_string(parser, js, len, tokens, num_tokens);
if (r < 0) {
return r;
}
count++;
if (parser->toksuper != -1 && tokens != NULL) {
tokens[parser->toksuper].size++;
}
break;
case '\t':
case '\r':
case '\n':
case ' ':
break;
case ':':
parser->toksuper = parser->toknext - 1;
break;
case ',':
if (tokens != NULL && parser->toksuper != -1 &&
tokens[parser->toksuper].type != JSMN_ARRAY &&
tokens[parser->toksuper].type != JSMN_OBJECT) {
#ifdef JSMN_PARENT_LINKS
parser->toksuper = tokens[parser->toksuper].parent;
#else
for (i = parser->toknext - 1; i >= 0; i--) {
if (tokens[i].type == JSMN_ARRAY ||
tokens[i].type == JSMN_OBJECT) {
if (tokens[i].start != -1 && tokens[i].end == -1) {
parser->toksuper = i;
break;
}
}
}
#endif
}
break;
#ifdef JSMN_STRICT
/* In strict mode primitives are: numbers and booleans */
case '-':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case 't':
case 'f':
case 'n':
/* And they must not be keys of the object */
if (tokens != NULL && parser->toksuper != -1) {
const jsmntok_t* t = &tokens[parser->toksuper];
if (t->type == JSMN_OBJECT ||
(t->type == JSMN_STRING && t->size != 0)) {
return JSMN_ERROR_INVAL;
}
}
#else
/* In non-strict mode every unquoted value is a primitive */
default:
#endif
r = jsmn_parse_primitive(parser, js, len, tokens, num_tokens);
if (r < 0) {
return r;
}
count++;
if (parser->toksuper != -1 && tokens != NULL) {
tokens[parser->toksuper].size++;
}
break;
#ifdef JSMN_STRICT
/* Unexpected char in strict mode */
default:
return JSMN_ERROR_INVAL;
#endif
}
}
if (tokens != NULL) {
for (i = parser->toknext - 1; i >= 0; i--) {
/* Unmatched opened object or array */
if (tokens[i].start != -1 && tokens[i].end == -1) {
return JSMN_ERROR_PART;
}
}
}
return count;
}
/**
* Creates a new parser based over a given buffer with an array of tokens
* available.
*/
JSMN_API void
jsmn_init(jsmn_parser* parser) {
parser->pos = 0;
parser->toknext = 0;
parser->toksuper = -1;
}
#endif /* JSMN_HEADER */
#ifdef __cplusplus
}
#endif
#endif /* JSMN_H */
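The header above is the vendored jsmn tokenizer, forced into static linkage via JSMN_STATIC. A minimal usage sketch, assuming the header is saved as jsmn.h alongside it: the caller owns the token array, jsmn allocates nothing, and each token records only a type plus start/end offsets into the input, which is presumably what makes it suitable for recording key offsets and lengths into stored JSON rows.

#include <cstdio>
#include <cstring>
#include "jsmn.h"

int main() {
    const char* js = "{\"a\": 1, \"b\": [true, null]}";
    jsmn_parser p;
    jsmntok_t tokens[16];

    jsmn_init(&p);
    int n = jsmn_parse(&p, js, strlen(js), tokens, 16);
    if (n < 0) {
        // JSMN_ERROR_NOMEM / JSMN_ERROR_INVAL / JSMN_ERROR_PART
        return 1;
    }
    for (int i = 0; i < n; i++) {
        // Each token is just a (type, start, end) view into js.
        printf("token %d: type=%d [%.*s]\n",
               i,
               tokens[i].type,
               tokens[i].end - tokens[i].start,
               js + tokens[i].start);
    }
}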

View File

@ -176,6 +176,7 @@ class QueryContext : public Context {
    const milvus::segcore::SegmentInternalInterface* segment,
    int64_t active_count,
    milvus::Timestamp timestamp,
    int32_t consistency_level = 0,
    std::shared_ptr<QueryConfig> query_config =
        std::make_shared<QueryConfig>(),
    folly::Executor* executor = nullptr,
@ -187,7 +188,8 @@ class QueryContext : public Context {
      active_count_(active_count),
      query_timestamp_(timestamp),
      query_config_(query_config),
      executor_(executor),
      consistency_level_(consistency_level) {
}
folly::Executor*
@ -270,6 +272,11 @@ class QueryContext : public Context {
    return std::move(retrieve_result_);
}
int32_t
get_consistency_level() {
return consistency_level_;
}
 private:
    folly::Executor* executor_;
    //folly::Executor::KeepAlive<> executor_keepalive_;
@ -291,6 +298,8 @@ class QueryContext : public Context {
    // used for store segment search/retrieve result
    milvus::SearchResult search_result_;
    milvus::RetrieveResult retrieve_result_;
    int32_t consistency_level_ = 0;
};
// Represent the state of one thread of query execution.
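QueryContext now threads the request's consistency level down to every compiled expression; the index paths later in this PR treat level 0 as strong consistency when calling FilterByPath. A reduced sketch of the plumbing, with illustrative stand-in types rather than the Milvus classes:

#include <cstdint>
#include <iostream>

// Stand-in for QueryContext: carries the request's consistency level.
struct QueryContextLite {
    int32_t consistency_level = 0;  // 0 is treated as strong consistency
    int32_t
    get_consistency_level() const {
        return consistency_level;
    }
};

// Stand-in for SegmentExpr: copies the level at construction time.
struct SegmentExprLite {
    int32_t consistency_level_;
    explicit SegmentExprLite(const QueryContextLite& ctx)
        : consistency_level_(ctx.get_consistency_level()) {
    }
    bool
    is_strong_consistency() const {
        return consistency_level_ == 0;
    }
};

int main() {
    QueryContextLite ctx{/*consistency_level=*/0};
    SegmentExprLite expr(ctx);
    std::cout << std::boolalpha << expr.is_strong_consistency()
              << "\n";  // true
}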

View File

@ -449,7 +449,8 @@ class PhyBinaryArithOpEvalRangeExpr : public SegmentExpr {
    const std::string& name,
    const segcore::SegmentInternalInterface* segment,
    int64_t active_count,
    int64_t batch_size,
    int32_t consistency_level)
    : SegmentExpr(std::move(input),
                  name,
                  segment,
@ -457,7 +458,8 @@ class PhyBinaryArithOpEvalRangeExpr : public SegmentExpr {
                  expr->column_.nested_path_,
                  DataType::NONE,
                  active_count,
                  batch_size,
                  consistency_level),
      expr_(expr) {
}

View File

@ -385,6 +385,9 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(EvalCtx& context) {
    const auto& bitmap_input = context.get_bitmap_input();
    auto* input = context.get_offset_input();
    FieldId field_id = expr_->column_.field_id_;
    if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
        return ExecRangeVisitorImplForJsonForIndex<ValueType>();
    }
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
@ -505,6 +508,246 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(EvalCtx& context) {
    return res_vec;
}
template <typename ValueType>
VectorPtr
PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJsonForIndex() {
using GetType = std::conditional_t<std::is_same_v<ValueType, std::string>,
std::string_view,
ValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
#define BinaryRangeJSONIndexCompare(cmp) \
do { \
auto val = json.at<GetType>(offset, size); \
if (val.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto val = json.at<double>(offset, size); \
return !val.error() && (cmp); \
} \
return false; \
} \
return (cmp); \
} while (false)
#define BinaryRangeJSONTypeCompare(cmp) \
do { \
if constexpr (std::is_same_v<GetType, std::string_view>) { \
if (type == uint8_t(milvus::index::JSONType::STRING)) { \
auto val = json.at_string(offset, size); \
return (cmp); \
} else { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, double>) { \
if (type == uint8_t(milvus::index::JSONType::INT64)) { \
auto val = \
std::stoll(std::string(json.at_string(offset, size))); \
return (cmp); \
} else if (type == uint8_t(milvus::index::JSONType::DOUBLE)) { \
auto val = \
std::stod(std::string(json.at_string(offset, size))); \
return (cmp); \
} else { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, int64_t>) { \
if (type == uint8_t(milvus::index::JSONType::INT64)) { \
auto val = \
std::stoll(std::string(json.at_string(offset, size))); \
return (cmp); \
} else if (type == uint8_t(milvus::index::JSONType::DOUBLE)) { \
auto val = \
std::stod(std::string(json.at_string(offset, size))); \
return (cmp); \
} else { \
return false; \
} \
} \
} while (false)
#define BinaryRangeJSONTypeCompareWithValue(cmp) \
do { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
float val = *reinterpret_cast<float*>(&value); \
return (cmp); \
} else { \
int64_t val = value; \
return (cmp); \
} \
} else if constexpr (std::is_same_v<GetType, double>) { \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
float val = *reinterpret_cast<float*>(&value); \
return (cmp); \
} else { \
int64_t val = value; \
return (cmp); \
} \
} else if constexpr (std::is_same_v<GetType, bool>) { \
bool val = *reinterpret_cast<bool*>(&value); \
return (cmp); \
} \
} while (false)
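// Annotation (not part of the original patch): the posting's 32-bit
// `value` payload is type-punned by the macro above. When the recorded
// JSONType is FLOAT, the raw bits are reinterpreted as a float via
// reinterpret_cast; for the integral types the payload is widened to
// int64_t before the range predicate `cmp` is evaluated.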
bool lower_inclusive = expr_->lower_inclusive_;
bool upper_inclusive = expr_->upper_inclusive_;
ValueType val1 = GetValueFromProto<ValueType>(expr_->lower_val_);
ValueType val2 = GetValueFromProto<ValueType>(expr_->upper_val_);
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment,
&field_id,
val1,
val2,
lower_inclusive,
upper_inclusive](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
if constexpr (std::is_same_v<GetType, int64_t>) {
if (type != uint8_t(milvus::index::JSONType::INT32) &&
type != uint8_t(milvus::index::JSONType::INT64) &&
type != uint8_t(milvus::index::JSONType::FLOAT) &&
type != uint8_t(milvus::index::JSONType::DOUBLE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType,
std::string_view>) {
if (type != uint8_t(milvus::index::JSONType::STRING) &&
type !=
uint8_t(milvus::index::JSONType::STRING_ESCAPE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType, double>) {
if (type != uint8_t(milvus::index::JSONType::INT32) &&
type != uint8_t(milvus::index::JSONType::INT64) &&
type != uint8_t(milvus::index::JSONType::FLOAT) &&
type != uint8_t(milvus::index::JSONType::DOUBLE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType, bool>) {
if (type != uint8_t(milvus::index::JSONType::BOOL)) {
return false;
}
}
if (lower_inclusive && upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::FLOAT)) {
BinaryRangeJSONTypeCompareWithValue(
static_cast<float>(val1) <= val &&
val <= static_cast<float>(val2));
} else {
BinaryRangeJSONTypeCompareWithValue(val1 <= val &&
val <= val2);
}
} else if (lower_inclusive && !upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::FLOAT)) {
BinaryRangeJSONTypeCompareWithValue(
static_cast<float>(val1) <= val &&
val < static_cast<float>(val2));
} else {
BinaryRangeJSONTypeCompareWithValue(val1 <= val &&
val < val2);
}
} else if (!lower_inclusive && upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::FLOAT)) {
BinaryRangeJSONTypeCompareWithValue(
static_cast<float>(val1) < val &&
val <= static_cast<float>(val2));
} else {
BinaryRangeJSONTypeCompareWithValue(val1 < val &&
val <= val2);
}
} else {
if (type == uint8_t(milvus::index::JSONType::FLOAT)) {
BinaryRangeJSONTypeCompareWithValue(
static_cast<float>(val1) < val &&
val < static_cast<float>(val2));
} else {
BinaryRangeJSONTypeCompareWithValue(val1 < val &&
val < val2);
}
}
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
if (lower_inclusive && upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::STRING) ||
type == uint8_t(milvus::index::JSONType::DOUBLE) ||
type == uint8_t(milvus::index::JSONType::INT64)) {
BinaryRangeJSONTypeCompare(val1 <= val && val <= val2);
} else {
BinaryRangeJSONIndexCompare(
val1 <= ValueType(val.value()) &&
ValueType(val.value()) <= val2);
}
} else if (lower_inclusive && !upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::STRING) ||
type == uint8_t(milvus::index::JSONType::DOUBLE) ||
type == uint8_t(milvus::index::JSONType::INT64)) {
BinaryRangeJSONTypeCompare(val1 <= val && val < val2);
} else {
BinaryRangeJSONIndexCompare(
val1 <= ValueType(val.value()) &&
ValueType(val.value()) < val2);
}
} else if (!lower_inclusive && upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::STRING) ||
type == uint8_t(milvus::index::JSONType::DOUBLE) ||
type == uint8_t(milvus::index::JSONType::INT64)) {
BinaryRangeJSONTypeCompare(val1 < val && val <= val2);
} else {
BinaryRangeJSONIndexCompare(
val1 < ValueType(val.value()) &&
ValueType(val.value()) <= val2);
}
} else {
if (type == uint8_t(milvus::index::JSONType::STRING) ||
type == uint8_t(milvus::index::JSONType::DOUBLE) ||
type == uint8_t(milvus::index::JSONType::INT64)) {
BinaryRangeJSONTypeCompare(val1 < val && val < val2);
} else {
BinaryRangeJSONIndexCompare(
val1 < ValueType(val.value()) &&
ValueType(val.value()) < val2);
}
}
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
template <typename ValueType>
VectorPtr
PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForArray(EvalCtx& context) {

View File

@ -245,7 +245,8 @@ class PhyBinaryRangeFilterExpr : public SegmentExpr {
    const std::string& name,
    const segcore::SegmentInternalInterface* segment,
    int64_t active_count,
    int64_t batch_size,
    int32_t consistency_level)
    : SegmentExpr(std::move(input),
                  name,
                  segment,
@ -253,7 +254,8 @@ class PhyBinaryRangeFilterExpr : public SegmentExpr {
                  expr->column_.nested_path_,
                  DataType::NONE,
                  active_count,
                  batch_size,
                  consistency_level),
      expr_(expr) {
}
@ -308,6 +310,10 @@ class PhyBinaryRangeFilterExpr : public SegmentExpr {
VectorPtr
ExecRangeVisitorImplForJson(EvalCtx& context);
template <typename ValueType>
VectorPtr
ExecRangeVisitorImplForJsonForIndex();
template <typename ValueType>
VectorPtr
ExecRangeVisitorImplForArray(EvalCtx& context);

View File

@ -98,6 +98,9 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    FieldId field_id = expr_->column_.field_id_;
    if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
        return EvalJsonExistsForDataSegmentForIndex();
    }
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
@ -159,5 +162,49 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(EvalCtx& context) {
    return res_vec;
}
VectorPtr
PhyExistsFilterExpr::EvalJsonExistsForDataSegmentForIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment, field_id, pointer](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
uint32_t value) {
return true;
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
} //namespace exec
} // namespace milvus

View File

@ -42,7 +42,8 @@ class PhyExistsFilterExpr : public SegmentExpr {
    const std::string& name,
    const segcore::SegmentInternalInterface* segment,
    int64_t active_count,
    int64_t batch_size,
    int32_t consistency_level)
    : SegmentExpr(std::move(input),
                  name,
                  segment,
@ -51,7 +52,8 @@ class PhyExistsFilterExpr : public SegmentExpr {
                  DataType::NONE,
                  active_count,
                  batch_size,
                  true,
                  consistency_level),
      expr_(expr) {
}
@ -80,6 +82,9 @@ class PhyExistsFilterExpr : public SegmentExpr {
VectorPtr
EvalJsonExistsForIndex();
VectorPtr
EvalJsonExistsForDataSegmentForIndex();
 private:
    std::shared_ptr<const milvus::expr::ExistsExpr> expr_;
};

View File

@ -154,7 +154,6 @@ CompileExpression(const expr::TypedExprPtr& expr,
                  const std::unordered_set<std::string>& flatten_candidates,
                  bool enable_constant_folding) {
    ExprPtr result;
    auto compiled_inputs = CompileInputs(expr, context, flatten_candidates);
    auto GetTypes = [](const std::vector<ExprPtr>& exprs) {
@ -183,7 +182,8 @@
            "PhyUnaryRangeFilterExpr",
            context->get_segment(),
            context->get_active_count(),
            context->query_config()->get_expr_batch_size(),
            context->get_consistency_level());
    } else if (auto casted_expr = std::dynamic_pointer_cast<
                   const milvus::expr::LogicalUnaryExpr>(expr)) {
        result = std::make_shared<PhyLogicalUnaryExpr>(
@ -197,7 +197,8 @@ CompileExpression(const expr::TypedExprPtr& expr,
            context->get_segment(),
            context->get_active_count(),
            context->get_query_timestamp(),
            context->query_config()->get_expr_batch_size(),
            context->get_consistency_level());
    } else if (auto casted_expr = std::dynamic_pointer_cast<
                   const milvus::expr::LogicalBinaryExpr>(expr)) {
        if (casted_expr->op_type_ ==
@ -220,7 +221,8 @@
            "PhyBinaryRangeFilterExpr",
            context->get_segment(),
            context->get_active_count(),
            context->query_config()->get_expr_batch_size(),
            context->get_consistency_level());
    } else if (auto casted_expr = std::dynamic_pointer_cast<
                   const milvus::expr::AlwaysTrueExpr>(expr)) {
        result = std::make_shared<PhyAlwaysTrueExpr>(
@ -238,7 +240,8 @@
            "PhyBinaryArithOpEvalRangeExpr",
            context->get_segment(),
            context->get_active_count(),
            context->query_config()->get_expr_batch_size(),
            context->get_consistency_level());
    } else if (auto casted_expr =
                   std::dynamic_pointer_cast<const milvus::expr::CompareExpr>(
                       expr)) {
@ -258,7 +261,8 @@
            "PhyExistsFilterExpr",
            context->get_segment(),
            context->get_active_count(),
            context->query_config()->get_expr_batch_size(),
            context->get_consistency_level());
    } else if (auto casted_expr = std::dynamic_pointer_cast<
                   const milvus::expr::JsonContainsExpr>(expr)) {
        result = std::make_shared<PhyJsonContainsFilterExpr>(
@ -267,7 +271,8 @@
            "PhyJsonContainsFilterExpr",
            context->get_segment(),
            context->get_active_count(),
            context->query_config()->get_expr_batch_size(),
            context->get_consistency_level());
    } else if (auto value_expr =
                   std::dynamic_pointer_cast<const milvus::expr::ValueExpr>(
                       expr)) {
@ -298,7 +303,8 @@
            "PhyNullExpr",
            context->get_segment(),
            context->get_active_count(),
            context->query_config()->get_expr_batch_size(),
            context->get_consistency_level());
    } else {
        PanicInfo(ExprInvalid, "unsupport expr: ", expr->ToString());
    }
@ -481,7 +487,8 @@ ConvertMultiOrToInExpr(std::vector<std::shared_ptr<Expr>>& exprs,
        query_context->get_segment(),
        query_context->get_active_count(),
        query_context->get_query_timestamp(),
        query_context->query_config()->get_expr_batch_size(),
        query_context->get_consistency_level());
}
inline void

View File

@ -31,7 +31,9 @@
#include "expr/ITypeExpr.h"
#include "log/Log.h"
#include "query/PlanProto.h"
#include "segcore/SegmentSealedImpl.h"
#include "segcore/SegmentInterface.h"
#include "segcore/SegmentGrowingImpl.h"
namespace milvus {
namespace exec {
@ -138,7 +140,9 @@ class SegmentExpr : public Expr {
    const DataType value_type,
    int64_t active_count,
    int64_t batch_size,
    int32_t consistency_level,
    bool allow_any_json_cast_type = false)
    : Expr(DataType::BOOL, std::move(input), name),
      segment_(segment),
      field_id_(field_id),
@ -146,7 +150,8 @@ class SegmentExpr : public Expr {
      value_type_(value_type),
      allow_any_json_cast_type_(allow_any_json_cast_type),
      active_count_(active_count),
      batch_size_(batch_size),
      consistency_level_(consistency_level) {
    size_per_chunk_ = segment_->size_per_chunk();
    AssertInfo(
        batch_size_ > 0,
@ -1219,6 +1224,23 @@ class SegmentExpr : public Expr {
        use_index_ = false;
    }
bool
CanUseJsonKeyIndex(FieldId field_id) const {
if (segment_->type() == SegmentType::Sealed) {
auto sealed_seg =
dynamic_cast<const segcore::SegmentSealed*>(segment_);
Assert(sealed_seg != nullptr);
if (sealed_seg->GetJsonKeyIndex(field_id) != nullptr) {
return true;
}
        } else if (segment_->type() == SegmentType::Growing) {
if (segment_->GetJsonKeyIndex(field_id) != nullptr) {
return true;
}
}
return false;
}
 protected:
    const segcore::SegmentInternalInterface* segment_;
    const FieldId field_id_;
@ -1255,6 +1277,7 @@ class SegmentExpr : public Expr {
    // Cache for text match.
    std::shared_ptr<TargetBitmap> cached_match_res_{nullptr};
    int32_t consistency_level_{0};
};
bool

View File

@ -259,6 +259,11 @@ PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    FieldId field_id = expr_->column_.field_id_;
    if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
        return ExecJsonContainsByKeyIndex<ExprValueType>();
    }
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
@ -349,10 +354,99 @@ PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) {
    return res_vec;
}
template <typename ExprValueType>
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsByKeyIndex() {
using GetType =
std::conditional_t<std::is_same_v<ExprValueType, std::string>,
std::string_view,
ExprValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
std::unordered_set<GetType> elements;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
if (!arg_inited_) {
arg_set_ = std::make_shared<SortVectorElement<GetType>>(expr_->vals_);
arg_inited_ = true;
}
if (arg_set_->Empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [this, segment, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
for (auto&& it : array) {
auto val = it.template get<GetType>();
if (val.error()) {
continue;
}
if (this->arg_set_->In(val.value())) {
return true;
}
}
return false;
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsArrayByKeyIndex();
}
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
@ -452,6 +546,85 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) {
    return res_vec;
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsArrayByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
std::vector<proto::plan::Array> elements;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
for (auto const& element : expr_->vals_) {
elements.emplace_back(GetValueFromProto<proto::plan::Array>(element));
}
if (elements.empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment, &elements, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
for (auto&& it : array) {
auto val = it.get_array();
if (val.error()) {
continue;
}
for (auto const& element : elements) {
if (CompareTwoJsonArray(val, element)) {
return true;
}
}
}
return false;
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
template <typename ExprValueType>
VectorPtr
PhyJsonContainsFilterExpr::ExecArrayContainsAll(EvalCtx& context) {
@ -519,7 +692,6 @@ PhyJsonContainsFilterExpr::ExecArrayContainsAll(EvalCtx& context) {
        }
        processed_cursor += size;
    };
    int64_t processed_size;
    if (has_offset_input_) {
        processed_size =
@ -550,6 +722,11 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) {
                           ExprValueType>;
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsAllByKeyIndex<ExprValueType>();
}
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
@ -643,10 +820,98 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) {
    return res_vec;
}
template <typename ExprValueType>
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllByKeyIndex() {
using GetType =
std::conditional_t<std::is_same_v<ExprValueType, std::string>,
std::string_view,
ExprValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
std::set<GetType> elements;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
for (auto const& element : expr_->vals_) {
elements.insert(GetValueFromProto<GetType>(element));
}
if (elements.empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment, &elements, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
std::set<GetType> tmp_elements(elements);
for (auto&& it : array) {
auto val = it.template get<GetType>();
if (val.error()) {
continue;
}
tmp_elements.erase(val.value());
if (tmp_elements.size() == 0) {
return true;
}
}
return tmp_elements.empty();
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsAllWithDiffTypeByKeyIndex();
}
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
@ -805,10 +1070,157 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) {
    return res_vec;
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffTypeByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
auto elements = expr_->vals_;
std::set<int> elements_index;
int i = 0;
for (auto& element : elements) {
elements_index.insert(i);
i++;
}
if (elements.empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment, &elements, &elements_index, &field_id](
bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
std::set<int> tmp_elements_index(elements_index);
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
for (auto&& it : array) {
int i = -1;
for (auto& element : elements) {
i++;
switch (element.val_case()) {
case proto::plan::GenericValue::kBoolVal: {
auto val = it.template get<bool>();
if (val.error()) {
continue;
}
if (val.value() == element.bool_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kInt64Val: {
auto val = it.template get<int64_t>();
if (val.error()) {
continue;
}
if (val.value() == element.int64_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kFloatVal: {
auto val = it.template get<double>();
if (val.error()) {
continue;
}
if (val.value() == element.float_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kStringVal: {
auto val = it.template get<std::string_view>();
if (val.error()) {
continue;
}
if (val.value() == element.string_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kArrayVal: {
auto val = it.get_array();
if (val.error()) {
continue;
}
if (CompareTwoJsonArray(val,
element.array_val())) {
tmp_elements_index.erase(i);
}
break;
}
default:
PanicInfo(
DataTypeInvalid,
fmt::format("unsupported data type {}",
element.val_case()));
}
if (tmp_elements_index.size() == 0) {
return true;
}
}
if (tmp_elements_index.size() == 0) {
return true;
}
}
return tmp_elements_index.size() == 0;
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsAllArrayByKeyIndex();
}
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
@ -914,10 +1326,97 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) {
    return res_vec;
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllArrayByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
std::vector<proto::plan::Array> elements;
for (auto const& element : expr_->vals_) {
elements.emplace_back(GetValueFromProto<proto::plan::Array>(element));
}
if (elements.empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment, &elements, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
std::set<int> exist_elements_index;
for (auto&& it : array) {
auto json_array = it.get_array();
if (json_array.error()) {
continue;
}
for (int index = 0; index < elements.size(); ++index) {
if (CompareTwoJsonArray(json_array, elements[index])) {
exist_elements_index.insert(index);
}
}
if (exist_elements_index.size() == elements.size()) {
return true;
}
}
return exist_elements_index.size() == elements.size();
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsWithDiffTypeByKeyIndex();
}
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
@ -1066,6 +1565,134 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) {
    return res_vec;
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffTypeByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
auto elements = expr_->vals_;
if (elements.empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment, &elements, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
// Note: array can only be iterated once
for (auto&& it : array) {
for (auto const& element : elements) {
switch (element.val_case()) {
case proto::plan::GenericValue::kBoolVal: {
auto val = it.template get<bool>();
if (val.error()) {
continue;
}
if (val.value() == element.bool_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kInt64Val: {
auto val = it.template get<int64_t>();
if (val.error()) {
continue;
}
if (val.value() == element.int64_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kFloatVal: {
auto val = it.template get<double>();
if (val.error()) {
continue;
}
if (val.value() == element.float_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kStringVal: {
auto val = it.template get<std::string_view>();
if (val.error()) {
continue;
}
if (val.value() == element.string_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kArrayVal: {
auto val = it.get_array();
if (val.error()) {
continue;
}
if (CompareTwoJsonArray(val,
element.array_val())) {
return true;
}
break;
}
default:
PanicInfo(
DataTypeInvalid,
fmt::format("unsupported data type {}",
element.val_case()));
}
}
}
return false;
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
VectorPtr
PhyJsonContainsFilterExpr::EvalArrayContainsForIndexSegment() {
    switch (expr_->column_.element_type_) {


@ -36,7 +36,8 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
                const std::string& name,
                const segcore::SegmentInternalInterface* segment,
                int64_t active_count,
                int64_t batch_size,
                int32_t consistency_level)
        : SegmentExpr(std::move(input),
                      name,
                      segment,
@ -44,7 +45,8 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
                      expr->column_.nested_path_,
                      DataType::NONE,
                      active_count,
                      batch_size,
                      consistency_level),
          expr_(expr) {
    }
@ -74,6 +76,10 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
    VectorPtr
    ExecJsonContains(EvalCtx& context);

    template <typename ExprValueType>
    VectorPtr
    ExecJsonContainsByKeyIndex();

    template <typename ExprValueType>
    VectorPtr
    ExecArrayContains(EvalCtx& context);
@ -82,6 +88,10 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
    VectorPtr
    ExecJsonContainsAll(EvalCtx& context);

    template <typename ExprValueType>
    VectorPtr
    ExecJsonContainsAllByKeyIndex();

    template <typename ExprValueType>
    VectorPtr
    ExecArrayContainsAll(EvalCtx& context);
@ -89,15 +99,27 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
    VectorPtr
    ExecJsonContainsArray(EvalCtx& context);

    VectorPtr
    ExecJsonContainsArrayByKeyIndex();

    VectorPtr
    ExecJsonContainsAllArray(EvalCtx& context);

    VectorPtr
    ExecJsonContainsAllArrayByKeyIndex();

    VectorPtr
    ExecJsonContainsAllWithDiffType(EvalCtx& context);

    VectorPtr
    ExecJsonContainsAllWithDiffTypeByKeyIndex();

    VectorPtr
    ExecJsonContainsWithDiffType(EvalCtx& context);

    VectorPtr
    ExecJsonContainsWithDiffTypeByKeyIndex();

    VectorPtr
    EvalArrayContainsForIndexSegment();


@ -35,7 +35,8 @@ class PhyNullExpr : public SegmentExpr {
                const std::string& name,
                const segcore::SegmentInternalInterface* segment,
                int64_t active_count,
                int64_t batch_size,
                int32_t consistency_level)
        : SegmentExpr(std::move(input),
                      name,
                      segment,
@ -43,7 +44,8 @@ class PhyNullExpr : public SegmentExpr {
                      expr->column_.nested_path_,
                      DataType::NONE,
                      active_count,
                      batch_size,
                      consistency_level),
          expr_(expr) {
    }


@ -539,6 +539,153 @@ PhyTermFilterExpr::ExecTermJsonVariableInField(EvalCtx& context) {
    return res_vec;
}
template <typename ValueType>
VectorPtr
PhyTermFilterExpr::ExecJsonInVariableByKeyIndex() {
using GetType = std::conditional_t<std::is_same_v<ValueType, std::string>,
std::string_view,
ValueType>;
auto real_batch_size = GetNextBatchSize();
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
if (!arg_inited_) {
arg_set_ = std::make_shared<SortVectorElement<ValueType>>(expr_->vals_);
if constexpr (std::is_same_v<GetType, double>) {
arg_set_float_ =
std::make_shared<SortVectorElement<float>>(expr_->vals_);
}
arg_inited_ = true;
}
if (arg_set_->Empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
auto vals = expr_->vals_;
Assert(index != nullptr);
auto filter_func = [this, segment, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
if constexpr (std::is_same_v<GetType, int64_t>) {
if (type != uint8_t(milvus::index::JSONType::INT32) &&
type != uint8_t(milvus::index::JSONType::INT64) &&
type != uint8_t(milvus::index::JSONType::FLOAT) &&
type != uint8_t(milvus::index::JSONType::DOUBLE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType,
std::string_view>) {
if (type != uint8_t(milvus::index::JSONType::STRING) &&
type !=
uint8_t(milvus::index::JSONType::STRING_ESCAPE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType, double>) {
if (type != uint8_t(milvus::index::JSONType::INT32) &&
type != uint8_t(milvus::index::JSONType::INT64) &&
type != uint8_t(milvus::index::JSONType::FLOAT) &&
type != uint8_t(milvus::index::JSONType::DOUBLE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType, bool>) {
if (type != uint8_t(milvus::index::JSONType::BOOL)) {
return false;
}
}
if constexpr (std::is_same_v<GetType, int64_t>) {
return this->arg_set_->In(value);
} else if constexpr (std::is_same_v<GetType, double>) {
float restoredValue = *reinterpret_cast<float*>(&value);
return this->arg_set_float_->In(restoredValue);
} else if constexpr (std::is_same_v<GetType, bool>) {
bool restoredValue = *reinterpret_cast<bool*>(&value);
return this->arg_set_->In(restoredValue);
                }
                // Only int/float/bool values are inline-encoded in a valid
                // posting; for any other GetType nothing can match here.
                return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
if (type == uint8_t(milvus::index::JSONType::STRING) ||
type == uint8_t(milvus::index::JSONType::DOUBLE) ||
type == uint8_t(milvus::index::JSONType::INT64)) {
if (type == uint8_t(milvus::index::JSONType::STRING)) {
if constexpr (std::is_same_v<GetType,
std::string_view>) {
auto val = json.at_string(offset, size);
return this->arg_set_->In(ValueType(val));
} else {
return false;
}
} else if (type ==
uint8_t(milvus::index::JSONType::DOUBLE)) {
if constexpr (std::is_same_v<GetType, double>) {
auto val = std::stod(
std::string(json.at_string(offset, size)));
return this->arg_set_->In(ValueType(val));
} else {
return false;
}
} else if (type ==
uint8_t(milvus::index::JSONType::INT64)) {
if constexpr (std::is_same_v<GetType, int64_t>) {
auto val = std::stoll(
std::string(json.at_string(offset, size)));
return this->arg_set_->In(ValueType(val));
} else {
return false;
}
}
} else {
auto val = json.at<GetType>(offset, size);
if (val.error()) {
return false;
}
return this->arg_set_->In(ValueType(val.value()));
}
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
template <typename ValueType>
VectorPtr
PhyTermFilterExpr::ExecTermJsonFieldInVariable(EvalCtx& context) {
@ -548,6 +695,9 @@ PhyTermFilterExpr::ExecTermJsonFieldInVariable(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonInVariableByKeyIndex<ValueType>();
}
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();


@ -57,7 +57,8 @@ class PhyTermFilterExpr : public SegmentExpr {
                const segcore::SegmentInternalInterface* segment,
                int64_t active_count,
                milvus::Timestamp timestamp,
                int64_t batch_size,
                int32_t consistency_level)
        : SegmentExpr(std::move(input),
                      name,
                      segment,
@ -67,7 +68,8 @@ class PhyTermFilterExpr : public SegmentExpr {
                          ? DataType::NONE
                          : FromValCase(expr->vals_[0].val_case()),
                      active_count,
                      batch_size,
                      consistency_level),
          expr_(expr),
          query_timestamp_(timestamp) {
    }
@ -137,6 +139,10 @@ class PhyTermFilterExpr : public SegmentExpr {
    VectorPtr
    ExecTermArrayFieldInVariable(EvalCtx& context);

    template <typename ValueType>
    VectorPtr
    ExecJsonInVariableByKeyIndex();

 private:
    std::shared_ptr<const milvus::expr::TermFilterExpr> expr_;
    milvus::Timestamp query_timestamp_;
@ -144,7 +150,9 @@ class PhyTermFilterExpr : public SegmentExpr {
    TargetBitmap cached_bits_;
    bool arg_inited_{false};
    std::shared_ptr<MultiElement> arg_set_;
    std::shared_ptr<MultiElement> arg_set_float_;
    SingleElement arg_val_;
    int32_t consistency_level_ = 0;
};
} //namespace exec
} // namespace milvus


@ -21,9 +21,9 @@
#include "common/type_c.h"
#include "log/Log.h"
#include <boost/regex.hpp>

namespace milvus {
namespace exec {

template <typename T>
bool
PhyUnaryRangeFilterExpr::CanUseIndexForArray() {
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecRangeVisitorImplJsonForIndex<ExprValueType>();
}
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
@ -898,6 +903,506 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(EvalCtx& context) {
    return res_vec;
}
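// Splits a JSON pointer at its first numeric segment, e.g. "/a/b/0/c" ->
// {"/a/b", "/0/c"}: the prefix is the key stored in the stats index, while
// the numeric remainder addresses an element inside a JSON array.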
std::pair<std::string, std::string>
PhyUnaryRangeFilterExpr::SplitAtFirstSlashDigit(std::string input) {
boost::regex rgx("/\\d+");
boost::smatch match;
if (boost::regex_search(input, match, rgx)) {
std::string firstPart = input.substr(0, match.position());
std::string secondPart = input.substr(match.position());
return {firstPart, secondPart};
} else {
return {input, ""};
}
}
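// Evaluates a unary range predicate through the JSON key stats index: the
// posting list for the key prefix supplies (row_id, type, offset/value)
// tuples, and the filter lambda below compares either the inline-encoded
// value or the raw JSON bytes at the recorded offset.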
template <typename ExprValueType>
VectorPtr
PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonForIndex() {
using GetType =
std::conditional_t<std::is_same_v<ExprValueType, std::string>,
std::string_view,
ExprValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointerpath = milvus::Json::pointer(expr_->column_.nested_path_);
auto pointerpair = SplitAtFirstSlashDigit(pointerpath);
std::string pointer = pointerpair.first;
std::string arrayIndex = pointerpair.second;
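// The macros below centralize the type dispatch for these comparisons: a
// value may arrive inline-encoded in the posting (int32/float/bool), as an
// offset into the raw JSON, or behind an additional array index.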
#define UnaryRangeJSONIndexCompare(cmp) \
do { \
auto x = json.at<GetType>(offset, size); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.at<double>(offset, size); \
return !x.error() && (cmp); \
} \
return false; \
} \
return (cmp); \
} while (false)
#define UnaryJSONTypeCompare(cmp) \
do { \
if constexpr (std::is_same_v<GetType, std::string_view>) { \
if (type == uint8_t(milvus::index::JSONType::STRING)) { \
auto x = json.at_string(offset, size); \
return (cmp); \
} else { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, double>) { \
if (type == uint8_t(milvus::index::JSONType::INT64)) { \
auto x = \
std::stoll(std::string(json.at_string(offset, size))); \
return (cmp); \
} else if (type == uint8_t(milvus::index::JSONType::DOUBLE)) { \
auto x = std::stod(std::string(json.at_string(offset, size))); \
return (cmp); \
} else { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, int64_t>) { \
if (type == uint8_t(milvus::index::JSONType::INT64)) { \
auto x = \
std::stoll(std::string(json.at_string(offset, size))); \
return (cmp); \
} else if (type == uint8_t(milvus::index::JSONType::DOUBLE)) { \
auto x = std::stod(std::string(json.at_string(offset, size))); \
return (cmp); \
} else { \
return false; \
} \
} \
} while (false)
#define UnaryJSONTypeCompareWithValue(cmp) \
do { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
float x = *reinterpret_cast<float*>(&value); \
return (cmp); \
} else { \
int64_t x = value; \
return (cmp); \
} \
} else if constexpr (std::is_same_v<GetType, double>) { \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
float x = *reinterpret_cast<float*>(&value); \
return (cmp); \
} else { \
int64_t x = value; \
return (cmp); \
} \
} else if constexpr (std::is_same_v<GetType, bool>) { \
bool x = *reinterpret_cast<bool*>(&value); \
return (cmp); \
} \
} while (false)
#define CompareValueWithOpType(type, value, val, op_type) \
switch (op_type) { \
case proto::plan::GreaterThan: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x > static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x > val); \
} \
break; \
case proto::plan::GreaterEqual: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x >= static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x >= val); \
} \
break; \
case proto::plan::LessThan: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x < static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x < val); \
} \
break; \
case proto::plan::LessEqual: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x <= static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x <= val); \
} \
break; \
case proto::plan::Equal: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x == static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x == val); \
} \
break; \
case proto::plan::NotEqual: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x != static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x != val); \
} \
break; \
default: \
return false; \
}
#define UnaryRangeJSONIndexCompareWithArrayIndex(cmp) \
do { \
if (type != uint8_t(milvus::index::JSONType::UNKNOWN)) { \
return false; \
} \
auto array = json.array_at(offset, size); \
if (array.error()) { \
return false; \
} \
auto value = array.at_pointer(arrayIndex); \
if (value.error()) { \
return false; \
} \
if constexpr (std::is_same_v<GetType, int64_t> || \
std::is_same_v<GetType, double>) { \
if (!value.is_number()) { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, std::string_view>) { \
if (!value.is_string()) { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, bool>) { \
if (!value.is_bool()) { \
return false; \
} \
} \
auto x = value.get<GetType>(); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = value.get<double>(); \
return !x.error() && (cmp); \
} \
} \
return (cmp); \
} while (false)
#define UnaryRangeJSONIndexCompareNotEqual(cmp) \
do { \
auto x = json.at<GetType>(offset, size); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.at<double>(offset, size); \
return x.error() || (cmp); \
} \
return true; \
} \
return (cmp); \
} while (false)
#define UnaryRangeJSONIndexCompareNotEqualWithArrayIndex(cmp) \
do { \
auto array = json.array_at(offset, size); \
if (array.error()) { \
return false; \
} \
auto value = array.at_pointer(arrayIndex); \
if (value.error()) { \
return false; \
} \
if constexpr (std::is_same_v<GetType, int64_t> || \
std::is_same_v<GetType, double>) { \
if (!value.is_number()) { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, std::string_view>) { \
if (!value.is_string()) { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, bool>) { \
if (!value.is_bool()) { \
return false; \
} \
} \
auto x = value.get<GetType>(); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = value.get<double>(); \
return x.error() || (cmp); \
} \
} \
return (cmp); \
} while (false)
#define CHECKISJSONTYPEWITHOFFSET(type) \
(type == uint8_t(milvus::index::JSONType::STRING) || \
type == uint8_t(milvus::index::JSONType::DOUBLE) || \
type == uint8_t(milvus::index::JSONType::INT64))
#define CHECKJSONTYPEISNUMBER(type) \
if ((type != uint8_t(milvus::index::JSONType::INT32)) && \
(type != uint8_t(milvus::index::JSONType::INT64)) && \
(type != uint8_t(milvus::index::JSONType::FLOAT)) && \
(type != uint8_t(milvus::index::JSONType::DOUBLE))) { \
return false; \
}
#define ISVALIDJSONTYPE(type, GetType) \
if constexpr (std::is_same_v<GetType, int64_t>) { \
CHECKJSONTYPEISNUMBER(type) \
} else if constexpr (std::is_same_v<GetType, std::string_view>) { \
if ((type != uint8_t(milvus::index::JSONType::STRING)) && \
(type != uint8_t(milvus::index::JSONType::STRING_ESCAPE))) { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, double>) { \
CHECKJSONTYPEISNUMBER(type) \
} else if constexpr (std::is_same_v<GetType, bool>) { \
if (type != uint8_t(milvus::index::JSONType::BOOL)) { \
return false; \
} \
}
ExprValueType val = GetValueFromProto<ExprValueType>(expr_->val_);
auto op_type = expr_->op_type_;
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
Assert(segment != nullptr);
auto filter_func = [segment,
field_id,
op_type,
val,
arrayIndex,
pointer](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
if (type == uint8_t(milvus::index::JSONType::UNKNOWN) ||
!arrayIndex.empty()) {
return false;
}
ISVALIDJSONTYPE(type, GetType);
switch (op_type) {
case proto::plan::GreaterThan:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::GreaterEqual:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::LessThan:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::LessEqual:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::Equal:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::NotEqual:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::PrefixMatch:
case proto::plan::Match:
default:
return false;
}
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
switch (op_type) {
case proto::plan::GreaterThan:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
ExprValueType(x.value()) > val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x > val);
} else {
UnaryRangeJSONIndexCompare(
ExprValueType(x.value()) > val);
}
}
}
case proto::plan::GreaterEqual:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
ExprValueType(x.value()) >= val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x >= val);
} else {
UnaryRangeJSONIndexCompare(
ExprValueType(x.value()) >= val);
}
}
}
case proto::plan::LessThan:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
ExprValueType(x.value()) < val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x < val);
} else {
UnaryRangeJSONIndexCompare(
ExprValueType(x.value()) < val);
}
}
}
case proto::plan::LessEqual:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
ExprValueType(x.value()) <= val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x <= val);
} else {
UnaryRangeJSONIndexCompare(
ExprValueType(x.value()) <= val);
}
}
}
case proto::plan::Equal:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
if (type !=
uint8_t(milvus::index::JSONType::UNKNOWN)) {
return false;
}
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
return CompareTwoJsonArray(array.value(), val);
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
ExprValueType(x.value()) == val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x == val);
} else {
UnaryRangeJSONIndexCompare(
ExprValueType(x.value()) == val);
}
}
}
case proto::plan::NotEqual:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
if (type !=
uint8_t(milvus::index::JSONType::UNKNOWN)) {
return false;
}
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
return !CompareTwoJsonArray(array.value(), val);
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareNotEqualWithArrayIndex(
ExprValueType(x.value()) != val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x != val);
} else {
UnaryRangeJSONIndexCompareNotEqual(
ExprValueType(x.value()) != val);
}
}
}
case proto::plan::PrefixMatch:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
milvus::query::Match(
ExprValueType(x.value()),
val,
op_type));
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(
milvus::query::Match(x, val, op_type));
} else {
UnaryRangeJSONIndexCompare(
milvus::query::Match(
ExprValueType(x.value()),
val,
op_type));
}
}
}
case proto::plan::Match:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
PatternMatchTranslator translator;
auto regex_pattern = translator(val);
RegexMatcher matcher(regex_pattern);
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
matcher(ExprValueType(x.value())));
} else {
UnaryRangeJSONIndexCompare(
matcher(ExprValueType(x.value())));
}
}
default:
return false;
}
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
template <typename T>
VectorPtr
PhyUnaryRangeFilterExpr::ExecRangeVisitorImpl(EvalCtx& context) {


@ -335,7 +335,8 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
                const std::string& name,
                const segcore::SegmentInternalInterface* segment,
                int64_t active_count,
                int64_t batch_size,
                int32_t consistency_level)
        : SegmentExpr(std::move(input),
                      name,
                      segment,
@ -343,7 +344,8 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
                      expr->column_.nested_path_,
                      FromValCase(expr->val_.val_case()),
                      active_count,
                      batch_size,
                      consistency_level),
          expr_(expr) {
    }
@ -411,6 +413,10 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
    VectorPtr
    ExecRangeVisitorImplJson(EvalCtx& context);

    template <typename ExprValueType>
    VectorPtr
    ExecRangeVisitorImplJsonForIndex();

    template <typename ExprValueType>
    VectorPtr
    ExecRangeVisitorImplArray(EvalCtx& context);
@ -442,6 +448,9 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
    VectorPtr
    ExecTextMatch();

    std::pair<std::string, std::string>
    SplitAtFirstSlashDigit(std::string input);

 private:
    std::shared_ptr<const milvus::expr::UnaryRangeFilterExpr> expr_;
    int64_t overflow_check_pos_{0};


@ -64,6 +64,16 @@ CompareTwoJsonArray(T arr1, const proto::plan::Array& arr2) {
                      simdjson::ondemand::value>>>) {
        json_array_length = arr1.size();
    }
if constexpr (std::is_same_v<
T,
simdjson::simdjson_result<simdjson::dom::array>>) {
json_array_length = arr1.size();
}
if constexpr (std::is_same_v<T, simdjson::dom::array>) {
json_array_length = arr1.size();
}
    if (arr2.array_size() != json_array_length) {
        return false;
    }


@ -218,7 +218,7 @@ InvertedIndexTantivy<T>::Load(milvus::tracer::TraceContext ctx,
    std::vector<std::string> null_offset_files;
    std::shared_ptr<FieldDataBase> null_offset_data;
    auto find_file = [&](const std::string& target) -> auto {
        return std::find_if(inverted_index_files.begin(),
                            inverted_index_files.end(),
                            [&](const std::string& filename) {


@ -0,0 +1,476 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <boost/uuid/random_generator.hpp>
#include <boost/uuid/uuid_io.hpp>
#include "index/JsonKeyStatsInvertedIndex.h"
#include "index/InvertedIndexUtil.h"
#include "index/Utils.h"
#include "storage/MmapManager.h"
namespace milvus::index {
constexpr const char* TMP_JSON_INVERTED_LOG_PREFIX =
"/tmp/milvus/json-key-inverted-index-log/";
void
JsonKeyStatsInvertedIndex::AddJSONEncodeValue(
const std::vector<std::string>& paths,
uint8_t flag,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t length,
int32_t value,
std::map<std::string, std::vector<int64_t>>& mp) {
std::string key = "";
if (!paths.empty()) {
key = std::string("/") + Join(paths, "/");
}
LOG_DEBUG(
"insert inverted key: {}, flag: {}, type: {}, row_id: {}, offset: "
"{}, length:{}, value:{}",
key,
flag,
type,
row_id,
offset,
length,
value);
int64_t combine_id = 0;
if (flag) {
combine_id = EncodeValue(flag, type, row_id, value);
} else {
combine_id = EncodeOffset(flag, type, row_id, offset, length);
}
mp[key].push_back(combine_id);
}
void
JsonKeyStatsInvertedIndex::AddInvertedRecord(
std::map<std::string, std::vector<int64_t>>& mp) {
for (auto& iter : mp) {
for (auto value : iter.second) {
wrapper_->add_array_data<std::string>(&iter.first, 1, value);
}
}
}
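// Depth-first traversal of the jsmn token stream: objects recurse per key,
// primitives and strings are typed and encoded, and arrays are recorded as a
// whole (their elements are re-parsed at query time via the stored offset).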
void
JsonKeyStatsInvertedIndex::TravelJson(
const char* json,
jsmntok* tokens,
int& index,
std::vector<std::string>& path,
int32_t offset,
std::map<std::string, std::vector<int64_t>>& mp) {
jsmntok current = tokens[0];
Assert(current.type != JSMN_UNDEFINED);
if (current.type == JSMN_OBJECT) {
if (!path.empty()) {
AddJSONEncodeValue(path,
0,
0,
offset,
current.start,
current.end - current.start,
0,
mp);
}
int j = 1;
for (int i = 0; i < current.size; i++) {
Assert(tokens[j].type == JSMN_STRING && tokens[j].size != 0);
std::string key(json + tokens[j].start,
tokens[j].end - tokens[j].start);
path.push_back(key);
j++;
int consumed = 0;
TravelJson(json, tokens + j, consumed, path, offset, mp);
path.pop_back();
j += consumed;
}
index = j;
} else if (current.type == JSMN_PRIMITIVE) {
std::string value(json + current.start, current.end - current.start);
auto type = getType(value);
if (type == JSONType::INT32) {
AddJSONEncodeValue(path,
1,
static_cast<uint8_t>(JSONType::INT32),
offset,
current.start,
current.end - current.start,
stoi(value),
mp);
} else if (type == JSONType::INT64) {
AddJSONEncodeValue(path,
0,
static_cast<uint8_t>(JSONType::INT64),
offset,
current.start,
current.end - current.start,
0,
mp);
} else if (type == JSONType::FLOAT) {
auto fvalue = stof(value);
int32_t valueBits = *reinterpret_cast<int32_t*>(&fvalue);
AddJSONEncodeValue(path,
1,
static_cast<uint8_t>(JSONType::FLOAT),
offset,
current.start,
current.end - current.start,
valueBits,
mp);
} else if (type == JSONType::DOUBLE) {
AddJSONEncodeValue(path,
0,
static_cast<uint8_t>(JSONType::DOUBLE),
offset,
current.start,
current.end - current.start,
0,
mp);
} else if (type == JSONType::BOOL) {
AddJSONEncodeValue(path,
1,
static_cast<uint8_t>(JSONType::BOOL),
offset,
current.start,
current.end - current.start,
value == "true" ? 1 : 0,
mp);
}
index++;
} else if (current.type == JSMN_ARRAY) {
AddJSONEncodeValue(path,
0,
static_cast<uint8_t>(JSONType::UNKNOWN),
offset,
current.start,
current.end - current.start,
0,
mp);
// skip array parse
int count = current.size;
int j = 1;
while (count > 0) {
count--;
if (tokens[j].size != 0) {
count += tokens[j].size;
}
j++;
}
index = j;
} else if (current.type == JSMN_STRING) {
Assert(current.size == 0);
std::string value(json + current.start, current.end - current.start);
if (has_escape_sequence(value)) {
AddJSONEncodeValue(path,
0,
static_cast<uint8_t>(JSONType::STRING_ESCAPE),
offset,
current.start - 1,
current.end - current.start + 2,
0,
mp);
} else {
AddJSONEncodeValue(path,
0,
static_cast<uint8_t>(JSONType::STRING),
offset,
current.start,
current.end - current.start,
0,
mp);
}
index++;
}
}
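// Parses a single JSON document with jsmn, growing the token buffer on
// JSMN_ERROR_NOMEM, then indexes every key path the document contains.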
void
JsonKeyStatsInvertedIndex::AddJson(
const char* json,
int64_t offset,
std::map<std::string, std::vector<int64_t>>& mp) {
jsmn_parser parser;
jsmntok_t* tokens = (jsmntok_t*)malloc(16 * sizeof(jsmntok_t));
if (!tokens) {
PanicInfo(ErrorCode::UnexpectedError, "alloc jsmn token failed");
return;
}
int num_tokens = 0;
int token_capacity = 16;
jsmn_init(&parser);
while (1) {
int r = jsmn_parse(&parser, json, strlen(json), tokens, token_capacity);
if (r < 0) {
if (r == JSMN_ERROR_NOMEM) {
// Reallocate tokens array if not enough space
token_capacity *= 2;
tokens = (jsmntok_t*)realloc(
tokens, token_capacity * sizeof(jsmntok_t));
if (!tokens) {
PanicInfo(ErrorCode::UnexpectedError, "realloc failed");
}
continue;
} else {
free(tokens);
PanicInfo(ErrorCode::UnexpectedError,
"Failed to parse Json: {}, error: {}",
json,
int(r));
}
}
num_tokens = r;
break;
}
int index = 0;
std::vector<std::string> paths;
TravelJson(json, tokens, index, paths, offset, mp);
free(tokens);
}
JsonKeyStatsInvertedIndex::JsonKeyStatsInvertedIndex(
const storage::FileManagerContext& ctx,
bool is_load,
int64_t json_stats_tantivy_memory_budget,
uint32_t tantivy_index_version)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
LOG_INFO("json_stats_tantivy_memory_budget:{}",
json_stats_tantivy_memory_budget);
schema_ = ctx.fieldDataMeta.field_schema;
field_id_ = ctx.fieldDataMeta.field_id;
mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);
if (is_load) {
auto prefix = disk_file_manager_->GetLocalJsonKeyIndexPrefix();
path_ = prefix;
} else {
auto prefix = disk_file_manager_->GetJsonKeyIndexIdentifier();
path_ = std::string(TMP_JSON_INVERTED_LOG_PREFIX) + prefix;
boost::filesystem::create_directories(path_);
std::string field_name =
std::to_string(disk_file_manager_->GetFieldDataMeta().field_id);
d_type_ = TantivyDataType::Keyword;
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field_name.c_str(),
d_type_,
path_.c_str(),
tantivy_index_version,
false,
false,
1,
json_stats_tantivy_memory_budget);
}
}
JsonKeyStatsInvertedIndex::JsonKeyStatsInvertedIndex(
int64_t commit_interval_in_ms, const char* unique_id)
: commit_interval_in_ms_(commit_interval_in_ms),
last_commit_time_(stdclock::now()) {
d_type_ = TantivyDataType::Keyword;
wrapper_ = std::make_shared<TantivyIndexWrapper>(
unique_id, d_type_, "", TANTIVY_INDEX_LATEST_VERSION, false, true);
}
JsonKeyStatsInvertedIndex::JsonKeyStatsInvertedIndex(
int64_t commit_interval_in_ms,
const char* unique_id,
const std::string& path)
: commit_interval_in_ms_(commit_interval_in_ms),
last_commit_time_(stdclock::now()) {
boost::filesystem::path prefix = path;
boost::filesystem::path sub_path = unique_id;
path_ = (prefix / sub_path).string();
boost::filesystem::create_directories(path_);
d_type_ = TantivyDataType::Keyword;
wrapper_ = std::make_shared<TantivyIndexWrapper>(
unique_id, d_type_, path_.c_str(), TANTIVY_INDEX_LATEST_VERSION);
}
IndexStatsPtr
JsonKeyStatsInvertedIndex::Upload(const Config& config) {
finish();
boost::filesystem::path p(path_);
boost::filesystem::directory_iterator end_iter;
for (boost::filesystem::directory_iterator iter(p); iter != end_iter;
iter++) {
if (boost::filesystem::is_directory(*iter)) {
LOG_WARN("{} is a directory", iter->path().string());
} else {
LOG_INFO("trying to add json key inverted index log: {}",
iter->path().string());
AssertInfo(
disk_file_manager_->AddJsonKeyIndexLog(iter->path().string()),
"failed to add json key inverted index log: {}",
iter->path().string());
LOG_INFO("json key inverted index log: {} added",
iter->path().string());
}
}
auto remote_paths_to_size = disk_file_manager_->GetRemotePathsToFileSize();
auto binary_set = Serialize(config);
mem_file_manager_->AddFile(binary_set);
auto remote_mem_path_to_size =
mem_file_manager_->GetRemotePathsToFileSize();
std::vector<SerializedIndexFileInfo> index_files;
index_files.reserve(remote_paths_to_size.size() +
remote_mem_path_to_size.size());
for (auto& file : remote_paths_to_size) {
index_files.emplace_back(disk_file_manager_->GetFileName(file.first),
file.second);
}
for (auto& file : remote_mem_path_to_size) {
index_files.emplace_back(file.first, file.second);
}
return IndexStats::New(mem_file_manager_->GetAddedTotalMemSize() +
disk_file_manager_->GetAddedTotalFileSize(),
std::move(index_files));
}
void
JsonKeyStatsInvertedIndex::Load(milvus::tracer::TraceContext ctx,
const Config& config) {
auto index_files =
GetValueFromConfig<std::vector<std::string>>(config, "index_files");
AssertInfo(index_files.has_value(),
"index file paths is empty when load json key index");
for (auto& index_file : index_files.value()) {
boost::filesystem::path p(index_file);
if (!p.has_parent_path()) {
auto remote_prefix =
disk_file_manager_->GetRemoteJsonKeyLogPrefix();
index_file = remote_prefix + "/" + index_file;
}
}
disk_file_manager_->CacheJsonKeyIndexToDisk(index_files.value());
AssertInfo(
tantivy_index_exist(path_.c_str()), "index not exist: {}", path_);
wrapper_ = std::make_shared<TantivyIndexWrapper>(path_.c_str());
LOG_INFO("load json key index done for field id:{} with dir:{}",
field_id_,
path_);
}
void
JsonKeyStatsInvertedIndex::BuildWithFieldData(
const std::vector<FieldDataPtr>& field_datas) {
AssertInfo(schema_.data_type() == proto::schema::DataType::JSON,
"schema data type is {}",
schema_.data_type());
BuildWithFieldData(field_datas, schema_.nullable());
}
void
JsonKeyStatsInvertedIndex::BuildWithFieldData(
const std::vector<FieldDataPtr>& field_datas, bool nullable) {
int64_t offset = 0;
std::map<std::string, std::vector<int64_t>> mp;
if (nullable) {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
for (int i = 0; i < n; i++) {
if (!data->is_valid(i)) {
continue;
}
AddJson(static_cast<const milvus::Json*>(data->RawValue(i))
->data()
.data(),
offset++,
mp);
}
}
} else {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
for (int i = 0; i < n; i++) {
AddJson(static_cast<const milvus::Json*>(data->RawValue(i))
->data()
.data(),
offset++,
mp);
}
}
}
AddInvertedRecord(mp);
LOG_INFO("build json key index done for field id:{}", field_id_);
}
void
JsonKeyStatsInvertedIndex::AddJSONDatas(size_t n,
const std::string* jsonDatas,
const bool* valids,
int64_t offset_begin) {
std::map<std::string, std::vector<int64_t>> mp;
for (int i = 0; i < n; i++) {
auto offset = i + offset_begin;
if (valids != nullptr && !valids[i]) {
continue;
}
AddJson(jsonDatas[i].c_str(), offset, mp);
}
AddInvertedRecord(mp);
is_data_uncommitted_ = true;
LOG_INFO("build json key index done for AddJSONDatas");
if (shouldTriggerCommit()) {
Commit();
}
}
void
JsonKeyStatsInvertedIndex::Finish() {
finish();
}
bool
JsonKeyStatsInvertedIndex::shouldTriggerCommit() {
auto span = (std::chrono::duration<double, std::milli>(
stdclock::now() - last_commit_time_.load()))
.count();
return span > commit_interval_in_ms_;
}
void
JsonKeyStatsInvertedIndex::Commit() {
std::unique_lock<std::mutex> lck(mtx_, std::defer_lock);
if (lck.try_lock()) {
is_data_uncommitted_ = false;
wrapper_->commit();
last_commit_time_.store(stdclock::now());
}
}
void
JsonKeyStatsInvertedIndex::Reload() {
std::unique_lock<std::mutex> lck(mtx_, std::defer_lock);
if (lck.try_lock()) {
wrapper_->reload();
}
}
void
JsonKeyStatsInvertedIndex::CreateReader() {
wrapper_->create_reader();
}
} // namespace milvus::index


@ -0,0 +1,298 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <string>
#include <boost/filesystem.hpp>
#include "index/InvertedIndexTantivy.h"
#include "common/jsmn.h"
namespace milvus::index {
enum class JSONType {
UNKNOWN,
BOOL,
INT32,
INT64,
FLOAT,
DOUBLE,
STRING,
STRING_ESCAPE
};
using stdclock = std::chrono::high_resolution_clock;
class JsonKeyStatsInvertedIndex : public InvertedIndexTantivy<std::string> {
public:
explicit JsonKeyStatsInvertedIndex(
const storage::FileManagerContext& ctx,
bool is_load,
int64_t json_stats_tantivy_memory_budget = 16777216,
uint32_t tantivy_index_version = TANTIVY_INDEX_LATEST_VERSION);
explicit JsonKeyStatsInvertedIndex(int64_t commit_interval_in_ms,
const char* unique_id);
explicit JsonKeyStatsInvertedIndex(int64_t commit_interval_in_ms,
const char* unique_id,
const std::string& path);
    ~JsonKeyStatsInvertedIndex() override {}
public:
IndexStatsPtr
Upload(const Config& config) override;
void
Load(milvus::tracer::TraceContext ctx, const Config& config) override;
void
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;
void
BuildWithFieldData(const std::vector<FieldDataPtr>& datas, bool nullable);
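    // Scans the posting list of `path` and builds a bitmap over `row` rows.
    // The filter callback receives (valid, type, row_id, offset, size, value):
    // when `valid` is true the value was inline-encoded into the posting,
    // otherwise (offset, size) locate the value inside the raw JSON row.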
const TargetBitmap
FilterByPath(
const std::string& path,
int32_t row,
bool is_growing,
bool is_strong_consistency,
std::function<bool(
bool, uint8_t, uint32_t, uint16_t, uint16_t, int32_t)> filter) {
auto processArray = [this, &path, row, &filter]() {
TargetBitmap bitset(row);
auto array = wrapper_->term_query_i64(path);
LOG_INFO("json key filter size:{}", array.array_.len);
for (size_t j = 0; j < array.array_.len; j++) {
auto the_offset = array.array_.array[j];
if (DecodeValid(the_offset)) {
auto tuple = DecodeValue(the_offset);
auto row_id = std::get<1>(tuple);
if (row_id >= row) {
continue;
}
bitset[row_id] = filter(true,
std::get<0>(tuple),
std::get<1>(tuple),
0,
0,
std::get<2>(tuple));
} else {
auto tuple = DecodeOffset(the_offset);
auto row_id = std::get<1>(tuple);
if (row_id >= row) {
continue;
}
bitset[row_id] = filter(false,
std::get<0>(tuple),
std::get<1>(tuple),
std::get<2>(tuple),
std::get<3>(tuple),
0);
}
}
return bitset;
};
if (is_growing) {
if (shouldTriggerCommit() || is_strong_consistency) {
if (is_data_uncommitted_) {
Commit();
}
Reload();
return processArray();
} else {
return processArray();
}
} else {
return processArray();
}
}
void
AddJSONDatas(size_t n,
const std::string* jsonDatas,
const bool* valids,
int64_t offset_begin);
void
Finish();
void
Commit();
void
Reload();
void
CreateReader();
bool
has_escape_sequence(const std::string& str) {
for (size_t i = 0; i < str.size(); ++i) {
if (str[i] == '\\' && i + 1 < str.size()) {
char next = str[i + 1];
if (next == 'n' || next == 't' || next == 'r' || next == 'b' ||
next == 'f' || next == 'v' || next == '\\' ||
next == '\"' || next == '\'' || next == '0' ||
next == 'u' || next == '/') {
return true;
}
}
}
return false;
}
private:
void
AddJson(const char* json,
int64_t offset,
std::map<std::string, std::vector<int64_t>>& mp);
void
TravelJson(const char* json,
jsmntok* tokens,
int& index,
std::vector<std::string>& path,
int32_t offset,
std::map<std::string, std::vector<int64_t>>& mp);
void
AddJSONEncodeValue(const std::vector<std::string>& paths,
uint8_t flag,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t length,
int32_t value,
std::map<std::string, std::vector<int64_t>>& mp);
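    // Posting layout (64 bits): flag(1) | type(3) | row_id(28) | low 32 bits.
    // flag == 1: the low 32 bits hold the value itself (int32/float bits/bool);
    // flag == 0: they hold (row_offset(16) | size(16)) into the raw JSON.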
int64_t
EncodeOffset(uint8_t flag,
uint8_t type,
uint32_t row_id,
uint16_t row_offset,
uint16_t size) {
row_id &= 0x0FFFFFFF;
return static_cast<int64_t>(flag) << 63 |
static_cast<int64_t>(type) << 60 |
static_cast<int64_t>(row_id) << 32 |
static_cast<int64_t>(row_offset) << 16 |
static_cast<int64_t>(size);
}
int64_t
EncodeValue(uint8_t flag, uint8_t type, uint32_t row_id, int32_t value) {
row_id &= 0x0FFFFFFF;
return static_cast<int64_t>(flag) << 63 |
static_cast<int64_t>(type) << 60 |
static_cast<int64_t>(row_id) << 32 |
static_cast<uint32_t>(value);
}
bool
DecodeValid(int64_t encode_offset) {
return (encode_offset >> 63) & 1;
}
std::tuple<uint8_t, uint32_t, int32_t>
DecodeValue(int64_t encode_offset) {
uint8_t type = (encode_offset >> 60) & 0x7;
uint32_t row_id = (encode_offset >> 32) & 0x0FFFFFFF;
int32_t value = static_cast<int32_t>(encode_offset & 0xFFFFFFFF);
return std::make_tuple(type, row_id, value);
}
std::tuple<uint8_t, uint32_t, uint16_t, uint16_t>
DecodeOffset(int64_t encode_offset) {
uint8_t type = (encode_offset >> 60) & 0x7;
uint32_t row_id = (encode_offset >> 32) & 0x0FFFFFFF;
uint16_t row_offset = (encode_offset >> 16) & 0xFFFF;
uint16_t size = encode_offset & 0xFFFF;
return std::make_tuple(type, row_id, row_offset, size);
}
bool
shouldTriggerCommit();
bool
isBoolean(const std::string& str) {
return str == "true" || str == "false";
}
bool
isInt32(const std::string& str) {
std::istringstream iss(str);
int64_t num;
iss >> num;
return !iss.fail() && iss.eof() &&
num >= std::numeric_limits<int32_t>::min() &&
num <= std::numeric_limits<int32_t>::max();
}
bool
isInt64(const std::string& str) {
std::istringstream iss(str);
int64_t num;
iss >> num;
return !iss.fail() && iss.eof();
}
bool
isFloat(const std::string& str) {
try {
float d = std::stof(str);
return true;
} catch (...) {
return false;
}
}
bool
isDouble(const std::string& str) {
try {
double d = std::stod(str);
return true;
} catch (...) {
return false;
}
}
JSONType
getType(const std::string& str) {
if (isBoolean(str)) {
return JSONType::BOOL;
} else if (isInt32(str)) {
return JSONType::INT32;
} else if (isInt64(str)) {
return JSONType::INT64;
} else if (isFloat(str)) {
return JSONType::FLOAT;
} else if (isDouble(str)) {
return JSONType::DOUBLE;
}
return JSONType::UNKNOWN;
}
void
AddInvertedRecord(std::map<std::string, std::vector<int64_t>>& mp);
private:
int64_t field_id_;
mutable std::mutex mtx_;
std::atomic<stdclock::time_point> last_commit_time_;
int64_t commit_interval_in_ms_;
std::atomic<bool> is_data_uncommitted_ = false;
};
} // namespace milvus::index


@ -34,6 +34,7 @@
#include "pb/index_cgo_msg.pb.h"
#include "storage/Util.h"
#include "index/Meta.h"
#include "index/JsonKeyStatsInvertedIndex.h"

using namespace milvus;

CStatus
@ -225,6 +226,81 @@ CreateIndex(CIndex* res_index,
    }
}
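// CGO entry point: deserializes the BuildIndexInfo, builds the JSON key
// stats inverted index for the segment, and uploads the resulting files.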
CStatus
BuildJsonKeyIndex(ProtoLayoutInterface result,
const uint8_t* serialized_build_index_info,
const uint64_t len) {
try {
auto build_index_info =
std::make_unique<milvus::proto::indexcgo::BuildIndexInfo>();
auto res =
build_index_info->ParseFromArray(serialized_build_index_info, len);
AssertInfo(res, "Unmarshall build index info failed");
auto field_type =
static_cast<DataType>(build_index_info->field_schema().data_type());
auto storage_config =
get_storage_config(build_index_info->storage_config());
auto config = get_config(build_index_info);
// init file manager
milvus::storage::FieldDataMeta field_meta{
build_index_info->collectionid(),
build_index_info->partitionid(),
build_index_info->segmentid(),
build_index_info->field_schema().fieldid(),
build_index_info->field_schema()};
milvus::storage::IndexMeta index_meta{
build_index_info->segmentid(),
build_index_info->field_schema().fieldid(),
build_index_info->buildid(),
build_index_info->index_version(),
"",
build_index_info->field_schema().name(),
field_type,
build_index_info->dim(),
};
uint32_t tantivy_index_version =
milvus::index::GetValueFromConfig<int32_t>(
config, milvus::index::TANTIVY_INDEX_VERSION)
.value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION);
auto chunk_manager =
milvus::storage::CreateChunkManager(storage_config);
milvus::storage::FileManagerContext fileManagerContext(
field_meta, index_meta, chunk_manager);
auto field_schema =
FieldMeta::ParseFrom(build_index_info->field_schema());
auto index = std::make_unique<index::JsonKeyStatsInvertedIndex>(
fileManagerContext,
false,
build_index_info->json_key_stats_tantivy_memory(),
tantivy_index_version);
index->Build(config);
auto create_index_result = index->Upload(config);
create_index_result->SerializeAt(
reinterpret_cast<milvus::ProtoLayout*>(result));
auto status = CStatus();
status.error_code = Success;
status.error_msg = "";
return status;
} catch (SegcoreError& e) {
auto status = CStatus();
status.error_code = e.get_error_code();
status.error_msg = strdup(e.what());
return status;
} catch (std::exception& e) {
auto status = CStatus();
status.error_code = UnexpectedError;
status.error_msg = strdup(e.what());
return status;
}
}
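A hedged caller sketch for the new entry point: the cgo layer serializes a BuildIndexInfo proto and hands the raw bytes across the C boundary. result_layout and the ids below are placeholders, not values from this diff.
milvus::proto::indexcgo::BuildIndexInfo info;
info.set_collectionid(1);  // placeholder ids
info.set_partitionid(2);
info.set_segmentid(3);
// ... field_schema, storage_config, buildid, index_version, ...
std::string bytes = info.SerializeAsString();
CStatus status = BuildJsonKeyIndex(
    result_layout,  // ProtoLayoutInterface provided by the caller
    reinterpret_cast<const uint8_t*>(bytes.data()),
    bytes.size());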
CStatus CStatus
BuildTextIndex(ProtoLayoutInterface result, BuildTextIndex(ProtoLayoutInterface result,
const uint8_t* serialized_build_index_info, const uint8_t* serialized_build_index_info,

View File

@ -36,6 +36,11 @@ CreateIndex(CIndex* res_index,
CStatus CStatus
DeleteIndex(CIndex index); DeleteIndex(CIndex index);
CStatus
BuildJsonKeyIndex(ProtoLayoutInterface c_binary_set,
const uint8_t* serialized_build_index_info,
const uint64_t len);
CStatus CStatus
BuildTextIndex(ProtoLayoutInterface c_binary_set, BuildTextIndex(ProtoLayoutInterface c_binary_set,
const uint8_t* serialized_build_index_info, const uint8_t* serialized_build_index_info,

View File

@ -147,6 +147,12 @@ class ChunkedColumnBase : public ColumnBase {
"GetBatchBuffer only supported for VariableColumn"); "GetBatchBuffer only supported for VariableColumn");
} }
virtual std::string_view
RawAt(const size_t i) const {
PanicInfo(ErrorCode::Unsupported,
"RawAt only supported for VariableColumn");
}
virtual std::pair<std::vector<std::string_view>, FixedVector<bool>> virtual std::pair<std::vector<std::string_view>, FixedVector<bool>>
StringViews(int64_t chunk_id, StringViews(int64_t chunk_id,
std::optional<std::pair<int64_t, int64_t>> offset_len) const { std::optional<std::pair<int64_t, int64_t>> offset_len) const {
@ -387,7 +393,7 @@ class ChunkedVariableColumn : public ChunkedColumnBase {
} }
std::string_view std::string_view
RawAt(const int i) const { RawAt(const size_t i) const {
return std::string_view((*this)[i]); return std::string_view((*this)[i]);
} }
}; };

View File

@ -333,6 +333,12 @@ class SingleChunkColumnBase : public ColumnBase {
"viewsbyoffsets only supported for VariableColumn"); "viewsbyoffsets only supported for VariableColumn");
} }
virtual std::string_view
RawAt(const size_t i) const {
PanicInfo(ErrorCode::Unsupported,
"RawAt only supported for VariableColumn");
}
virtual void virtual void
AppendBatch(const FieldDataPtr data) override { AppendBatch(const FieldDataPtr data) override {
size_t required_size = data_size_ + data->DataSize(); size_t required_size = data_size_ + data->DataSize();
@ -801,7 +807,7 @@ class SingleChunkVariableColumn : public SingleChunkColumnBase {
} }
std::string_view std::string_view
RawAt(const int i) const { RawAt(const size_t i) const {
return std::string_view((*this)[i]); return std::string_view((*this)[i]);
} }

View File

@ -33,10 +33,12 @@ class ExecPlanNodeVisitor : PlanNodeVisitor {
public: public:
ExecPlanNodeVisitor(const segcore::SegmentInterface& segment, ExecPlanNodeVisitor(const segcore::SegmentInterface& segment,
Timestamp timestamp, Timestamp timestamp,
const PlaceholderGroup& placeholder_group) const PlaceholderGroup& placeholder_group,
int32_t consistency_level)
: segment_(segment), : segment_(segment),
timestamp_(timestamp), timestamp_(timestamp),
placeholder_group_(placeholder_group) { placeholder_group_(placeholder_group),
consistency_level_(consistency_level) {
} }
SearchResult SearchResult
@ -60,6 +62,7 @@ class ExecPlanNodeVisitor : PlanNodeVisitor {
const PlaceholderGroup& placeholder_group_; const PlaceholderGroup& placeholder_group_;
SearchResultOpt search_result_opt_; SearchResultOpt search_result_opt_;
int32_t consistency_level_ = 0;
}; };
} // namespace impl } // namespace impl
@ -80,7 +83,6 @@ ExecPlanNodeVisitor::ExecuteTask(
plan.plan_node_->ToString(), plan.plan_node_->ToString(),
query_context->get_active_count(), query_context->get_active_count(),
query_context->get_query_timestamp()); query_context->get_query_timestamp());
auto task = auto task =
milvus::exec::Task::Create(DEFAULT_TASK_ID, plan, 0, query_context); milvus::exec::Task::Create(DEFAULT_TASK_ID, plan, 0, query_context);
int64_t processed_num = 0; int64_t processed_num = 0;
@ -127,8 +129,12 @@ ExecPlanNodeVisitor::VectorVisitorImpl(VectorPlanNode& node) {
auto plan = plan::PlanFragment(node.plannodes_); auto plan = plan::PlanFragment(node.plannodes_);
// Set query context // Set query context
auto query_context = std::make_shared<milvus::exec::QueryContext>( auto query_context =
DEAFULT_QUERY_ID, segment, active_count, timestamp_); std::make_shared<milvus::exec::QueryContext>(DEAFULT_QUERY_ID,
segment,
active_count,
timestamp_,
consystency_level_);
query_context->set_search_info(node.search_info_); query_context->set_search_info(node.search_info_);
query_context->set_placeholder_group(placeholder_group_); query_context->set_placeholder_group(placeholder_group_);
@ -178,8 +184,12 @@ ExecPlanNodeVisitor::visit(RetrievePlanNode& node) {
auto plan = plan::PlanFragment(node.plannodes_); auto plan = plan::PlanFragment(node.plannodes_);
// Set query context // Set query context
auto query_context = std::make_shared<milvus::exec::QueryContext>( auto query_context =
DEAFULT_QUERY_ID, segment, active_count, timestamp_); std::make_shared<milvus::exec::QueryContext>(DEAFULT_QUERY_ID,
segment,
active_count,
timestamp_,
consystency_level_);
// Do task execution // Do task execution
auto bitset_holder = ExecuteTask(plan, query_context); auto bitset_holder = ExecuteTask(plan, query_context);

View File

@ -46,15 +46,20 @@ class ExecPlanNodeVisitor : public PlanNodeVisitor {
public: public:
ExecPlanNodeVisitor(const segcore::SegmentInterface& segment, ExecPlanNodeVisitor(const segcore::SegmentInterface& segment,
Timestamp timestamp, Timestamp timestamp,
const PlaceholderGroup* placeholder_group) const PlaceholderGroup* placeholder_group,
int32_t consistency_level = 0)
: segment_(segment), : segment_(segment),
timestamp_(timestamp), timestamp_(timestamp),
placeholder_group_(placeholder_group) { placeholder_group_(placeholder_group),
consistency_level_(consistency_level) {
} }
ExecPlanNodeVisitor(const segcore::SegmentInterface& segment, ExecPlanNodeVisitor(const segcore::SegmentInterface& segment,
Timestamp timestamp) Timestamp timestamp,
: segment_(segment), timestamp_(timestamp) { int32_t consistency_level = 0)
: segment_(segment),
timestamp_(timestamp),
consistency_level_(consistency_level) {
placeholder_group_ = nullptr; placeholder_group_ = nullptr;
} }
@ -108,6 +113,7 @@ class ExecPlanNodeVisitor : public PlanNodeVisitor {
SearchResultOpt search_result_opt_; SearchResultOpt search_result_opt_;
RetrieveResultOpt retrieve_result_opt_; RetrieveResultOpt retrieve_result_opt_;
bool expr_use_pk_index_ = false; bool expr_use_pk_index_ = false;
int32_t consistency_level_ = 0;
}; };
// for test use only // for test use only

View File

@ -97,6 +97,31 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
void void
LoadTextIndex(FieldId field_id, LoadTextIndex(FieldId field_id,
std::unique_ptr<index::TextMatchIndex> index) override; std::unique_ptr<index::TextMatchIndex> index) override;
void
LoadJsonKeyIndex(
FieldId field_id,
std::unique_ptr<index::JsonKeyStatsInvertedIndex> index) override {
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
json_key_indexes_[field_id] = std::move(index);
}
index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const override {
std::shared_lock lck(mutex_);
auto iter = json_key_indexes_.find(field_id);
if (iter == json_key_indexes_.end()) {
return nullptr;
}
return iter->second.get();
}
std::pair<std::string_view, bool>
GetJsonData(FieldId field_id, size_t offset) const override {
auto column = fields_.at(field_id);
bool is_valid = column->IsValid(offset);
return std::make_pair(std::move(column->RawAt(offset)), is_valid);
}
public: public:
size_t size_t
@ -406,6 +431,10 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
// whether the segment is sorted by the pk // whether the segment is sorted by the pk
bool is_sorted_by_pk_ = false; bool is_sorted_by_pk_ = false;
// used for json expr optimization
std::unordered_map<FieldId,
std::unique_ptr<index::JsonKeyStatsInvertedIndex>>
json_key_indexes_;
}; };
} // namespace milvus::segcore } // namespace milvus::segcore

View File

@ -24,6 +24,7 @@
#include "common/EasyAssert.h" #include "common/EasyAssert.h"
#include "common/FieldData.h" #include "common/FieldData.h"
#include "common/Types.h" #include "common/Types.h"
#include "common/Common.h"
#include "fmt/format.h" #include "fmt/format.h"
#include "log/Log.h" #include "log/Log.h"
#include "nlohmann/json.hpp" #include "nlohmann/json.hpp"
@ -170,6 +171,33 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
reserved_offset); reserved_offset);
} }
// build JSON key stats for the inserted JSON data.
if (field_meta.enable_growing_jsonStats()) {
std::vector<std::string> jsonDatas(
insert_record_proto->fields_data(data_offset)
.scalars()
.json_data()
.data()
.begin(),
insert_record_proto->fields_data(data_offset)
.scalars()
.json_data()
.data()
.end());
FixedVector<bool> jsonDatas_valid_data(
insert_record_proto->fields_data(data_offset)
.valid_data()
.begin(),
insert_record_proto->fields_data(data_offset)
.valid_data()
.end());
AddJSONDatas(field_id,
jsonDatas.data(),
jsonDatas_valid_data.data(),
num_rows,
reserved_offset);
}
// update average row data size // update average row data size
auto field_data_size = GetRawDataSizeOfDataArray( auto field_data_size = GetRawDataSizeOfDataArray(
&insert_record_proto->fields_data(data_offset), &insert_record_proto->fields_data(data_offset),
@ -318,6 +346,15 @@ SegmentGrowingImpl::LoadFieldData(const LoadFieldDataInfo& infos) {
index->Reload(); index->Reload();
} }
// build json key stats index
if (field_meta.enable_growing_jsonStats()) {
auto index = GetJsonKeyIndex(field_id);
index->BuildWithFieldData(field_data, field_meta.is_nullable());
index->Commit();
// Reload reader so that the index can be read immediately
index->Reload();
}
// update the mem size // update the mem size
stats_.mem_size += storage::GetByteSizeOfFieldDatas(field_data); stats_.mem_size += storage::GetByteSizeOfFieldDatas(field_data);
@ -939,4 +976,56 @@ SegmentGrowingImpl::AddTexts(milvus::FieldId field_id,
iter->second->AddTexts(n, texts, texts_valid_data, offset_begin); iter->second->AddTexts(n, texts, texts_valid_data, offset_begin);
} }
void
SegmentGrowingImpl::AddJSONDatas(FieldId field_id,
const std::string* jsondatas,
const bool* jsondatas_valid_data,
size_t n,
int64_t offset_begin) {
std::unique_lock lock(mutex_);
auto iter = json_indexes_.find(field_id);
AssertInfo(iter != json_indexes_.end(), "json index not found");
iter->second->AddJSONDatas(
n, jsondatas, jsondatas_valid_data, offset_begin);
}
void
SegmentGrowingImpl::CreateJSONIndexes() {
for (auto [field_id, field_meta] : schema_->get_fields()) {
if (field_meta.enable_growing_jsonStats()) {
CreateJSONIndex(FieldId(field_id));
}
}
}
void
SegmentGrowingImpl::CreateJSONIndex(FieldId field_id) {
std::unique_lock lock(mutex_);
const auto& field_meta = schema_->operator[](field_id);
AssertInfo(IsJsonDataType(field_meta.get_data_type()),
"cannot create json index on non-json type");
std::string unique_id = GetUniqueFieldId(field_meta.get_id().get());
auto index = std::make_unique<index::JsonKeyStatsInvertedIndex>(
JSON_KEY_STATS_COMMIT_INTERVAL, unique_id.c_str());
index->Commit();
index->CreateReader();
json_indexes_[field_id] = std::move(index);
}
std::pair<std::string_view, bool>
SegmentGrowingImpl::GetJsonData(FieldId field_id, size_t offset) const {
auto vec_ptr = dynamic_cast<const ConcurrentVector<Json>*>(
insert_record_.get_data_base(field_id));
auto& src = *vec_ptr;
auto& field_meta = schema_->operator[](field_id);
if (field_meta.is_nullable()) {
auto valid_data_ptr = insert_record_.get_valid_data(field_id);
return std::make_pair(std::string_view(src[offset]),
valid_data_ptr->is_valid(offset));
}
return std::make_pair(std::string_view(src[offset]), true);
}
} // namespace milvus::segcore } // namespace milvus::segcore

View File

@ -226,6 +226,9 @@ class SegmentGrowingImpl : public SegmentGrowing {
int64_t count, int64_t count,
const std::vector<std::string>& dynamic_field_names) const override; const std::vector<std::string>& dynamic_field_names) const override;
virtual std::pair<std::string_view, bool>
GetJsonData(FieldId field_id, size_t offset) const override;
public: public:
friend std::unique_ptr<SegmentGrowing> friend std::unique_ptr<SegmentGrowing>
CreateGrowingSegment(SchemaPtr schema, CreateGrowingSegment(SchemaPtr schema,
@ -264,6 +267,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
mcm->Register(mmap_descriptor_); mcm->Register(mmap_descriptor_);
} }
this->CreateTextIndexes(); this->CreateTextIndexes();
this->CreateJSONIndexes();
} }
~SegmentGrowingImpl() { ~SegmentGrowingImpl() {
@ -414,6 +418,19 @@ class SegmentGrowingImpl : public SegmentGrowing {
void void
CreateTextIndexes(); CreateTextIndexes();
void
AddJSONDatas(FieldId field_id,
const std::string* jsondatas,
const bool* jsondatas_valid_data,
size_t n,
int64_t offset_begin);
void
CreateJSONIndexes();
void
CreateJSONIndex(FieldId field_id);
private: private:
storage::MmapChunkDescriptorPtr mmap_descriptor_ = nullptr; storage::MmapChunkDescriptorPtr mmap_descriptor_ = nullptr;
SegcoreConfig segcore_config_; SegcoreConfig segcore_config_;

View File

@ -83,11 +83,13 @@ std::unique_ptr<SearchResult>
SegmentInternalInterface::Search( SegmentInternalInterface::Search(
const query::Plan* plan, const query::Plan* plan,
const query::PlaceholderGroup* placeholder_group, const query::PlaceholderGroup* placeholder_group,
Timestamp timestamp) const { Timestamp timestamp,
int32_t consistency_level) const {
std::shared_lock lck(mutex_); std::shared_lock lck(mutex_);
milvus::tracer::AddEvent("obtained_segment_lock_mutex"); milvus::tracer::AddEvent("obtained_segment_lock_mutex");
check_search(plan); check_search(plan);
query::ExecPlanNodeVisitor visitor(*this, timestamp, placeholder_group); query::ExecPlanNodeVisitor visitor(
*this, timestamp, placeholder_group, consistency_level);
auto results = std::make_unique<SearchResult>(); auto results = std::make_unique<SearchResult>();
*results = visitor.get_moved_result(*plan->plan_node_); *results = visitor.get_moved_result(*plan->plan_node_);
results->segment_ = (void*)this; results->segment_ = (void*)this;
@ -99,11 +101,12 @@ SegmentInternalInterface::Retrieve(tracer::TraceContext* trace_ctx,
const query::RetrievePlan* plan, const query::RetrievePlan* plan,
Timestamp timestamp, Timestamp timestamp,
int64_t limit_size, int64_t limit_size,
bool ignore_non_pk) const { bool ignore_non_pk,
int32_t consistency_level) const {
std::shared_lock lck(mutex_); std::shared_lock lck(mutex_);
tracer::AutoSpan span("Retrieve", tracer::GetRootSpan()); tracer::AutoSpan span("Retrieve", tracer::GetRootSpan());
auto results = std::make_unique<proto::segcore::RetrieveResults>(); auto results = std::make_unique<proto::segcore::RetrieveResults>();
query::ExecPlanNodeVisitor visitor(*this, timestamp); query::ExecPlanNodeVisitor visitor(*this, timestamp, consistency_level);
auto retrieve_results = visitor.get_retrieve_result(*plan->plan_node_); auto retrieve_results = visitor.get_retrieve_result(*plan->plan_node_);
retrieve_results.segment_ = (void*)this; retrieve_results.segment_ = (void*)this;
results->set_has_more_result(retrieve_results.has_more_result); results->set_has_more_result(retrieve_results.has_more_result);
@ -292,7 +295,8 @@ SegmentInternalInterface::get_real_count() const {
milvus::plan::GetNextPlanNodeId(), sources); milvus::plan::GetNextPlanNodeId(), sources);
plan->plan_node_->plannodes_ = plannode; plan->plan_node_->plannodes_ = plannode;
plan->plan_node_->is_count_ = true; plan->plan_node_->is_count_ = true;
auto res = Retrieve(nullptr, plan.get(), MAX_TIMESTAMP, INT64_MAX, false); auto res =
Retrieve(nullptr, plan.get(), MAX_TIMESTAMP, INT64_MAX, false, 0);
AssertInfo(res->fields_data().size() == 1, AssertInfo(res->fields_data().size() == 1,
"count result should only have one column"); "count result should only have one column");
AssertInfo(res->fields_data()[0].has_scalars(), AssertInfo(res->fields_data()[0].has_scalars(),
@ -528,4 +532,13 @@ SegmentInternalInterface::bulk_subscript_not_exist_field(
return result; return result;
} }
index::JsonKeyStatsInvertedIndex*
SegmentInternalInterface::GetJsonKeyIndex(FieldId field_id) const {
std::shared_lock lock(mutex_);
auto iter = json_indexes_.find(field_id);
if (iter == json_indexes_.end()) {
return nullptr;
}
return iter->second.get();
}
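GetJsonKeyIndex returns nullptr instead of asserting, so expression evaluation can degrade gracefully when no stats index was loaded. A usage sketch:
if (auto* stats = segment->GetJsonKeyIndex(field_id)) {
    // prune candidate rows through the inverted index on JSON keys
} else {
    // fall back to parsing the raw JSON row by row via GetJsonData()
}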
} // namespace milvus::segcore } // namespace milvus::segcore

View File

@ -38,6 +38,7 @@
#include "index/SkipIndex.h" #include "index/SkipIndex.h"
#include "mmap/Column.h" #include "mmap/Column.h"
#include "index/TextMatchIndex.h" #include "index/TextMatchIndex.h"
#include "index/JsonKeyStatsInvertedIndex.h"
namespace milvus::segcore { namespace milvus::segcore {
@ -64,14 +65,16 @@ class SegmentInterface {
virtual std::unique_ptr<SearchResult> virtual std::unique_ptr<SearchResult>
Search(const query::Plan* Plan, Search(const query::Plan* Plan,
const query::PlaceholderGroup* placeholder_group, const query::PlaceholderGroup* placeholder_group,
Timestamp timestamp) const = 0; Timestamp timestamp,
int32_t consistency_level = 0) const = 0;
virtual std::unique_ptr<proto::segcore::RetrieveResults> virtual std::unique_ptr<proto::segcore::RetrieveResults>
Retrieve(tracer::TraceContext* trace_ctx, Retrieve(tracer::TraceContext* trace_ctx,
const query::RetrievePlan* Plan, const query::RetrievePlan* Plan,
Timestamp timestamp, Timestamp timestamp,
int64_t limit_size, int64_t limit_size,
bool ignore_non_pk) const = 0; bool ignore_non_pk,
int32_t consistency_level = 0) const = 0;
virtual std::unique_ptr<proto::segcore::RetrieveResults> virtual std::unique_ptr<proto::segcore::RetrieveResults>
Retrieve(tracer::TraceContext* trace_ctx, Retrieve(tracer::TraceContext* trace_ctx,
@ -139,6 +142,11 @@ class SegmentInterface {
GetJsonIndex(FieldId field_id, std::string path) const { GetJsonIndex(FieldId field_id, std::string path) const {
return nullptr; return nullptr;
} }
virtual index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const = 0;
virtual std::pair<std::string_view, bool>
GetJsonData(FieldId field_id, size_t offset) const = 0;
}; };
// internal API for DSL calculation // internal API for DSL calculation
@ -247,7 +255,8 @@ class SegmentInternalInterface : public SegmentInterface {
std::unique_ptr<SearchResult> std::unique_ptr<SearchResult>
Search(const query::Plan* Plan, Search(const query::Plan* Plan,
const query::PlaceholderGroup* placeholder_group, const query::PlaceholderGroup* placeholder_group,
Timestamp timestamp) const override; Timestamp timestamp,
int32_t consistency_level = 0) const override;
void void
FillPrimaryKeys(const query::Plan* plan, FillPrimaryKeys(const query::Plan* plan,
@ -262,7 +271,8 @@ class SegmentInternalInterface : public SegmentInterface {
const query::RetrievePlan* Plan, const query::RetrievePlan* Plan,
Timestamp timestamp, Timestamp timestamp,
int64_t limit_size, int64_t limit_size,
bool ignore_non_pk) const override; bool ignore_non_pk,
int32_t consistency_level = 0) const override;
std::unique_ptr<proto::segcore::RetrieveResults> std::unique_ptr<proto::segcore::RetrieveResults>
Retrieve(tracer::TraceContext* trace_ctx, Retrieve(tracer::TraceContext* trace_ctx,
@ -325,6 +335,9 @@ class SegmentInternalInterface : public SegmentInterface {
index::TextMatchIndex* index::TextMatchIndex*
GetTextIndex(FieldId field_id) const override; GetTextIndex(FieldId field_id) const override;
virtual index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const override;
public: public:
virtual void virtual void
vector_search(SearchInfo& search_info, vector_search(SearchInfo& search_info,
@ -519,6 +532,10 @@ class SegmentInternalInterface : public SegmentInterface {
// text-indexes used to do match. // text-indexes used to do match.
std::unordered_map<FieldId, std::unique_ptr<index::TextMatchIndex>> std::unordered_map<FieldId, std::unique_ptr<index::TextMatchIndex>>
text_indexes_; text_indexes_;
std::unordered_map<FieldId,
std::unique_ptr<index::JsonKeyStatsInvertedIndex>>
json_indexes_;
}; };
} // namespace milvus::segcore } // namespace milvus::segcore

View File

@ -69,6 +69,17 @@ class SegmentSealed : public SegmentInternalInterface {
return index->second.get(); return index->second.get();
} }
virtual void
LoadJsonKeyIndex(
FieldId field_id,
std::unique_ptr<index::JsonKeyStatsInvertedIndex> index) = 0;
virtual index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const = 0;
virtual std::pair<std::string_view, bool>
GetJsonData(FieldId field_id, size_t offset) const = 0;
SegmentType SegmentType
type() const override { type() const override {
return SegmentType::Sealed; return SegmentType::Sealed;

View File

@ -2147,4 +2147,29 @@ SegmentSealedImpl::LoadTextIndex(FieldId field_id,
text_indexes_[field_id] = std::move(index); text_indexes_[field_id] = std::move(index);
} }
void
SegmentSealedImpl::LoadJsonKeyIndex(
FieldId field_id, std::unique_ptr<index::JsonKeyStatsInvertedIndex> index) {
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
json_key_indexes_[field_id] = std::move(index);
}
index::JsonKeyStatsInvertedIndex*
SegmentSealedImpl::GetJsonKeyIndex(FieldId field_id) const {
std::shared_lock lck(mutex_);
auto iter = json_key_indexes_.find(field_id);
if (iter == json_key_indexes_.end()) {
return nullptr;
}
return iter->second.get();
}
std::pair<std::string_view, bool>
SegmentSealedImpl::GetJsonData(FieldId field_id, size_t offset) const {
auto column = fields_.at(field_id);
bool is_valid = column->IsValid(offset);
return std::make_pair(std::move(column->RawAt(offset)), is_valid);
}
} // namespace milvus::segcore } // namespace milvus::segcore

View File

@ -36,6 +36,7 @@
#include "common/Types.h" #include "common/Types.h"
#include "common/IndexMeta.h" #include "common/IndexMeta.h"
#include "index/TextMatchIndex.h" #include "index/TextMatchIndex.h"
#include "index/JsonKeyStatsInvertedIndex.h"
namespace milvus::segcore { namespace milvus::segcore {
@ -100,6 +101,17 @@ class SegmentSealedImpl : public SegmentSealed {
LoadTextIndex(FieldId field_id, LoadTextIndex(FieldId field_id,
std::unique_ptr<index::TextMatchIndex> index) override; std::unique_ptr<index::TextMatchIndex> index) override;
void
LoadJsonKeyIndex(
FieldId field_id,
std::unique_ptr<index::JsonKeyStatsInvertedIndex> index) override;
index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const override;
std::pair<std::string_view, bool>
GetJsonData(FieldId field_id, size_t offset) const override;
public: public:
size_t size_t
GetMemoryUsageInBytes() const override { GetMemoryUsageInBytes() const override {
@ -412,6 +424,11 @@ class SegmentSealedImpl : public SegmentSealed {
// whether the segment is sorted by the pk // whether the segment is sorted by the pk
bool is_sorted_by_pk_ = false; bool is_sorted_by_pk_ = false;
// used for json expr optimization
std::unordered_map<FieldId,
std::unique_ptr<index::JsonKeyStatsInvertedIndex>>
json_key_indexes_;
}; };
inline SegmentSealedUPtr inline SegmentSealedUPtr

View File

@ -111,7 +111,8 @@ AsyncSearch(CTraceContext c_trace,
CSegmentInterface c_segment, CSegmentInterface c_segment,
CSearchPlan c_plan, CSearchPlan c_plan,
CPlaceholderGroup c_placeholder_group, CPlaceholderGroup c_placeholder_group,
uint64_t timestamp) { uint64_t timestamp,
int32_t consistency_level) {
auto segment = (milvus::segcore::SegmentInterface*)c_segment; auto segment = (milvus::segcore::SegmentInterface*)c_segment;
auto plan = (milvus::query::Plan*)c_plan; auto plan = (milvus::query::Plan*)c_plan;
auto phg_ptr = reinterpret_cast<const milvus::query::PlaceholderGroup*>( auto phg_ptr = reinterpret_cast<const milvus::query::PlaceholderGroup*>(
@ -120,7 +121,7 @@ AsyncSearch(CTraceContext c_trace,
auto future = milvus::futures::Future<milvus::SearchResult>::async( auto future = milvus::futures::Future<milvus::SearchResult>::async(
milvus::futures::getGlobalCPUExecutor(), milvus::futures::getGlobalCPUExecutor(),
milvus::futures::ExecutePriority::HIGH, milvus::futures::ExecutePriority::HIGH,
[c_trace, segment, plan, phg_ptr, timestamp]( [c_trace, segment, plan, phg_ptr, timestamp, consistency_level](
milvus::futures::CancellationToken cancel_token) { milvus::futures::CancellationToken cancel_token) {
// save trace context into search_info // save trace context into search_info
auto& trace_ctx = plan->plan_node_->search_info_.trace_ctx_; auto& trace_ctx = plan->plan_node_->search_info_.trace_ctx_;
@ -131,7 +132,8 @@ AsyncSearch(CTraceContext c_trace,
auto span = milvus::tracer::StartSpan("SegCoreSearch", &trace_ctx); auto span = milvus::tracer::StartSpan("SegCoreSearch", &trace_ctx);
milvus::tracer::SetRootSpan(span); milvus::tracer::SetRootSpan(span);
auto search_result = segment->Search(plan, phg_ptr, timestamp); auto search_result =
segment->Search(plan, phg_ptr, timestamp, consistency_level);
if (!milvus::PositivelyRelated( if (!milvus::PositivelyRelated(
plan->plan_node_->search_info_.metric_type_)) { plan->plan_node_->search_info_.metric_type_)) {
for (auto& dis : search_result->distances_) { for (auto& dis : search_result->distances_) {
@ -179,21 +181,31 @@ AsyncRetrieve(CTraceContext c_trace,
CRetrievePlan c_plan, CRetrievePlan c_plan,
uint64_t timestamp, uint64_t timestamp,
int64_t limit_size, int64_t limit_size,
bool ignore_non_pk) { bool ignore_non_pk,
int32_t consistency_level) {
auto segment = static_cast<milvus::segcore::SegmentInterface*>(c_segment); auto segment = static_cast<milvus::segcore::SegmentInterface*>(c_segment);
auto plan = static_cast<const milvus::query::RetrievePlan*>(c_plan); auto plan = static_cast<const milvus::query::RetrievePlan*>(c_plan);
auto future = milvus::futures::Future<CRetrieveResult>::async( auto future = milvus::futures::Future<CRetrieveResult>::async(
milvus::futures::getGlobalCPUExecutor(), milvus::futures::getGlobalCPUExecutor(),
milvus::futures::ExecutePriority::HIGH, milvus::futures::ExecutePriority::HIGH,
[c_trace, segment, plan, timestamp, limit_size, ignore_non_pk]( [c_trace,
milvus::futures::CancellationToken cancel_token) { segment,
plan,
timestamp,
limit_size,
ignore_non_pk,
consistency_level](milvus::futures::CancellationToken cancel_token) {
auto trace_ctx = milvus::tracer::TraceContext{ auto trace_ctx = milvus::tracer::TraceContext{
c_trace.traceID, c_trace.spanID, c_trace.traceFlags}; c_trace.traceID, c_trace.spanID, c_trace.traceFlags};
milvus::tracer::AutoSpan span("SegCoreRetrieve", &trace_ctx, true); milvus::tracer::AutoSpan span("SegCoreRetrieve", &trace_ctx, true);
auto retrieve_result = segment->Retrieve( auto retrieve_result = segment->Retrieve(&trace_ctx,
&trace_ctx, plan, timestamp, limit_size, ignore_non_pk); plan,
timestamp,
limit_size,
ignore_non_pk,
consistency_level);
return CreateLeakedCRetrieveResultFromProto( return CreateLeakedCRetrieveResultFromProto(
std::move(retrieve_result)); std::move(retrieve_result));
@ -479,6 +491,60 @@ LoadTextIndex(CSegmentInterface c_segment,
} }
} }
CStatus
LoadJsonKeyIndex(CTraceContext c_trace,
CSegmentInterface c_segment,
const uint8_t* serialized_load_json_key_index_info,
const uint64_t len) {
try {
auto ctx = milvus::tracer::TraceContext{
c_trace.traceID, c_trace.spanID, c_trace.traceFlags};
auto segment_interface =
reinterpret_cast<milvus::segcore::SegmentInterface*>(c_segment);
auto segment =
dynamic_cast<milvus::segcore::SegmentSealed*>(segment_interface);
AssertInfo(segment != nullptr, "segment conversion failed");
auto info_proto =
std::make_unique<milvus::proto::indexcgo::LoadJsonKeyIndexInfo>();
info_proto->ParseFromArray(serialized_load_json_key_index_info, len);
milvus::storage::FieldDataMeta field_meta{info_proto->collectionid(),
info_proto->partitionid(),
segment->get_segment_id(),
info_proto->fieldid(),
info_proto->schema()};
milvus::storage::IndexMeta index_meta{segment->get_segment_id(),
info_proto->fieldid(),
info_proto->buildid(),
info_proto->version()};
auto remote_chunk_manager =
milvus::storage::RemoteChunkManagerSingleton::GetInstance()
.GetRemoteChunkManager();
milvus::Config config;
std::vector<std::string> files;
for (const auto& f : info_proto->files()) {
files.push_back(f);
}
config["index_files"] = files;
milvus::storage::FileManagerContext file_ctx(
field_meta, index_meta, remote_chunk_manager);
auto index = std::make_unique<milvus::index::JsonKeyStatsInvertedIndex>(
file_ctx, true);
index->Load(ctx, config);
segment->LoadJsonKeyIndex(milvus::FieldId(info_proto->fieldid()),
std::move(index));
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(&e);
}
}
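The load path mirrors LoadTextIndex: the Go side serializes a LoadJsonKeyIndexInfo proto that lists the remote index slices. A hedged sketch of that payload (ids and paths are placeholders):
milvus::proto::indexcgo::LoadJsonKeyIndexInfo info;
info.set_fieldid(101);                            // placeholder
info.add_files("files/json_key_index/stats_0");   // placeholder slice path
std::string bytes = info.SerializeAsString();
auto status = LoadJsonKeyIndex(
    c_trace,
    c_segment,
    reinterpret_cast<const uint8_t*>(bytes.data()),
    bytes.size());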
CStatus CStatus
UpdateFieldRawDataSize(CSegmentInterface c_segment, UpdateFieldRawDataSize(CSegmentInterface c_segment,
int64_t field_id, int64_t field_id,

View File

@ -50,7 +50,8 @@ AsyncSearch(CTraceContext c_trace,
CSegmentInterface c_segment, CSegmentInterface c_segment,
CSearchPlan c_plan, CSearchPlan c_plan,
CPlaceholderGroup c_placeholder_group, CPlaceholderGroup c_placeholder_group,
uint64_t timestamp); uint64_t timestamp,
int32_t consistency_level);
void void
DeleteRetrieveResult(CRetrieveResult* retrieve_result); DeleteRetrieveResult(CRetrieveResult* retrieve_result);
@ -61,7 +62,8 @@ AsyncRetrieve(CTraceContext c_trace,
CRetrievePlan c_plan, CRetrievePlan c_plan,
uint64_t timestamp, uint64_t timestamp,
int64_t limit_size, int64_t limit_size,
bool ignore_non_pk); bool ignore_non_pk,
int32_t consistency_level);
CFuture* // Future<CRetrieveResult> CFuture* // Future<CRetrieveResult>
AsyncRetrieveByOffsets(CTraceContext c_trace, AsyncRetrieveByOffsets(CTraceContext c_trace,
@ -122,6 +124,12 @@ LoadTextIndex(CSegmentInterface c_segment,
const uint8_t* serialized_load_text_index_info, const uint8_t* serialized_load_text_index_info,
const uint64_t len); const uint64_t len);
CStatus
LoadJsonKeyIndex(CTraceContext c_trace,
CSegmentInterface c_segment,
const uint8_t* serialized_load_json_key_index_info,
const uint64_t len);
CStatus CStatus
UpdateFieldRawDataSize(CSegmentInterface c_segment, UpdateFieldRawDataSize(CSegmentInterface c_segment,
int64_t field_id, int64_t field_id,

View File

@ -79,8 +79,18 @@ DiskFileManagerImpl::GetRemoteTextLogPath(const std::string& file_name,
return remote_prefix + "/" + file_name + "_" + std::to_string(slice_num); return remote_prefix + "/" + file_name + "_" + std::to_string(slice_num);
} }
std::string
DiskFileManagerImpl::GetRemoteJsonKeyIndexPath(const std::string& file_name,
int64_t slice_num) {
auto remote_prefix = GetRemoteJsonKeyLogPrefix();
return remote_prefix + "/" + file_name + "_" + std::to_string(slice_num);
}
bool bool
DiskFileManagerImpl::AddFile(const std::string& file) noexcept { DiskFileManagerImpl::AddFileInternal(
const std::string& file,
const std::function<std::string(const std::string&, int)>&
get_remote_path) noexcept {
auto local_chunk_manager = auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager(); LocalChunkManagerSingleton::GetInstance().GetChunkManager();
FILEMANAGER_TRY FILEMANAGER_TRY
@ -116,8 +126,7 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept {
} }
auto batch_size = std::min(FILE_SLICE_SIZE, int64_t(fileSize) - offset); auto batch_size = std::min(FILE_SLICE_SIZE, int64_t(fileSize) - offset);
batch_remote_files.emplace_back( batch_remote_files.emplace_back(get_remote_path(fileName, slice_num));
GetRemoteIndexPath(fileName, slice_num));
remote_file_sizes.emplace_back(batch_size); remote_file_sizes.emplace_back(batch_size);
local_file_offsets.emplace_back(offset); local_file_offsets.emplace_back(offset);
offset += batch_size; offset += batch_size;
@ -132,58 +141,29 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept {
return true; return true;
} // namespace knowhere } // namespace knowhere
bool
DiskFileManagerImpl::AddFile(const std::string& file) noexcept {
return AddFileInternal(file,
[this](const std::string& file_name, int slice_num) {
return GetRemoteIndexPath(file_name, slice_num);
});
}
bool
DiskFileManagerImpl::AddJsonKeyIndexLog(const std::string& file) noexcept {
return AddFileInternal(
file, [this](const std::string& file_name, int slice_num) {
return GetRemoteJsonKeyIndexPath(file_name, slice_num);
});
}
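AddFile, AddTextLog and AddJsonKeyIndexLog now share AddFileInternal and differ only in how remote paths are named. The shared slicing loop, in isolation: a local file is cut into FILE_SLICE_SIZE batches and each batch is uploaded as "<remote_prefix>/<file_name>_<slice>".
for (int64_t offset = 0, slice = 0; offset < file_size; ++slice) {
    auto batch = std::min(FILE_SLICE_SIZE, file_size - offset);
    // e.g. ".../<file_name>_0", ".../<file_name>_1", ...
    batch_remote_files.emplace_back(get_remote_path(file_name, slice));
    remote_file_sizes.emplace_back(batch);
    offset += batch;
}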
bool bool
DiskFileManagerImpl::AddTextLog(const std::string& file) noexcept { DiskFileManagerImpl::AddTextLog(const std::string& file) noexcept {
auto local_chunk_manager = return AddFileInternal(
LocalChunkManagerSingleton::GetInstance().GetChunkManager(); file, [this](const std::string& file_name, int slice_num) {
FILEMANAGER_TRY return GetRemoteTextLogPath(file_name, slice_num);
if (!local_chunk_manager->Exist(file)) { });
LOG_ERROR("local file {} not exists", file); }
return false;
}
// record local file path
local_paths_.emplace_back(file);
auto fileName = GetFileName(file);
auto fileSize = local_chunk_manager->Size(file);
added_total_file_size_ += fileSize;
std::vector<std::string> batch_remote_files;
std::vector<int64_t> remote_file_sizes;
std::vector<int64_t> local_file_offsets;
int slice_num = 0;
auto parallel_degree =
uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE);
for (int64_t offset = 0; offset < fileSize; slice_num++) {
if (batch_remote_files.size() >= parallel_degree) {
AddBatchIndexFiles(file,
local_file_offsets,
batch_remote_files,
remote_file_sizes);
batch_remote_files.clear();
remote_file_sizes.clear();
local_file_offsets.clear();
}
auto batch_size = std::min(FILE_SLICE_SIZE, int64_t(fileSize) - offset);
batch_remote_files.emplace_back(
GetRemoteTextLogPath(fileName, slice_num));
remote_file_sizes.emplace_back(batch_size);
local_file_offsets.emplace_back(offset);
offset += batch_size;
}
if (batch_remote_files.size() > 0) {
AddBatchIndexFiles(
file, local_file_offsets, batch_remote_files, remote_file_sizes);
}
FILEMANAGER_CATCH
FILEMANAGER_END
return true;
} // namespace knowhere
void void
DiskFileManagerImpl::AddBatchIndexFiles( DiskFileManagerImpl::AddBatchIndexFiles(
@ -238,8 +218,9 @@ DiskFileManagerImpl::AddBatchIndexFiles(
} }
void void
DiskFileManagerImpl::CacheIndexToDisk( DiskFileManagerImpl::CacheIndexToDiskInternal(
const std::vector<std::string>& remote_files) { const std::vector<std::string>& remote_files,
const std::function<std::string()>& get_local_index_prefix) noexcept {
auto local_chunk_manager = auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager(); LocalChunkManagerSingleton::GetInstance().GetChunkManager();
@ -265,7 +246,7 @@ DiskFileManagerImpl::CacheIndexToDisk(
for (auto& slices : index_slices) { for (auto& slices : index_slices) {
auto prefix = slices.first; auto prefix = slices.first;
auto local_index_file_name = auto local_index_file_name =
GetLocalIndexObjectPrefix() + get_local_index_prefix() +
prefix.substr(prefix.find_last_of('/') + 1); prefix.substr(prefix.find_last_of('/') + 1);
local_chunk_manager->CreateFile(local_index_file_name); local_chunk_manager->CreateFile(local_index_file_name);
auto file = auto file =
@ -305,58 +286,25 @@ DiskFileManagerImpl::CacheIndexToDisk(
} }
} }
void
DiskFileManagerImpl::CacheIndexToDisk(
const std::vector<std::string>& remote_files) {
return CacheIndexToDiskInternal(
remote_files, [this]() { return GetLocalIndexObjectPrefix(); });
}
void void
DiskFileManagerImpl::CacheTextLogToDisk( DiskFileManagerImpl::CacheTextLogToDisk(
const std::vector<std::string>& remote_files) { const std::vector<std::string>& remote_files) {
auto local_chunk_manager = return CacheIndexToDiskInternal(
LocalChunkManagerSingleton::GetInstance().GetChunkManager(); remote_files, [this]() { return GetLocalTextIndexPrefix(); });
}
std::map<std::string, std::vector<int>> index_slices; void
for (auto& file_path : remote_files) { DiskFileManagerImpl::CacheJsonKeyIndexToDisk(
auto pos = file_path.find_last_of("_"); const std::vector<std::string>& remote_files) {
AssertInfo(pos > 0, "invalided index file path:{}", file_path); return CacheIndexToDiskInternal(
try { remote_files, [this]() { return GetLocalJsonKeyIndexPrefix(); });
auto idx = std::stoi(file_path.substr(pos + 1));
index_slices[file_path.substr(0, pos)].emplace_back(idx);
} catch (const std::logic_error& e) {
auto err_message = fmt::format(
"invalided text log path:{}, error:{}", file_path, e.what());
LOG_ERROR(err_message);
throw std::logic_error(err_message);
}
}
for (auto& slices : index_slices) {
std::sort(slices.second.begin(), slices.second.end());
}
for (auto& slices : index_slices) {
auto prefix = slices.first;
auto local_index_file_name =
GetLocalTextIndexPrefix() + "/" +
prefix.substr(prefix.find_last_of('/') + 1);
local_chunk_manager->CreateFile(local_index_file_name);
auto file =
File::Open(local_index_file_name, O_CREAT | O_RDWR | O_TRUNC);
// Get the remote files
std::vector<std::string> batch_remote_files;
batch_remote_files.reserve(slices.second.size());
for (int& iter : slices.second) {
auto origin_file = prefix + "_" + std::to_string(iter);
batch_remote_files.push_back(origin_file);
}
auto index_chunks = GetObjectData(rcm_.get(), batch_remote_files);
for (auto& chunk : index_chunks) {
auto index_data = chunk.get()->GetFieldData();
auto index_size = index_data->Size();
auto chunk_data = reinterpret_cast<uint8_t*>(
const_cast<void*>(index_data->Data()));
file.Write(chunk_data, index_size);
}
local_paths_.emplace_back(local_index_file_name);
}
} }
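CacheIndexToDiskInternal keeps the old behavior: remote objects named "<prefix>_<n>" are grouped by prefix, the slice numbers sorted, and the slices concatenated into one local file. The grouping step on its own:
std::map<std::string, std::vector<int>> index_slices;
for (const auto& path : remote_files) {
    auto pos = path.find_last_of('_');
    index_slices[path.substr(0, pos)].push_back(std::stoi(path.substr(pos + 1)));
}
for (auto& [prefix, ids] : index_slices) {
    std::sort(ids.begin(), ids.end());  // rebuild the slices in order
}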
template <typename DataType> template <typename DataType>
@ -649,6 +597,12 @@ DiskFileManagerImpl::GetFileName(const std::string& localfile) {
return localPath.filename().string(); return localPath.filename().string();
} }
std::string
DiskFileManagerImpl::GetIndexIdentifier() {
return GenIndexPathIdentifier(index_meta_.build_id,
index_meta_.index_version);
}
std::string std::string
DiskFileManagerImpl::GetLocalIndexObjectPrefix() { DiskFileManagerImpl::GetLocalIndexObjectPrefix() {
auto local_chunk_manager = auto local_chunk_manager =
@ -657,6 +611,14 @@ DiskFileManagerImpl::GetLocalIndexObjectPrefix() {
local_chunk_manager, index_meta_.build_id, index_meta_.index_version); local_chunk_manager, index_meta_.build_id, index_meta_.index_version);
} }
std::string
DiskFileManagerImpl::GetTextIndexIdentifier() {
return std::to_string(index_meta_.build_id) + "/" +
std::to_string(index_meta_.index_version) + "/" +
std::to_string(field_meta_.segment_id) + "/" +
std::to_string(field_meta_.field_id);
}
std::string std::string
DiskFileManagerImpl::GetLocalTextIndexPrefix() { DiskFileManagerImpl::GetLocalTextIndexPrefix() {
auto local_chunk_manager = auto local_chunk_manager =
@ -669,17 +631,37 @@ DiskFileManagerImpl::GetLocalTextIndexPrefix() {
} }
std::string std::string
DiskFileManagerImpl::GetIndexIdentifier() { DiskFileManagerImpl::GetJsonKeyIndexIdentifier() {
return GenIndexPathIdentifier(index_meta_.build_id, return GenJsonKeyIndexPathIdentifier(index_meta_.build_id,
index_meta_.index_version); index_meta_.index_version,
field_meta_.collection_id,
field_meta_.partition_id,
field_meta_.segment_id,
field_meta_.field_id);
} }
std::string std::string
DiskFileManagerImpl::GetTextIndexIdentifier() { DiskFileManagerImpl::GetLocalJsonKeyIndexPrefix() {
return std::to_string(index_meta_.build_id) + "/" + auto local_chunk_manager =
std::to_string(index_meta_.index_version) + "/" + LocalChunkManagerSingleton::GetInstance().GetChunkManager();
std::to_string(field_meta_.segment_id) + return GenJsonKeyIndexPathPrefix(local_chunk_manager,
std::to_string(field_meta_.field_id); index_meta_.build_id,
index_meta_.index_version,
field_meta_.collection_id,
field_meta_.partition_id,
field_meta_.segment_id,
field_meta_.field_id);
}
std::string
DiskFileManagerImpl::GetRemoteJsonKeyLogPrefix() {
return GenJsonKeyIndexPathPrefix(rcm_,
index_meta_.build_id,
index_meta_.index_version,
field_meta_.collection_id,
field_meta_.partition_id,
field_meta_.segment_id,
field_meta_.field_id);
} }
std::string std::string

View File

@ -51,28 +51,43 @@ class DiskFileManagerImpl : public FileManagerImpl {
bool bool
AddTextLog(const std::string& filename) noexcept; AddTextLog(const std::string& filename) noexcept;
bool
AddJsonKeyIndexLog(const std::string& filename) noexcept;
public: public:
std::string std::string
GetName() const override { GetName() const override {
return "DiskFileManagerImpl"; return "DiskFileManagerImpl";
} }
std::string
GetLocalIndexObjectPrefix();
// Similar to GetTextIndexIdentifier, segment_id and field_id are also required.
std::string
GetLocalTextIndexPrefix();
std::string std::string
GetIndexIdentifier(); GetIndexIdentifier();
std::string
GetLocalIndexObjectPrefix();
// Different from user index, a text index task may have multiple text fields sharing same build_id/task_id. So // Different from user index, a text index task may have multiple text fields sharing same build_id/task_id. So
// segment_id and field_id are required to identify a unique text index, in case that we support multiple index task // segment_id and field_id are required to identify a unique text index, in case that we support multiple index task
// in the same indexnode at the same time later. // in the same indexnode at the same time later.
std::string std::string
GetTextIndexIdentifier(); GetTextIndexIdentifier();
// Similar to GetTextIndexIdentifier, segment_id and field_id are also required.
std::string
GetLocalTextIndexPrefix();
// Used when building the index; this identifier names the temporary build directory.
std::string
GetJsonKeyIndexIdentifier();
// Used when loading the index; index files are cached under this local prefix.
std::string
GetLocalJsonKeyIndexPrefix();
// Used when uploading the index; this prefix is the remote storage directory.
std::string
GetRemoteJsonKeyLogPrefix();
std::string std::string
GetLocalRawDataObjectPrefix(); GetLocalRawDataObjectPrefix();
@ -92,6 +107,9 @@ class DiskFileManagerImpl : public FileManagerImpl {
void void
CacheTextLogToDisk(const std::vector<std::string>& remote_files); CacheTextLogToDisk(const std::vector<std::string>& remote_files);
void
CacheJsonKeyIndexToDisk(const std::vector<std::string>& remote_files);
void void
AddBatchIndexFiles(const std::string& local_file_name, AddBatchIndexFiles(const std::string& local_file_name,
const std::vector<int64_t>& local_file_offsets, const std::vector<int64_t>& local_file_offsets,
@ -115,21 +133,34 @@ class DiskFileManagerImpl : public FileManagerImpl {
return added_total_file_size_; return added_total_file_size_;
} }
std::string
GetFileName(const std::string& localfile);
private: private:
int64_t int64_t
GetIndexBuildId() { GetIndexBuildId() {
return index_meta_.build_id; return index_meta_.build_id;
} }
std::string
GetFileName(const std::string& localfile);
std::string std::string
GetRemoteIndexPath(const std::string& file_name, int64_t slice_num) const; GetRemoteIndexPath(const std::string& file_name, int64_t slice_num) const;
std::string std::string
GetRemoteTextLogPath(const std::string& file_name, int64_t slice_num) const; GetRemoteTextLogPath(const std::string& file_name, int64_t slice_num) const;
std::string
GetRemoteJsonKeyIndexPath(const std::string& file_name, int64_t slice_num);
bool
AddFileInternal(const std::string& file_name,
const std::function<std::string(const std::string&, int)>&
get_remote_path) noexcept;
void
CacheIndexToDiskInternal(
const std::vector<std::string>& remote_files,
const std::function<std::string()>& get_local_index_prefix) noexcept;
private: private:
// local file path (abs path) // local file path (abs path)
std::vector<std::string> local_paths_; std::vector<std::string> local_paths_;

View File

@ -549,6 +549,37 @@ GenTextIndexPathPrefix(ChunkManagerPtr cm,
return (prefix / path / path1).string(); return (prefix / path / path1).string();
} }
std::string
GenJsonKeyIndexPathIdentifier(int64_t build_id,
int64_t index_version,
int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id) {
return std::to_string(build_id) + "/" + std::to_string(index_version) +
"/" + std::to_string(collection_id) + "/" +
std::to_string(partition_id) + "/" + std::to_string(segment_id) +
"/" + std::to_string(field_id) + "/";
}
std::string
GenJsonKeyIndexPathPrefix(ChunkManagerPtr cm,
int64_t build_id,
int64_t index_version,
int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id) {
return cm->GetRootPath() + "/" + std::string(JSON_KEY_INDEX_LOG_ROOT_PATH) +
"/" +
GenJsonKeyIndexPathIdentifier(build_id,
index_version,
collection_id,
partition_id,
segment_id,
field_id);
}
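A worked example of the identifier layout (ids assumed for illustration):
// build_id=1, index_version=2, collection=100, partition=200,
// segment=300, field=301
auto id = GenJsonKeyIndexPathIdentifier(1, 2, 100, 200, 300, 301);
assert(id == "1/2/100/200/300/301/");
// GenJsonKeyIndexPathPrefix then prepends
// "<root>/<JSON_KEY_INDEX_LOG_ROOT_PATH>/" to this identifier.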
std::string std::string
GetIndexPathPrefixWithBuildID(ChunkManagerPtr cm, int64_t build_id) { GetIndexPathPrefixWithBuildID(ChunkManagerPtr cm, int64_t build_id) {
boost::filesystem::path prefix = cm->GetRootPath(); boost::filesystem::path prefix = cm->GetRootPath();

View File

@ -92,6 +92,23 @@ GenTextIndexPathPrefix(ChunkManagerPtr cm,
int64_t segment_id, int64_t segment_id,
int64_t field_id); int64_t field_id);
std::string
GenJsonKeyIndexPathIdentifier(int64_t build_id,
int64_t index_version,
int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id);
std::string
GenJsonKeyIndexPathPrefix(ChunkManagerPtr cm,
int64_t build_id,
int64_t index_version,
int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id);
std::string std::string
GenFieldRawDataPathPrefix(ChunkManagerPtr cm, GenFieldRawDataPathPrefix(ChunkManagerPtr cm,
int64_t segment_id, int64_t segment_id,

View File

@ -69,6 +69,68 @@ struct RustArrayWrapper {
} }
} }
}; };
struct RustArrayI64Wrapper {
NO_COPY_OR_ASSIGN(RustArrayI64Wrapper);
explicit RustArrayI64Wrapper(RustArrayI64&& array) {
array_.array = array.array;
array_.len = array.len;
array_.cap = array.cap;
array.array = nullptr;
array.len = 0;
array.cap = 0;
}
RustArrayI64Wrapper(RustArrayI64Wrapper&& other) noexcept {
array_.array = other.array_.array;
array_.len = other.array_.len;
array_.cap = other.array_.cap;
other.array_.array = nullptr;
other.array_.len = 0;
other.array_.cap = 0;
}
RustArrayI64Wrapper&
operator=(RustArrayI64Wrapper&& other) noexcept {
if (this != &other) {
free();
array_.array = other.array_.array;
array_.len = other.array_.len;
array_.cap = other.array_.cap;
other.array_.array = nullptr;
other.array_.len = 0;
other.array_.cap = 0;
}
return *this;
}
~RustArrayI64Wrapper() {
free();
}
void
debug() {
std::stringstream ss;
ss << "[ ";
for (int i = 0; i < array_.len; i++) {
ss << array_.array[i] << " ";
}
ss << "]";
std::cout << ss.str() << std::endl;
}
RustArrayI64 array_;
private:
void
free() {
if (array_.array != nullptr) {
free_rust_array_i64(array_);
}
}
};
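RustArrayI64Wrapper mirrors RustArrayWrapper for i64 payloads and is move-only, so the Rust-side allocation is released exactly once. Usage sketch (raw is a RustArrayI64 returned over the FFI):
RustArrayI64Wrapper owner(std::move(raw));   // takes over the Rust buffer
RustArrayI64Wrapper next(std::move(owner));  // owner no longer frees it
// next's destructor calls free_rust_array_i64 exactly once.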
struct RustResultWrapper { struct RustResultWrapper {
NO_COPY_OR_ASSIGN(RustResultWrapper); NO_COPY_OR_ASSIGN(RustResultWrapper);

View File

@ -149,6 +149,8 @@ RustResult tantivy_term_query_bool(void *ptr, bool term);
RustResult tantivy_term_query_keyword(void *ptr, const char *term); RustResult tantivy_term_query_keyword(void *ptr, const char *term);
RustResult tantivy_term_query_keyword_i64(void *ptr, const char *term);
RustResult tantivy_lower_bound_range_query_keyword(void *ptr, RustResult tantivy_lower_bound_range_query_keyword(void *ptr,
const char *lower_bound, const char *lower_bound,
bool inclusive); bool inclusive);
@ -180,7 +182,8 @@ RustResult tantivy_create_index(const char *field_name,
const char *path, const char *path,
uint32_t tantivy_index_version, uint32_t tantivy_index_version,
uintptr_t num_threads, uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes); uintptr_t overall_memory_budget_in_bytes,
bool in_ram);
RustResult tantivy_create_index_with_single_segment(const char *field_name, RustResult tantivy_create_index_with_single_segment(const char *field_name,
TantivyDataType data_type, TantivyDataType data_type,

View File

@ -120,7 +120,7 @@ macro_rules! impl_from_for_enum {
}; };
} }
impl_from_for_enum!(Value, None => (), RustArrayI64 => RustArrayI64, RustArray => RustArray, RustArray => Vec<u32>, U32 => u32, Ptr => *mut c_void); impl_from_for_enum!(Value, None => (), RustArrayI64 => RustArrayI64, RustArrayI64 => Vec<i64>, RustArray => RustArray, RustArray => Vec<u32>, U32 => u32, Ptr => *mut c_void);
#[repr(C)] #[repr(C)]
pub struct RustResult { pub struct RustResult {
@ -202,7 +202,7 @@ macro_rules! cstr_to_str {
#[no_mangle] #[no_mangle]
pub extern "C" fn test_enum_with_array() -> RustResult { pub extern "C" fn test_enum_with_array() -> RustResult {
let array = vec![1, 2, 3]; let array: Vec<u32> = vec![1, 2, 3];
RustResult::from(Result::Ok(array)) RustResult::from(Result::Ok(array))
} }

View File

@ -162,7 +162,7 @@ impl IndexReaderWrapper {
} }
pub fn term_query_f64(&self, term: f64) -> Result<Vec<u32>> { pub fn term_query_f64(&self, term: f64) -> Result<Vec<u32>> {
let q = TermQuery::new( let q: TermQuery = TermQuery::new(
Term::from_field_f64(self.field, term), Term::from_field_f64(self.field, term),
IndexRecordOption::Basic, IndexRecordOption::Basic,
); );
@ -235,6 +235,14 @@ impl IndexReaderWrapper {
self.search(&q) self.search(&q)
} }
pub fn term_query_keyword_i64(&self, term: &str) -> Result<Vec<i64>> {
let q = TermQuery::new(
Term::from_field_text(self.field, term),
IndexRecordOption::Basic,
);
self.search_i64(&q)
}
pub fn lower_bound_range_query_keyword( pub fn lower_bound_range_query_keyword(
&self, &self,
lower_bound: &str, lower_bound: &str,

View File

@ -192,6 +192,13 @@ pub extern "C" fn tantivy_term_query_keyword(ptr: *mut c_void, term: *const c_ch
unsafe { (*real).term_query_keyword(term).into() } unsafe { (*real).term_query_keyword(term).into() }
} }
#[no_mangle]
pub extern "C" fn tantivy_term_query_keyword_i64(ptr: *mut c_void, term: *const c_char) -> RustResult {
let real = ptr as *mut IndexReaderWrapper;
let term = cstr_to_str!(term);
unsafe { (*real).term_query_keyword_i64(term).into() }
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_lower_bound_range_query_keyword( pub extern "C" fn tantivy_lower_bound_range_query_keyword(
ptr: *mut c_void, ptr: *mut c_void,

View File

@ -11,6 +11,7 @@ impl IndexReaderWrapper {
// split the query string into multiple tokens using index's default tokenizer, // split the query string into multiple tokens using index's default tokenizer,
// and then execute the disconjunction of term query. // and then execute the disconjunction of term query.
pub(crate) fn match_query(&self, q: &str) -> Result<Vec<u32>> { pub(crate) fn match_query(&self, q: &str) -> Result<Vec<u32>> {
// clone the tokenizer to make `match_query` thread-safe.
let mut tokenizer = self let mut tokenizer = self
.index .index
.tokenizer_for_field(self.field) .tokenizer_for_field(self.field)

View File

@ -29,6 +29,7 @@ impl IndexWriterWrapper {
num_threads: usize, num_threads: usize,
overall_memory_budget_in_bytes: usize, overall_memory_budget_in_bytes: usize,
tantivy_index_version: TantivyIndexVersion,
in_ram: bool,
) -> Result<IndexWriterWrapper> { ) -> Result<IndexWriterWrapper> {
init_log(); init_log();
match tantivy_index_version { match tantivy_index_version {
@ -39,6 +40,7 @@ impl IndexWriterWrapper {
path, path,
num_threads, num_threads,
overall_memory_budget_in_bytes, overall_memory_budget_in_bytes,
in_ram,
)?; )?;
Ok(IndexWriterWrapper::V5(writer)) Ok(IndexWriterWrapper::V5(writer))
} }
@ -49,12 +51,12 @@ impl IndexWriterWrapper {
path, path,
num_threads, num_threads,
overall_memory_budget_in_bytes, overall_memory_budget_in_bytes,
in_ram,
)?; )?;
Ok(IndexWriterWrapper::V7(writer)) Ok(IndexWriterWrapper::V7(writer))
} }
} }
} }
pub fn new_with_single_segment( pub fn new_with_single_segment(
field_name: &str, field_name: &str,
data_type: TantivyDataType, data_type: TantivyDataType,

View File

@ -28,6 +28,7 @@ pub extern "C" fn tantivy_create_index(
tantivy_index_version: u32, tantivy_index_version: u32,
num_threads: usize, num_threads: usize,
overall_memory_budget_in_bytes: usize, overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> RustResult { ) -> RustResult {
let field_name_str = cstr_to_str!(field_name); let field_name_str = cstr_to_str!(field_name);
let path_str = cstr_to_str!(path); let path_str = cstr_to_str!(path);
@ -44,6 +45,7 @@ pub extern "C" fn tantivy_create_index(
num_threads, num_threads,
overall_memory_budget_in_bytes, overall_memory_budget_in_bytes,
tantivy_index_version, tantivy_index_version,
in_ram,
) { ) {
Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)), Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)),
Err(e) => RustResult::from_error(e.to_string()), Err(e) => RustResult::from_error(e.to_string()),

View File

@ -104,6 +104,7 @@ impl IndexWriterWrapperImpl {
path: String, path: String,
num_threads: usize, num_threads: usize,
overall_memory_budget_in_bytes: usize, overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> Result<IndexWriterWrapperImpl> { ) -> Result<IndexWriterWrapperImpl> {
info!( info!(
"create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 5", "create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 5",
@ -114,7 +115,11 @@ impl IndexWriterWrapperImpl {
// We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field. // We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field.
let id_field = schema_builder.add_i64_field("doc_id", FAST); let id_field = schema_builder.add_i64_field("doc_id", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?; let index = if in_ram {
Index::create_in_ram(schema)
} else {
Index::create_in_dir(path.clone(), schema)?
};
let index_writer = let index_writer =
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?; index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapperImpl { Ok(IndexWriterWrapperImpl {

View File

@ -103,6 +103,7 @@ impl IndexWriterWrapperImpl {
path: String, path: String,
num_threads: usize, num_threads: usize,
overall_memory_budget_in_bytes: usize, overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> Result<IndexWriterWrapperImpl> { ) -> Result<IndexWriterWrapperImpl> {
info!( info!(
"create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 7", "create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 7",
@ -113,7 +114,11 @@ impl IndexWriterWrapperImpl {
// We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field. // We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field.
let id_field = schema_builder.add_i64_field("doc_id", FAST); let id_field = schema_builder.add_i64_field("doc_id", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?; let index = if in_ram {
Index::create_in_ram(schema)
} else {
Index::create_in_dir(path.clone(), schema)?
};
let index_writer = let index_writer =
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?; index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapperImpl { Ok(IndexWriterWrapperImpl {

View File

@ -84,6 +84,7 @@ struct TantivyIndexWrapper {
const char* path, const char* path,
uint32_t tantivy_index_version, uint32_t tantivy_index_version,
bool inverted_single_segment = false, bool inverted_single_segment = false,
bool in_ram = false,
uintptr_t num_threads = DEFAULT_NUM_THREADS, uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes = uintptr_t overall_memory_budget_in_bytes =
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) { DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
@ -101,7 +102,8 @@ struct TantivyIndexWrapper {
path, path,
tantivy_index_version, tantivy_index_version,
num_threads, num_threads,
overall_memory_budget_in_bytes)); overall_memory_budget_in_bytes,
in_ram));
} }
AssertInfo(res.result_->success, AssertInfo(res.result_->success,
"failed to create index: {}", "failed to create index: {}",
@ -146,7 +148,6 @@ struct TantivyIndexWrapper {
writer_ = res.result_->value.ptr._0; writer_ = res.result_->value.ptr._0;
path_ = std::string(path); path_ = std::string(path);
} }
// create reader. // create reader.
void void
create_reader() { create_reader() {
@ -626,6 +627,22 @@ struct TantivyIndexWrapper {
return RustArrayWrapper(std::move(res.result_->value.rust_array._0)); return RustArrayWrapper(std::move(res.result_->value.rust_array._0));
} }
RustArrayI64Wrapper
term_query_i64(std::string term) {
auto array = [&]() {
return tantivy_term_query_keyword_i64(reader_, term.c_str());
}();
auto res = RustResultWrapper(array);
AssertInfo(res.result_->success,
"TantivyIndexWrapper.term_query_i64: {}",
res.result_->error);
AssertInfo(res.result_->value.tag == Value::Tag::RustArrayI64,
"TantivyIndexWrapper.term_query_i64: invalid result type");
return RustArrayI64Wrapper(
std::move(res.result_->value.rust_array_i64._0));
}
template <typename T> template <typename T>
RustArrayWrapper RustArrayWrapper
lower_bound_range_query(T lower_bound, bool inclusive) { lower_bound_range_query(T lower_bound, bool inclusive) {

View File

@ -96,6 +96,7 @@ set(MILVUS_TEST_FILES
    test_cached_search_iterator.cpp
    test_random_sample.cpp
    test_json_index.cpp
+    test_json_key_stats_index.cpp
)

if ( INDEX_ENGINE STREQUAL "cardinal" )

View File

@ -93,7 +93,7 @@ Search_GrowingIndex(benchmark::State& state) {
    Timestamp ts = 10000000;
    for (auto _ : state) {
-        auto qr = segment->Search(search_plan.get(), ph_group.get(), ts);
+        auto qr = segment->Search(search_plan.get(), ph_group.get(), ts, 0);
    }
}
@ -130,7 +130,7 @@ Search_Sealed(benchmark::State& state) {
    Timestamp ts = 10000000;
    for (auto _ : state) {
-        auto qr = segment->Search(search_plan.get(), ph_group.get(), ts);
+        auto qr = segment->Search(search_plan.get(), ph_group.get(), ts, 0);
    }
}

View File

@ -229,7 +229,7 @@ TEST_P(BinlogIndexTest, AccuracyWithLoadFieldData) {
                            ph_group.get()};
    auto nlist = segcore_config.get_nlist();
    auto binlog_index_sr =
-        segment->Search(plan.get(), ph_group.get(), 1L << 63);
+        segment->Search(plan.get(), ph_group.get(), 1L << 63, 0);
    ASSERT_EQ(binlog_index_sr->total_nq_, num_queries);
    EXPECT_EQ(binlog_index_sr->unity_topK_, topk);
    EXPECT_EQ(binlog_index_sr->distances_.size(), num_queries * topk);
@ -262,7 +262,7 @@ TEST_P(BinlogIndexTest, AccuracyWithLoadFieldData) {
    EXPECT_TRUE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_FALSE(segment->HasFieldData(vec_field_id));
-    auto ivf_sr = segment->Search(plan.get(), ph_group.get(), 1L << 63);
+    auto ivf_sr = segment->Search(plan.get(), ph_group.get(), 1L << 63, 0);
    auto similary = GetKnnSearchRecall(num_queries,
                                       binlog_index_sr->seg_offsets_.data(),
                                       topk,
@ -328,7 +328,7 @@ TEST_P(BinlogIndexTest, AccuracyWithMapFieldData) {
                            ph_group.get()};
    auto nlist = segcore_config.get_nlist();
    auto binlog_index_sr =
-        segment->Search(plan.get(), ph_group.get(), 1L << 63);
+        segment->Search(plan.get(), ph_group.get(), 1L << 63, 0);
    ASSERT_EQ(binlog_index_sr->total_nq_, num_queries);
    EXPECT_EQ(binlog_index_sr->unity_topK_, topk);
    EXPECT_EQ(binlog_index_sr->distances_.size(), num_queries * topk);

View File

@ -46,6 +46,7 @@
#include "segcore/load_index_c.h"
#include "test_utils/c_api_test_utils.h"
#include "segcore/vector_index_c.h"
+#include "common/jsmn.h"

namespace chrono = std::chrono;
@ -69,7 +70,7 @@ CRetrieve(CSegmentInterface c_segment,
          uint64_t timestamp,
          CRetrieveResult** result) {
    auto future = AsyncRetrieve(
-        {}, c_segment, c_plan, timestamp, DEFAULT_MAX_OUTPUT_SIZE, false);
+        {}, c_segment, c_plan, timestamp, DEFAULT_MAX_OUTPUT_SIZE, false, 0);
    auto futurePtr = static_cast<milvus::futures::IFuture*>(
        static_cast<void*>(static_cast<CFuture*>(future)));

View File

@ -157,6 +157,7 @@ TEST_P(TaskTest, CallExprEmpty) {
        segment_.get(),
        100000,
        MAX_TIMESTAMP,
+        0,
        std::make_shared<milvus::exec::QueryConfig>(
            std::unordered_map<std::string, std::string>{}));
@ -194,6 +195,7 @@ TEST_P(TaskTest, UnaryExpr) {
        segment_.get(),
        100000,
        MAX_TIMESTAMP,
+        0,
        std::make_shared<milvus::exec::QueryConfig>(
            std::unordered_map<std::string, std::string>{}));
@ -240,6 +242,7 @@ TEST_P(TaskTest, LogicalExpr) {
        segment_.get(),
        100000,
        MAX_TIMESTAMP,
+        0,
        std::make_shared<milvus::exec::QueryConfig>(
            std::unordered_map<std::string, std::string>{}));

View File

@ -59,14 +59,18 @@ using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;

-class ExprTest : public ::testing::TestWithParam<
-                     std::pair<milvus::DataType, knowhere::MetricType>> {
+class ExprTest
+    : public ::testing::TestWithParam<
+          std::tuple<std::pair<milvus::DataType, knowhere::MetricType>, bool>> {
 public:
    void
    SetUp() override {
        auto param = GetParam();
-        data_type = param.first;
-        metric_type = param.second;
+        data_type = std::get<0>(param).first;  // Get the DataType from the pair
+        metric_type =
+            std::get<0>(param).second;  // Get the MetricType from the pair
+        GROWING_JSON_KEY_STATS_ENABLED =
+            std::get<1>(param);  // Get the bool parameter
    }

    // replace the metric type in the plan string with the proper type
@ -81,13 +85,29 @@ class ExprTest : public ::testing::TestWithParam<
    knowhere::MetricType metric_type;
};

+// Instantiate test suite with new bool parameter
INSTANTIATE_TEST_SUITE_P(
    ExprTestSuite,
    ExprTest,
    ::testing::Values(
-        std::pair(milvus::DataType::VECTOR_FLOAT, knowhere::metric::L2),
-        std::pair(milvus::DataType::VECTOR_SPARSE_FLOAT, knowhere::metric::IP),
-        std::pair(milvus::DataType::VECTOR_BINARY, knowhere::metric::JACCARD)));
+        std::make_tuple(std::pair(milvus::DataType::VECTOR_FLOAT,
+                                  knowhere::metric::L2),
+                        false),
+        std::make_tuple(std::pair(milvus::DataType::VECTOR_SPARSE_FLOAT,
+                                  knowhere::metric::IP),
+                        false),
+        std::make_tuple(std::pair(milvus::DataType::VECTOR_BINARY,
+                                  knowhere::metric::JACCARD),
+                        false),
+        std::make_tuple(std::pair(milvus::DataType::VECTOR_FLOAT,
+                                  knowhere::metric::L2),
+                        true),
+        std::make_tuple(std::pair(milvus::DataType::VECTOR_SPARSE_FLOAT,
+                                  knowhere::metric::IP),
+                        true),
+        std::make_tuple(std::pair(milvus::DataType::VECTOR_BINARY,
+                                  knowhere::metric::JACCARD),
+                        true)));

TEST_P(ExprTest, Range) {
    SUCCEED();
@ -842,7 +862,7 @@ TEST_P(ExprTest, TestBinaryRangeJSON) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    for (auto testcase : testcases) {
        auto check = [&](int64_t value) {
@ -966,7 +986,7 @@ TEST_P(ExprTest, TestBinaryRangeJSONNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    for (auto testcase : testcases) {
        auto check = [&](int64_t value, bool valid) {
@ -1085,7 +1105,7 @@ TEST_P(ExprTest, TestExistsJson) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    for (auto testcase : testcases) {
@ -1162,7 +1182,7 @@ TEST_P(ExprTest, TestExistsJsonNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    for (auto testcase : testcases) {
@ -1245,16 +1265,13 @@ TEST_P(ExprTest, TestUnaryRangeJson) {
        int64_t val;
        std::vector<std::string> nested_path;
    };
-    std::vector<Testcase> testcases{
-        {10, {"int"}},
-        {20, {"int"}},
-        {30, {"int"}},
-        {40, {"int"}},
-        {10, {"double"}},
-        {20, {"double"}},
-        {30, {"double"}},
-        {40, {"double"}},
-    };
+    std::vector<Testcase> testcases{{10, {"int"}},
+                                    {20, {"int"}},
+                                    {30, {"int"}},
+                                    {40, {"int"}},
+                                    {1, {"array", "0"}},
+                                    {2, {"array", "1"}},
+                                    {3, {"array", "2"}}};

    auto schema = std::make_shared<Schema>();
    auto i64_fid = schema->AddDebugField("id", DataType::INT64);
@ -1278,7 +1295,7 @@ TEST_P(ExprTest, TestUnaryRangeJson) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    std::vector<OpType> ops{
@ -1356,13 +1373,16 @@ TEST_P(ExprTest, TestUnaryRangeJson) {
            for (int i = 0; i < N * num_iters; ++i) {
                auto ans = final[i];
-                if (testcase.nested_path[0] == "int") {
+                if (testcase.nested_path[0] == "int" ||
+                    testcase.nested_path[0] == "array") {
                    auto val =
                        milvus::Json(simdjson::padded_string(json_col[i]))
                            .template at<int64_t>(pointer)
                            .value();
                    auto ref = f(val);
-                    ASSERT_EQ(ans, ref);
+                    ASSERT_EQ(ans, ref) << "@" << i << "op" << op;
+
                    if (i % 2 == 0) {
                        ASSERT_EQ(view[int(i / 2)], ref);
                    }
@ -1381,6 +1401,272 @@ TEST_P(ExprTest, TestUnaryRangeJson) {
        }
    }
{
struct Testcase {
            double val;  // the testcases below hold double values (1.1 ... 1e40)
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{{1.1, {"double"}},
{2.2, {"double"}},
{3.3, {"double"}},
{4.4, {"double"}},
{1e40, {"double"}}};
auto schema = std::make_shared<Schema>();
auto i64_fid = schema->AddDebugField("id", DataType::INT64);
auto json_fid = schema->AddDebugField("json", DataType::JSON);
schema->set_primary_field_id(i64_fid);
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<std::string> json_col;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_json_col = raw_data.get_col<std::string>(json_fid);
json_col.insert(
json_col.end(), new_json_col.begin(), new_json_col.end());
seg->PreInsert(N);
seg->Insert(iter * N,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
std::vector<OpType> ops{
OpType::Equal,
OpType::NotEqual,
OpType::GreaterThan,
OpType::GreaterEqual,
OpType::LessThan,
OpType::LessEqual,
};
for (const auto& testcase : testcases) {
auto check = [&](double value) { return value == testcase.val; };
std::function<bool(double)> f = check;
for (auto& op : ops) {
switch (op) {
case OpType::Equal: {
f = [&](double value) { return value == testcase.val; };
break;
}
case OpType::NotEqual: {
f = [&](double value) { return value != testcase.val; };
break;
}
case OpType::GreaterEqual: {
f = [&](double value) { return value >= testcase.val; };
break;
}
case OpType::GreaterThan: {
f = [&](double value) { return value > testcase.val; };
break;
}
case OpType::LessEqual: {
f = [&](double value) { return value <= testcase.val; };
break;
}
case OpType::LessThan: {
f = [&](double value) { return value < testcase.val; };
break;
}
default: {
PanicInfo(Unsupported, "unsupported range node");
}
}
auto pointer = milvus::Json::pointer(testcase.nested_path);
proto::plan::GenericValue value;
value.set_float_val(testcase.val);
auto expr =
std::make_shared<milvus::expr::UnaryRangeFilterExpr>(
milvus::expr::ColumnInfo(
json_fid, DataType::JSON, testcase.nested_path),
op,
value,
std::vector<proto::plan::GenericValue>{});
auto plan = std::make_shared<plan::FilterBitsNode>(
DEFAULT_PLANNODE_ID, expr);
auto final = ExecuteQueryExpr(
plan, seg_promote, N * num_iters, MAX_TIMESTAMP);
EXPECT_EQ(final.size(), N * num_iters);
// specify some offsets and do scalar filtering on these offsets
milvus::exec::OffsetVector offsets;
offsets.reserve(N * num_iters / 2);
for (auto i = 0; i < N * num_iters; ++i) {
if (i % 2 == 0) {
offsets.emplace_back(i);
}
}
auto col_vec = milvus::test::gen_filter_res(plan.get(),
seg_promote,
N * num_iters,
MAX_TIMESTAMP,
&offsets);
BitsetTypeView view(col_vec->GetRawData(), col_vec->size());
EXPECT_EQ(view.size(), N * num_iters / 2);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
auto val =
milvus::Json(simdjson::padded_string(json_col[i]))
.template at<double>(pointer)
.value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
if (i % 2 == 0) {
ASSERT_EQ(view[int(i / 2)], ref);
}
}
}
}
}
{
struct Testcase {
std::string val;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{"abc", {"string"}},
{"This is a line break\\nThis is a new line!", {"string"}}};
auto schema = std::make_shared<Schema>();
auto i64_fid = schema->AddDebugField("id", DataType::INT64);
auto json_fid = schema->AddDebugField("json", DataType::JSON);
schema->set_primary_field_id(i64_fid);
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<std::string> json_col;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_json_col = raw_data.get_col<std::string>(json_fid);
json_col.insert(
json_col.end(), new_json_col.begin(), new_json_col.end());
seg->PreInsert(N);
seg->Insert(iter * N,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
std::vector<OpType> ops{
OpType::Equal,
OpType::NotEqual,
OpType::GreaterThan,
OpType::GreaterEqual,
OpType::LessThan,
OpType::LessEqual,
};
for (const auto& testcase : testcases) {
auto check = [&](std::string_view value) {
return value == testcase.val;
};
std::function<bool(std::string_view)> f = check;
for (auto& op : ops) {
switch (op) {
case OpType::Equal: {
f = [&](std::string_view value) {
return value == testcase.val;
};
break;
}
case OpType::NotEqual: {
f = [&](std::string_view value) {
return value != testcase.val;
};
break;
}
case OpType::GreaterEqual: {
f = [&](std::string_view value) {
return value >= testcase.val;
};
break;
}
case OpType::GreaterThan: {
f = [&](std::string_view value) {
return value > testcase.val;
};
break;
}
case OpType::LessEqual: {
f = [&](std::string_view value) {
return value <= testcase.val;
};
break;
}
case OpType::LessThan: {
f = [&](std::string_view value) {
return value < testcase.val;
};
break;
}
default: {
PanicInfo(Unsupported, "unsupported range node");
}
}
auto pointer = milvus::Json::pointer(testcase.nested_path);
proto::plan::GenericValue value;
value.set_string_val(testcase.val);
auto expr =
std::make_shared<milvus::expr::UnaryRangeFilterExpr>(
milvus::expr::ColumnInfo(
json_fid, DataType::JSON, testcase.nested_path),
op,
value,
std::vector<proto::plan::GenericValue>{});
auto plan = std::make_shared<plan::FilterBitsNode>(
DEFAULT_PLANNODE_ID, expr);
auto final = ExecuteQueryExpr(
plan, seg_promote, N * num_iters, MAX_TIMESTAMP);
EXPECT_EQ(final.size(), N * num_iters);
// specify some offsets and do scalar filtering on these offsets
milvus::exec::OffsetVector offsets;
offsets.reserve(N * num_iters / 2);
for (auto i = 0; i < N * num_iters; ++i) {
if (i % 2 == 0) {
offsets.emplace_back(i);
}
}
auto col_vec = milvus::test::gen_filter_res(plan.get(),
seg_promote,
N * num_iters,
MAX_TIMESTAMP,
&offsets);
BitsetTypeView view(col_vec->GetRawData(), col_vec->size());
EXPECT_EQ(view.size(), N * num_iters / 2);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
auto val =
milvus::Json(simdjson::padded_string(json_col[i]))
.template at<std::string_view>(pointer)
.value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
if (i % 2 == 0) {
ASSERT_EQ(view[int(i / 2)], ref);
}
}
}
}
}
struct TestArrayCase {
    proto::plan::GenericValue val;
    std::vector<std::string> nested_path;
@ -1457,16 +1743,13 @@ TEST_P(ExprTest, TestUnaryRangeJsonNullable) {
        int64_t val;
        std::vector<std::string> nested_path;
    };
-    std::vector<Testcase> testcases{
-        {10, {"int"}},
-        {20, {"int"}},
-        {30, {"int"}},
-        {40, {"int"}},
-        {10, {"double"}},
-        {20, {"double"}},
-        {30, {"double"}},
-        {40, {"double"}},
-    };
+    std::vector<Testcase> testcases{{10, {"int"}},
+                                    {20, {"int"}},
+                                    {30, {"int"}},
+                                    {40, {"int"}},
+                                    {1, {"array", "0"}},
+                                    {2, {"array", "1"}},
+                                    {3, {"array", "2"}}};

    auto schema = std::make_shared<Schema>();
    auto i64_fid = schema->AddDebugField("id", DataType::INT64);
@ -1492,7 +1775,7 @@ TEST_P(ExprTest, TestUnaryRangeJsonNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    std::vector<OpType> ops{
@ -1717,7 +2000,7 @@ TEST_P(ExprTest, TestTermJson) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    for (auto testcase : testcases) {
@ -1810,7 +2093,7 @@ TEST_P(ExprTest, TestTermJsonNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    for (auto testcase : testcases) {
@ -11573,7 +11856,7 @@ TEST_P(ExprTest, TestUnaryRangeWithJSON) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    int offset = 0;
@ -11833,7 +12116,7 @@ TEST_P(ExprTest, TestUnaryRangeWithJSONNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    int offset = 0;
@ -12139,7 +12422,7 @@ TEST_P(ExprTest, TestTermWithJSON) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    int offset = 0;
@ -12372,7 +12655,7 @@ TEST_P(ExprTest, TestTermWithJSONNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    int offset = 0;
@ -12550,7 +12833,7 @@ TEST_P(ExprTest, TestExistsWithJSON) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    int offset = 0;
@ -12778,7 +13061,7 @@ TEST_P(ExprTest, TestExistsWithJSONNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
    int offset = 0;
@ -13661,7 +13944,7 @@ TEST_P(ExprTest, TestJsonContainsAny) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
@ -13951,7 +14234,7 @@ TEST_P(ExprTest, TestJsonContainsAnyNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
@ -14252,7 +14535,7 @@ TEST_P(ExprTest, TestJsonContainsAll) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    std::vector<Testcase<bool>> bool_testcases{{{true, true}, {"bool"}},
@ -14566,7 +14849,7 @@ TEST_P(ExprTest, TestJsonContainsAllNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    std::vector<Testcase<bool>> bool_testcases{{{true, true}, {"bool"}},
@ -14890,7 +15173,7 @@ TEST_P(ExprTest, TestJsonContainsArray) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
@ -15278,7 +15561,7 @@ TEST_P(ExprTest, TestJsonContainsArrayNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    proto::plan::GenericValue generic_a;
@ -15702,7 +15985,7 @@ TEST_P(ExprTest, TestJsonContainsDiffTypeArray) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    proto::plan::GenericValue int_value;
@ -15833,7 +16116,7 @@ TEST_P(ExprTest, TestJsonContainsDiffTypeArrayNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
@ -15968,7 +16251,7 @@ TEST_P(ExprTest, TestJsonContainsDiffType) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    proto::plan::GenericValue int_val;
@ -16103,7 +16386,7 @@ TEST_P(ExprTest, TestJsonContainsDiffTypeNullable) {
                    raw_data.timestamps_.data(),
                    raw_data.raw_);
    }
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());

    query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);

View File

@ -0,0 +1,588 @@
// Copyright (C) 2019 - 2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <functional>
#include <random>
#include <boost/filesystem.hpp>
#include <unordered_set>
#include <memory>
#include "common/Tracer.h"
#include "index/BitmapIndex.h"
#include "storage/Util.h"
#include "storage/InsertData.h"
#include "indexbuilder/IndexFactory.h"
#include "index/IndexFactory.h"
#include "test_utils/indexbuilder_test_utils.h"
#include "index/Meta.h"
#include "index/JsonKeyStatsInvertedIndex.h"
#include "common/Json.h"
#include "common/Types.h"
using namespace milvus::index;
using namespace milvus::indexbuilder;
using namespace milvus;
static std::vector<milvus::Json>
GenerateJsons(int size) {
std::vector<Json> jsons;
std::default_random_engine random(42);
std::normal_distribution<> distr(0, 1);
for (int i = 0; i < size; i++) {
auto str = R"({"int":)" + std::to_string(random()) + R"(,"double":)" +
std::to_string(static_cast<double>(random())) +
R"(,"string":")" + std::to_string(random()) +
R"(","bool": true)" + R"(, "array": [1,2,3])" + "}";
jsons.push_back(milvus::Json(simdjson::padded_string(str)));
}
return jsons;
}
class JsonKeyStatsIndexTest : public ::testing::TestWithParam<bool> {
protected:
void
Init(int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id,
int64_t index_build_id,
int64_t index_version,
int64_t size) {
proto::schema::FieldSchema field_schema;
field_schema.set_data_type(proto::schema::DataType::JSON);
field_schema.set_nullable(nullable_);
auto field_meta = storage::FieldDataMeta{
collection_id, partition_id, segment_id, field_id, field_schema};
auto index_meta = storage::IndexMeta{
segment_id, field_id, index_build_id, index_version};
data_ = std::move(GenerateJsons(size));
auto field_data = storage::CreateFieldData(DataType::JSON, nullable_);
if (nullable_) {
valid_data.reserve(size_);
for (size_t i = 0; i < size_; i++) {
valid_data.push_back(false);
}
}
if (nullable_) {
int byteSize = (size_ + 7) / 8;
uint8_t* valid_data_ = new uint8_t[byteSize];
for (int i = 0; i < size_; i++) {
bool value = valid_data[i];
int byteIndex = i / 8;
int bitIndex = i % 8;
if (value) {
valid_data_[byteIndex] |= (1 << bitIndex);
} else {
valid_data_[byteIndex] &= ~(1 << bitIndex);
}
}
field_data->FillFieldData(data_.data(), valid_data_, data_.size());
delete[] valid_data_;
} else {
field_data->FillFieldData(data_.data(), data_.size());
}
storage::InsertData insert_data(field_data);
insert_data.SetFieldDataMeta(field_meta);
insert_data.SetTimestamps(0, 100);
auto serialized_bytes = insert_data.Serialize(storage::Remote);
auto log_path = fmt::format("/{}/{}/{}/{}/{}/{}",
"/tmp/test-jsonkey-index/",
collection_id,
partition_id,
segment_id,
field_id,
0);
chunk_manager_->Write(
log_path, serialized_bytes.data(), serialized_bytes.size());
storage::FileManagerContext ctx(field_meta, index_meta, chunk_manager_);
std::vector<std::string> index_files;
Config config;
config["insert_files"] = std::vector<std::string>{log_path};
auto build_index =
std::make_shared<JsonKeyStatsInvertedIndex>(ctx, false);
build_index->Build(config);
auto create_index_result = build_index->Upload(config);
auto memSize = create_index_result->GetMemSize();
auto serializedSize = create_index_result->GetSerializedSize();
ASSERT_GT(memSize, 0);
ASSERT_GT(serializedSize, 0);
index_files = create_index_result->GetIndexFiles();
index::CreateIndexInfo index_info{};
config["index_files"] = index_files;
index_ = std::make_shared<JsonKeyStatsInvertedIndex>(ctx, true);
index_->Load(milvus::tracer::TraceContext{}, config);
}
void
SetUp() override {
nullable_ = GetParam();
type_ = DataType::JSON;
int64_t collection_id = 1;
int64_t partition_id = 2;
int64_t segment_id = 3;
int64_t field_id = 101;
int64_t index_build_id = 1000;
int64_t index_version = 10000;
size_ = 1;
std::string root_path = "/tmp/test-jsonkey-index/";
storage::StorageConfig storage_config;
storage_config.storage_type = "local";
storage_config.root_path = root_path;
chunk_manager_ = storage::CreateChunkManager(storage_config);
Init(collection_id,
partition_id,
segment_id,
field_id,
index_build_id,
index_version,
size_);
}
virtual ~JsonKeyStatsIndexTest() override {
boost::filesystem::remove_all(chunk_manager_->GetRootPath());
}
public:
std::shared_ptr<JsonKeyStatsInvertedIndex> index_;
DataType type_;
bool nullable_;
size_t size_;
FixedVector<bool> valid_data;
std::vector<milvus::Json> data_;
std::vector<std::string> json_col;
std::shared_ptr<storage::ChunkManager> chunk_manager_;
};
INSTANTIATE_TEST_SUITE_P(JsonKeyStatsIndexTestSuite,
JsonKeyStatsIndexTest,
::testing::Values(true, false));
TEST_P(JsonKeyStatsIndexTest, HasEscapeSequence) {
EXPECT_TRUE(index_->has_escape_sequence("Hello\\nWorld"));
EXPECT_TRUE(index_->has_escape_sequence("Tab\\tCharacter"));
EXPECT_TRUE(index_->has_escape_sequence("Carriage\\rReturn"));
EXPECT_TRUE(index_->has_escape_sequence("Backspace\\bTest"));
EXPECT_TRUE(index_->has_escape_sequence("FormFeed\\fTest"));
EXPECT_TRUE(index_->has_escape_sequence("Vertical\\vTab"));
EXPECT_TRUE(index_->has_escape_sequence("Backslash\\\\Test"));
EXPECT_TRUE(index_->has_escape_sequence("Quote\\\"Test"));
EXPECT_TRUE(index_->has_escape_sequence("SingleQuote\\'Test"));
EXPECT_FALSE(index_->has_escape_sequence("No escape sequence here"));
EXPECT_FALSE(index_->has_escape_sequence("Just a backslash \\"));
EXPECT_FALSE(index_->has_escape_sequence(""));
}
TEST_P(JsonKeyStatsIndexTest, TestTermInFunc) {
struct Testcase {
std::vector<int64_t> term;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{{1, 2, 3, 4}, {"int"}},
{{10, 100, 1000, 10000}, {"int"}},
{{100, 10000, 9999, 444}, {"int"}},
{{23, 42, 66, 17, 25}, {"int"}},
};
for (auto testcase : testcases) {
auto check = [&](int64_t value) {
std::unordered_set<int64_t> term_set(testcase.term.begin(),
testcase.term.end());
return term_set.find(value) != term_set.end();
};
std::unordered_set<int64_t> term_set(testcase.term.begin(),
testcase.term.end());
auto filter_func = [&term_set, this](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
return term_set.find(int64_t(value)) != term_set.end();
};
auto pointer = milvus::Json::pointer(testcase.nested_path);
auto bitset =
index_->FilterByPath(pointer, size_, false, true, filter_func);
ASSERT_EQ(bitset.size(), size_);
for (int i = 0; i < bitset.size(); ++i) {
if (nullable_ && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
auto val = data_[i].template at<int64_t>(pointer).value();
auto ans = bitset[i];
auto ref = check(val);
ASSERT_EQ(ans, ref);
}
}
}
}
TEST_P(JsonKeyStatsIndexTest, TestUnaryRangeInFunc) {
struct Testcase {
int64_t val;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{10, {"int"}},
{20, {"int"}},
{30, {"int"}},
{40, {"int"}},
};
std::vector<OpType> ops{
OpType::Equal,
OpType::NotEqual,
OpType::GreaterThan,
OpType::GreaterEqual,
OpType::LessThan,
OpType::LessEqual,
};
for (const auto& testcase : testcases) {
auto check = [&](int64_t value) { return value == testcase.val; };
std::function<bool(int64_t)> f = check;
for (auto& op : ops) {
switch (op) {
case OpType::Equal: {
f = [&](int64_t value) { return value == testcase.val; };
break;
}
case OpType::NotEqual: {
f = [&](int64_t value) { return value != testcase.val; };
break;
}
case OpType::GreaterEqual: {
f = [&](int64_t value) { return value >= testcase.val; };
break;
}
case OpType::GreaterThan: {
f = [&](int64_t value) { return value > testcase.val; };
break;
}
case OpType::LessEqual: {
f = [&](int64_t value) { return value <= testcase.val; };
break;
}
case OpType::LessThan: {
f = [&](int64_t value) { return value < testcase.val; };
break;
}
default: {
PanicInfo(Unsupported, "unsupported range node");
}
}
auto filter_func = [&op, &testcase, this](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
switch (op) {
case OpType::GreaterThan:
return int64_t(value) > testcase.val;
case OpType::GreaterEqual:
return int64_t(value) >= testcase.val;
case OpType::LessThan:
return int64_t(value) < testcase.val;
case OpType::LessEqual:
return int64_t(value) <= testcase.val;
case OpType::Equal:
return int64_t(value) == testcase.val;
case OpType::NotEqual:
return int64_t(value) != testcase.val;
default:
return false;
}
};
auto pointer = milvus::Json::pointer(testcase.nested_path);
auto bitset =
index_->FilterByPath(pointer, size_, false, true, filter_func);
ASSERT_EQ(bitset.size(), size_);
for (int i = 0; i < bitset.size(); ++i) {
if (nullable_ && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
auto ans = bitset[i];
if (testcase.nested_path[0] == "int") {
auto val =
data_[i].template at<int64_t>(pointer).value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
} else {
auto val =
data_[i].template at<double>(pointer).value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
}
}
}
}
}
}
TEST_P(JsonKeyStatsIndexTest, TestBinaryRangeInFunc) {
struct Testcase {
bool lower_inclusive;
bool upper_inclusive;
int64_t lower;
int64_t upper;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{true, false, 10, 20, {"int"}},
{true, true, 20, 30, {"int"}},
{false, true, 30, 40, {"int"}},
{false, false, 40, 50, {"int"}},
{true, false, 10, 20, {"double"}},
{true, true, 20, 30, {"double"}},
{false, true, 30, 40, {"double"}},
{false, false, 40, 50, {"double"}},
};
for (const auto& testcase : testcases) {
auto check = [&](int64_t value) {
if (testcase.lower_inclusive && testcase.upper_inclusive) {
return testcase.lower <= value && value <= testcase.upper;
} else if (testcase.lower_inclusive && !testcase.upper_inclusive) {
return testcase.lower <= value && value < testcase.upper;
} else if (!testcase.lower_inclusive && testcase.upper_inclusive) {
return testcase.lower < value && value <= testcase.upper;
} else {
return testcase.lower < value && value < testcase.upper;
}
};
auto filter_func = [&testcase, this](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
if (testcase.lower_inclusive && testcase.upper_inclusive) {
return testcase.lower <= int64_t(value) &&
int64_t(value) <= testcase.upper;
} else if (testcase.lower_inclusive &&
!testcase.upper_inclusive) {
return testcase.lower <= int64_t(value) &&
int64_t(value) < testcase.upper;
} else if (!testcase.lower_inclusive &&
testcase.upper_inclusive) {
return testcase.lower < int64_t(value) &&
int64_t(value) <= testcase.upper;
} else {
return testcase.lower < int64_t(value) &&
int64_t(value) < testcase.upper;
}
} else {
auto val =
this->data_[row_id].template at<int64_t>(offset, size);
if (val.error()) {
return false;
}
if (testcase.lower_inclusive && testcase.upper_inclusive) {
return testcase.lower <= int64_t(val.value()) &&
int64_t(val.value()) <= testcase.upper;
} else if (testcase.lower_inclusive &&
!testcase.upper_inclusive) {
return testcase.lower <= int64_t(val.value()) &&
int64_t(val.value()) < testcase.upper;
} else if (!testcase.lower_inclusive &&
testcase.upper_inclusive) {
return testcase.lower < int64_t(val.value()) &&
int64_t(val.value()) <= testcase.upper;
} else {
return testcase.lower < int64_t(val.value()) &&
int64_t(val.value()) < testcase.upper;
}
}
};
auto pointer = milvus::Json::pointer(testcase.nested_path);
auto bitset =
index_->FilterByPath(pointer, size_, false, true, filter_func);
ASSERT_EQ(bitset.size(), size_);
for (int i = 0; i < bitset.size(); ++i) {
if (nullable_ && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
auto ans = bitset[i];
if (testcase.nested_path[0] == "int") {
auto val = data_[i].template at<int64_t>(pointer).value();
auto ref = check(val);
ASSERT_EQ(ans, ref);
} else {
auto val = data_[i].template at<double>(pointer).value();
auto ref = check(val);
ASSERT_EQ(ans, ref);
}
}
}
}
}
TEST_P(JsonKeyStatsIndexTest, TestExistInFunc) {
struct Testcase {
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{{"A"}},
{{"int"}},
{{"double"}},
{{"B"}},
};
for (const auto& testcase : testcases) {
auto pointer = milvus::Json::pointer(testcase.nested_path);
auto filter_func = [&pointer, this](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
return this->data_[row_id].exist(pointer);
};
auto bitset =
index_->FilterByPath(pointer, size_, false, true, filter_func);
ASSERT_EQ(bitset.size(), size_);
for (int i = 0; i < bitset.size(); ++i) {
if (nullable_ && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
auto ans = bitset[i];
auto val = data_[i].exist(pointer);
ASSERT_EQ(ans, val);
}
}
}
}
TEST_P(JsonKeyStatsIndexTest, TestJsonContainsAllFunc) {
struct Testcase {
std::vector<int64_t> term;
std::vector<std::string> nested_path;
};
{
std::vector<Testcase> testcases{
{{1, 2, 3}, {"array"}},
{{10, 100}, {"array"}},
{{100, 1000}, {"array"}},
};
for (const auto& testcase : testcases) {
auto check = [&](const std::vector<int64_t>& values) {
for (auto const& e : testcase.term) {
if (std::find(values.begin(), values.end(), e) ==
values.end()) {
return false;
}
}
return true;
};
auto pointer = milvus::Json::pointer(testcase.nested_path);
std::unordered_set<int64_t> elements;
for (auto const& element : testcase.term) {
elements.insert(element);
}
auto filter_func = [&elements, this](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
auto array = this->data_[row_id].array_at(offset, size);
std::unordered_set<int64_t> tmp_elements(elements);
for (auto&& it : array) {
auto val = it.template get<int64_t>();
if (val.error()) {
continue;
}
tmp_elements.erase(val.value());
if (tmp_elements.size() == 0) {
return true;
}
}
return tmp_elements.empty();
};
auto bitset =
index_->FilterByPath(pointer, size_, false, true, filter_func);
ASSERT_EQ(bitset.size(), size_);
for (int i = 0; i < bitset.size(); ++i) {
if (nullable_ && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
auto ans = bitset[i];
auto array = data_[i].array_at(pointer);
std::vector<int64_t> res;
for (const auto& element : array) {
res.push_back(element.template get<int64_t>());
}
ASSERT_EQ(ans, check(res));
}
}
}
}
}
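Note: the contains-all filter above works by erasing matched array values from a temporary copy of the element set and succeeding once that copy is empty. A standalone sketch of the same check, written in Go purely for illustration (all names here are made up, not from this commit):

    package main

    import "fmt"

    // containsAll reports whether every element of want appears in got —
    // the same semantics as the filter_func above, which erases matches
    // from a copy of the element set until nothing is pending.
    func containsAll(got []int64, want []int64) bool {
        pending := make(map[int64]struct{}, len(want))
        for _, w := range want {
            pending[w] = struct{}{}
        }
        for _, g := range got {
            delete(pending, g)
            if len(pending) == 0 {
                return true
            }
        }
        return false
    }

    func main() {
        fmt.Println(containsAll([]int64{1, 2, 3}, []int64{1, 3}))  // true
        fmt.Println(containsAll([]int64{1, 2, 3}, []int64{10, 3})) // false
    }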
TEST(GrowingJsonKeyStatsIndexTest, GrowingIndex) {
using Index = index::JsonKeyStatsInvertedIndex;
auto index = std::make_unique<Index>(std::numeric_limits<int64_t>::max(),
"json",
"/tmp/test-jsonkey-index/");
auto str = R"({"int":)" + std::to_string(1) + R"(,"double":)" +
std::to_string(static_cast<double>(1)) + R"(,"string":")" +
std::to_string(1) + R"(","bool": true)" +
R"(, "array": [1,2,3])" + "}";
auto str1 = R"({"int":)" + std::to_string(2) + "}";
auto str2 = R"({"int":)" + std::to_string(3) + "}";
std::vector<std::string> jsonDatas;
jsonDatas.push_back(str);
jsonDatas.push_back(str1);
jsonDatas.push_back(str2);
std::vector<milvus::Json> jsons;
for (const auto& jsonData : jsonDatas) {
jsons.push_back(milvus::Json(simdjson::padded_string(jsonData)));
}
index->CreateReader();
index->AddJSONDatas(jsonDatas.size(), jsonDatas.data(), nullptr, 0);
index->Commit();
index->Reload();
int64_t checkVal = 1;
auto filter_func = [jsons, checkVal](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (value == checkVal) {
return true;
}
return false;
};
auto pointer = milvus::Json::pointer({"int"});
auto bitset =
index->FilterByPath(pointer, jsonDatas.size(), true, true, filter_func);
ASSERT_EQ(bitset.size(), jsonDatas.size());
for (int i = 0; i < bitset.size(); ++i) {
auto val = jsons[i].template at<int64_t>(pointer).value();
auto ans = bitset[i];
auto ref = val == checkVal;
ASSERT_EQ(ans, ref);
}
}
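Note: the Init helper in this test packs row validity into a byte bitmap, least-significant bit first: row i lives at byte i/8, bit i%8. A standalone Go sketch of the same layout, for illustration only:

    package main

    import "fmt"

    // packValidity packs per-row validity flags into a byte bitmap,
    // LSB-first within each byte: row i lives at byte i/8, bit i%8 —
    // the layout the test above fills by hand.
    func packValidity(valid []bool) []byte {
        bitmap := make([]byte, (len(valid)+7)/8)
        for i, v := range valid {
            if v {
                bitmap[i/8] |= 1 << (i % 8)
            }
        }
        return bitmap
    }

    func main() {
        // rows 0 and 2 valid -> first byte is 0b00000101.
        fmt.Printf("%08b\n", packValidity([]bool{true, false, true})[0])
    }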

View File

@ -171,7 +171,7 @@ TEST_F(GrowingSegmentRegexQueryTest, RegexQueryOnJsonField) {
    auto typed_expr = parser.ParseExprs(*expr);
    auto parsed =
        std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);
-
+    std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
    auto segpromote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
    BitsetType final;
    final = ExecuteQueryExpr(parsed, segpromote, N, MAX_TIMESTAMP);

View File

@ -187,7 +187,7 @@ CSearch(CSegmentInterface c_segment,
        uint64_t timestamp,
        CSearchResult* result) {
    auto future =
-        AsyncSearch({}, c_segment, c_plan, c_placeholder_group, timestamp);
+        AsyncSearch({}, c_segment, c_plan, c_placeholder_group, timestamp, 0);
    auto futurePtr = static_cast<milvus::futures::IFuture*>(
        static_cast<void*>(static_cast<CFuture*>(future)));

View File

@ -54,7 +54,8 @@ get_default_mmap_config() {
        .disk_limit =
            uint64_t(2) * uint64_t(1024) * uint64_t(1024) * uint64_t(1024),
        .fix_file_size = uint64_t(4) * uint64_t(1024) * uint64_t(1024),
-        .growing_enable_mmap = false};
+        .growing_enable_mmap = false,
+    };
    return mmap_config;
}

View File

@ -164,6 +164,7 @@ func (gc *garbageCollector) work(ctx context.Context) {
			gc.recycleUnusedSegIndexes(ctx)
			gc.recycleUnusedAnalyzeFiles(ctx)
			gc.recycleUnusedTextIndexFiles(ctx)
+			gc.recycleUnusedJSONIndexFiles(ctx)
		})
	}()
	go func() {
@ -470,11 +471,16 @@ func (gc *garbageCollector) recycleDroppedSegments(ctx context.Context) {
			logs[key] = struct{}{}
		}

+		for key := range getJSONKeyLogs(segment, gc) {
+			logs[key] = struct{}{}
+		}
+
		log.Info("GC segment start...", zap.Int("insert_logs", len(segment.GetBinlogs())),
			zap.Int("delta_logs", len(segment.GetDeltalogs())),
			zap.Int("stats_logs", len(segment.GetStatslogs())),
			zap.Int("bm25_logs", len(segment.GetBm25Statslogs())),
-			zap.Int("text_logs", len(segment.GetTextStatsLogs())))
+			zap.Int("text_logs", len(segment.GetTextStatsLogs())),
+			zap.Int("json_key_logs", len(segment.GetJsonKeyStats())))
		if err := gc.removeObjectFiles(ctx, logs); err != nil {
			log.Warn("GC segment remove logs failed", zap.Error(err))
			continue
@ -585,6 +591,20 @@ func getTextLogs(sinfo *SegmentInfo) map[string]struct{} {
	return textLogs
}

+func getJSONKeyLogs(sinfo *SegmentInfo, gc *garbageCollector) map[string]struct{} {
+	jsonkeyLogs := make(map[string]struct{})
+	for _, flog := range sinfo.GetJsonKeyStats() {
+		for _, file := range flog.GetFiles() {
+			prefix := fmt.Sprintf("%s/%s/%d/%d/%d/%d/%d/%d", gc.option.cli.RootPath(), common.JSONIndexPath,
+				flog.GetBuildID(), flog.GetVersion(), sinfo.GetCollectionID(), sinfo.GetPartitionID(), sinfo.GetID(), flog.GetFieldID())
+			file = path.Join(prefix, file)
+			jsonkeyLogs[file] = struct{}{}
+		}
+	}
+	return jsonkeyLogs
+}
+
// removeObjectFiles remove file from oss storage, return error if any log failed to remove.
func (gc *garbageCollector) removeObjectFiles(ctx context.Context, filePaths map[string]struct{}) error {
	futures := make([]*conc.Future[struct{}], 0)
@ -904,3 +924,64 @@ func (gc *garbageCollector) recycleUnusedTextIndexFiles(ctx context.Context) {
	metrics.GarbageCollectorRunCount.WithLabelValues(fmt.Sprint(paramtable.GetNodeID())).Add(1)
}
// recycleUnusedJSONIndexFiles loads JSON key stats meta info and walks the matching
// object-storage keys; files belonging to versions older than the current one are removed.
func (gc *garbageCollector) recycleUnusedJSONIndexFiles(ctx context.Context) {
start := time.Now()
log := log.Ctx(ctx).With(zap.String("gcName", "recycleUnusedJSONIndexFiles"), zap.Time("startAt", start))
log.Info("start recycleUnusedJSONIndexFiles...")
defer func() { log.Info("recycleUnusedJSONIndexFiles done", zap.Duration("timeCost", time.Since(start))) }()
hasJSONIndexSegments := gc.meta.SelectSegments(ctx, SegmentFilterFunc(func(info *SegmentInfo) bool {
return len(info.GetJsonKeyStats()) != 0
}))
fileNum := 0
deletedFilesNum := atomic.NewInt32(0)
for _, seg := range hasJSONIndexSegments {
for _, fieldStats := range seg.GetJsonKeyStats() {
log := log.With(zap.Int64("segmentID", seg.GetID()), zap.Int64("fieldID", fieldStats.GetFieldID()))
// clear low version task
for i := int64(1); i < fieldStats.GetVersion(); i++ {
prefix := fmt.Sprintf("%s/%s/%d/%d/%d/%d/%d/%d", gc.option.cli.RootPath(), common.JSONIndexPath,
fieldStats.GetBuildID(), i, seg.GetCollectionID(), seg.GetPartitionID(), seg.GetID(), fieldStats.GetFieldID())
futures := make([]*conc.Future[struct{}], 0)
err := gc.option.cli.WalkWithPrefix(ctx, prefix, true, func(files *storage.ChunkObjectInfo) bool {
file := files.FilePath
future := gc.option.removeObjectPool.Submit(func() (struct{}, error) {
log := log.With(zap.String("file", file))
log.Info("garbageCollector recycleUnusedJSONIndexFiles remove file...")
if err := gc.option.cli.Remove(ctx, file); err != nil {
log.Warn("garbageCollector recycleUnusedJSONIndexFiles remove file failed", zap.Error(err))
return struct{}{}, err
}
deletedFilesNum.Inc()
log.Info("garbageCollector recycleUnusedJSONIndexFiles remove file success")
return struct{}{}, nil
})
futures = append(futures, future)
return true
})
// Wait for all remove tasks done.
if err := conc.BlockOnAll(futures...); err != nil {
// error is logged, and can be ignored here.
log.Warn("some task failure in remove object pool", zap.Error(err))
}
log = log.With(zap.Int("deleteJSONKeyIndexNum", int(deletedFilesNum.Load())), zap.Int("walkFileNum", fileNum))
if err != nil {
log.Warn("json index files recycle failed when walk with prefix", zap.Error(err))
return
}
}
}
}
log.Info("json index files recycle done")
metrics.GarbageCollectorRunCount.WithLabelValues(fmt.Sprint(paramtable.GetNodeID())).Add(1)
}
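Note: the recycler above deletes every version prefix strictly below a field's current stats version. A standalone Go sketch of that sweep; the path segment standing in for common.JSONIndexPath and all IDs below are illustrative assumptions, not values from this commit:

    package main

    import "fmt"

    // staleVersionPrefixes lists the object-storage prefixes holding
    // outdated JSON key stats: one prefix per version below the current
    // one, mirroring the `for i := int64(1); i < version; i++` sweep above.
    func staleVersionPrefixes(root string, buildID, current, collID, partID, segID, fieldID int64) []string {
        var prefixes []string
        for v := int64(1); v < current; v++ {
            prefixes = append(prefixes, fmt.Sprintf("%s/json-key-index-log/%d/%d/%d/%d/%d/%d",
                root, buildID, v, collID, partID, segID, fieldID))
        }
        return prefixes
    }

    func main() {
        for _, p := range staleVersionPrefixes("files", 1000, 3, 1, 2, 3, 101) {
            fmt.Println(p) // versions 1 and 2 are candidates for removal
        }
    }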

View File

@ -440,7 +440,6 @@ func newSegmentIndexMeta(catalog metastore.DataCoordCatalog) *indexMeta {
}

func TestMeta_CreateIndex(t *testing.T) {
-
	indexParams := []*commonpb.KeyValuePair{
		{
			Key:   common.IndexTypeKey,

View File

@ -73,6 +73,9 @@ func (jm *statsJobManager) triggerStatsTaskLoop() {
	ticker := time.NewTicker(Params.DataCoordCfg.TaskCheckInterval.GetAsDuration(time.Second))
	defer ticker.Stop()

+	lastJSONStatsLastTrigger := time.Now().Unix()
+	maxJSONStatsTaskCount := 0
+
	for {
		select {
		case <-jm.ctx.Done():
@ -82,6 +85,7 @@ func (jm *statsJobManager) triggerStatsTaskLoop() {
			jm.triggerSortStatsTask()
			jm.triggerTextStatsTask()
			jm.triggerBM25StatsTask()
+			lastJSONStatsLastTrigger, maxJSONStatsTaskCount = jm.triggerJsonKeyIndexStatsTask(lastJSONStatsLastTrigger, maxJSONStatsTaskCount)

		case segID := <-getStatsTaskChSingleton():
			log.Info("receive new segment to trigger stats task", zap.Int64("segmentID", segID))
@ -141,10 +145,21 @@ func needDoTextIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool {
	}
	for _, fieldID := range fieldIDs {
-		if segment.GetTextStatsLogs() == nil {
-			return true
-		}
		if segment.GetTextStatsLogs()[fieldID] == nil {
			return true
		}
	}
	return false
}

+func needDoJsonKeyIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool {
+	if !(isFlush(segment) && segment.GetLevel() != datapb.SegmentLevel_L0 &&
+		segment.GetIsSorted()) {
+		return false
+	}
+	for _, fieldID := range fieldIDs {
+		if segment.GetJsonKeyStats()[fieldID] == nil {
+			return true
+		}
+	}
+	return false
+}
@ -182,6 +197,38 @@ func (jm *statsJobManager) triggerTextStatsTask() {
	}
}

+func (jm *statsJobManager) triggerJsonKeyIndexStatsTask(lastJSONStatsLastTrigger int64, maxJSONStatsTaskCount int) (int64, int) {
+	collections := jm.mt.GetCollections()
+	for _, collection := range collections {
+		needTriggerFieldIDs := make([]UniqueID, 0)
+		for _, field := range collection.Schema.GetFields() {
+			h := typeutil.CreateFieldSchemaHelper(field)
+			if h.EnableJSONKeyStatsIndex() && Params.CommonCfg.EnabledJSONKeyStats.GetAsBool() {
+				needTriggerFieldIDs = append(needTriggerFieldIDs, field.GetFieldID())
+			}
+		}
+		segments := jm.mt.SelectSegments(jm.ctx, WithCollection(collection.ID), SegmentFilterFunc(func(seg *SegmentInfo) bool {
+			return needDoJsonKeyIndex(seg, needTriggerFieldIDs)
+		}))
+		if time.Now().Unix()-lastJSONStatsLastTrigger > int64(Params.DataCoordCfg.JSONStatsTriggerInterval.GetAsDuration(time.Minute).Seconds()) {
+			lastJSONStatsLastTrigger = time.Now().Unix()
+			maxJSONStatsTaskCount = 0
+		}
+		for _, segment := range segments {
+			if maxJSONStatsTaskCount >= Params.DataCoordCfg.JSONStatsTriggerCount.GetAsInt() {
+				break
+			}
+			if err := jm.SubmitStatsTask(segment.GetID(), segment.GetID(), indexpb.StatsSubJob_JsonKeyIndexJob, true); err != nil {
+				log.Warn("create stats task with json key index for segment failed, wait for retry:",
+					zap.Int64("segmentID", segment.GetID()), zap.Error(err))
+				continue
+			}
+			maxJSONStatsTaskCount++
+		}
+	}
+	return lastJSONStatsLastTrigger, maxJSONStatsTaskCount
+}
+
func (jm *statsJobManager) triggerBM25StatsTask() {
	collections := jm.mt.GetCollections()
	for _, collection := range collections {
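Note: triggerJsonKeyIndexStatsTask throttles submissions with two knobs — a per-window task cap (JSONStatsTriggerCount) and a window length (JSONStatsTriggerInterval) — resetting the counter once the window elapses. A condensed standalone Go sketch of that throttle; the parameter values below are examples, not this commit's defaults:

    package main

    import (
        "fmt"
        "time"
    )

    // jsonStatsThrottle reproduces the windowed cap used by
    // triggerJsonKeyIndexStatsTask: reset the counter when the interval
    // has elapsed, then allow submissions until the cap is exhausted.
    type jsonStatsThrottle struct {
        lastTrigger int64         // unix seconds of the current window start
        taskCount   int           // tasks submitted in the current window
        maxTasks    int           // cap per window
        interval    time.Duration // window length
    }

    func (t *jsonStatsThrottle) allow(now time.Time) bool {
        if now.Unix()-t.lastTrigger > int64(t.interval.Seconds()) {
            t.lastTrigger = now.Unix()
            t.taskCount = 0
        }
        if t.taskCount >= t.maxTasks {
            return false
        }
        t.taskCount++
        return true
    }

    func main() {
        th := &jsonStatsThrottle{maxTasks: 2, interval: 10 * time.Minute}
        for i := 0; i < 3; i++ {
            fmt.Println(th.allow(time.Now())) // true, true, false
        }
    }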

View File

@ -2158,6 +2158,7 @@ func (m *meta) SaveStatsResultSegment(oldSegmentID int64, result *workerpb.Stats
		Statslogs:      result.GetStatsLogs(),
		TextStatsLogs:  result.GetTextStatsLogs(),
		Bm25Statslogs:  result.GetBm25Logs(),
+		JsonKeyStats:   result.GetJsonKeyStatsLogs(),
		Deltalogs:      nil,
		CompactionFrom: []int64{oldSegmentID},
		IsSorted:       true,

View File

@ -43,6 +43,18 @@ func SetTextIndexLogs(textIndexLogs map[int64]*datapb.TextIndexStats) SegmentOpe
	}
}

+func SetJsonKeyIndexLogs(jsonKeyIndexLogs map[int64]*datapb.JsonKeyStats) SegmentOperator {
+	return func(segment *SegmentInfo) bool {
+		if segment.JsonKeyStats == nil {
+			segment.JsonKeyStats = make(map[int64]*datapb.JsonKeyStats)
+		}
+		for field, logs := range jsonKeyIndexLogs {
+			segment.JsonKeyStats[field] = logs
+		}
+		return true
+	}
+}
+
type segmentCriterion struct {
	collectionID int64
	channel      string
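Note: SetJsonKeyIndexLogs follows the functional SegmentOperator pattern used throughout this file: a closure that mutates a segment in place and reports whether anything changed. A reduced standalone Go sketch of the pattern with simplified, made-up types:

    package main

    import "fmt"

    // segment is a stand-in for SegmentInfo with only the field we need.
    type segment struct {
        jsonKeyStats map[int64]string
    }

    // segmentOperator mirrors the SegmentOperator signature: mutate the
    // segment and return true if it was modified.
    type segmentOperator func(*segment) bool

    // setJSONKeyLogs lazily allocates the map and merges the new entries,
    // the same shape as SetJsonKeyIndexLogs above.
    func setJSONKeyLogs(logs map[int64]string) segmentOperator {
        return func(s *segment) bool {
            if s.jsonKeyStats == nil {
                s.jsonKeyStats = make(map[int64]string)
            }
            for field, l := range logs {
                s.jsonKeyStats[field] = l
            }
            return true
        }
    }

    func main() {
        s := &segment{}
        setJSONKeyLogs(map[int64]string{101: "stats-v1"})(s)
        fmt.Println(s.jsonKeyStats[101]) // stats-v1
    }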

View File

@ -2,7 +2,6 @@ package datacoord

import (
	"context"
-	"github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
	"testing"
	"time"

@ -33,6 +32,7 @@ import (
	"github.com/milvus-io/milvus/pkg/v2/log"
	"github.com/milvus-io/milvus/pkg/v2/mq/msgstream"
	"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
+	"github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
	"github.com/milvus-io/milvus/pkg/v2/proto/internalpb"
	"github.com/milvus-io/milvus/pkg/v2/proto/workerpb"
	"github.com/milvus-io/milvus/pkg/v2/util/merr"

@ -244,10 +244,14 @@ func (st *statsTask) PreCheck(ctx context.Context, dependency *taskScheduler) bo
CollectionTtl: collTtl.Nanoseconds(),
CurrentTs: tsoutil.GetCurrentTime(),
// update version after check
TaskVersion: statsMeta.GetVersion() + 1,
BinlogMaxSize: Params.DataNodeCfg.BinLogMaxSize.GetAsUint64(),
StorageVersion: segment.StorageVersion,
TaskSlot: st.taskSlot,
EnableJsonKeyStats: Params.CommonCfg.EnabledJSONKeyStats.GetAsBool(),
JsonKeyStatsTantivyMemory: Params.DataCoordCfg.JSONKeyStatsMemoryBudgetInTantivy.GetAsInt64(),
JsonKeyStatsDataFormat: 1,
EnableJsonKeyStatsInSort: Params.DataCoordCfg.EnabledJSONKeyStatsInSort.GetAsBool(),
}
log.Info("stats task pre check successfully", zap.String("subJobType", st.subJobType.String()),
@ -373,6 +377,13 @@ func (st *statsTask) SetJobInfo(meta *meta) error {
zap.Int64("segmentID", st.segmentID), zap.Error(err)) zap.Int64("segmentID", st.segmentID), zap.Error(err))
return err return err
} }
case indexpb.StatsSubJob_JsonKeyIndexJob:
err := meta.UpdateSegment(st.taskInfo.GetSegmentID(), SetJsonKeyIndexLogs(st.taskInfo.GetJsonKeyStatsLogs()))
if err != nil {
log.Warn("save json key index stats result failed", zap.Int64("taskId", st.taskID),
zap.Int64("segmentID", st.segmentID), zap.Error(err))
return err
}
case indexpb.StatsSubJob_BM25Job:
// TODO: support bm25 job
}

@ -22,10 +22,9 @@ import (
"testing" "testing"
"time" "time"
"go.uber.org/atomic"
"github.com/stretchr/testify/mock" "github.com/stretchr/testify/mock"
"github.com/stretchr/testify/suite" "github.com/stretchr/testify/suite"
"go.uber.org/atomic"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"

@ -312,18 +312,19 @@ func (m *TaskManager) WaitTaskFinish() {
}
type StatsTaskInfo struct {
Cancel context.CancelFunc
State indexpb.JobState
FailReason string
CollID typeutil.UniqueID
PartID typeutil.UniqueID
SegID typeutil.UniqueID
InsertChannel string
NumRows int64
InsertLogs []*datapb.FieldBinlog
StatsLogs []*datapb.FieldBinlog
TextStatsLogs map[int64]*datapb.TextIndexStats
Bm25Logs []*datapb.FieldBinlog
JSONKeyStatsLogs map[int64]*datapb.JsonKeyStats
}
func (m *TaskManager) LoadOrStoreStatsTask(clusterID string, taskID typeutil.UniqueID, info *StatsTaskInfo) *StatsTaskInfo {
@ -410,24 +411,46 @@ func (m *TaskManager) StoreStatsTextIndexResult(
}
}
func (m *TaskManager) StoreJSONKeyStatsResult(
clusterID string,
taskID typeutil.UniqueID,
collID typeutil.UniqueID,
partID typeutil.UniqueID,
segID typeutil.UniqueID,
channel string,
jsonKeyIndexLogs map[int64]*datapb.JsonKeyStats,
) {
key := Key{ClusterID: clusterID, TaskID: taskID}
m.stateLock.Lock()
defer m.stateLock.Unlock()
if info, ok := m.statsTasks[key]; ok {
info.JSONKeyStatsLogs = jsonKeyIndexLogs
info.SegID = segID
info.CollID = collID
info.PartID = partID
info.InsertChannel = channel
}
}
func (m *TaskManager) GetStatsTaskInfo(clusterID string, taskID typeutil.UniqueID) *StatsTaskInfo {
m.stateLock.Lock()
defer m.stateLock.Unlock()
if info, ok := m.statsTasks[Key{ClusterID: clusterID, TaskID: taskID}]; ok {
return &StatsTaskInfo{
Cancel: info.Cancel,
State: info.State,
FailReason: info.FailReason,
CollID: info.CollID,
PartID: info.PartID,
SegID: info.SegID,
InsertChannel: info.InsertChannel,
NumRows: info.NumRows,
InsertLogs: info.InsertLogs,
StatsLogs: info.StatsLogs,
TextStatsLogs: info.TextStatsLogs,
Bm25Logs: info.Bm25Logs,
JSONKeyStatsLogs: info.JSONKeyStatsLogs,
}
}
return nil return nil

@ -93,6 +93,20 @@ func (s *statsTaskInfoSuite) Test_Methods() {
})
})
s.Run("storeStatsJsonIndexResult", func() {
s.manager.StoreJSONKeyStatsResult(s.cluster, s.taskID, 1, 2, 3, "ch1",
map[int64]*datapb.JsonKeyStats{
100: {
FieldID: 100,
Version: 1,
Files: []string{"file1"},
LogSize: 1024,
MemorySize: 1024,
JsonKeyStatsDataFormat: 1,
},
})
})
s.Run("getStatsTaskInfo", func() { s.Run("getStatsTaskInfo", func() {
taskInfo := s.manager.GetStatsTaskInfo(s.cluster, s.taskID) taskInfo := s.manager.GetStatsTaskInfo(s.cluster, s.taskID)

@ -38,12 +38,14 @@ import (
"github.com/milvus-io/milvus/internal/util/indexcgowrapper" "github.com/milvus-io/milvus/internal/util/indexcgowrapper"
"github.com/milvus-io/milvus/pkg/v2/common" "github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/log" "github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/metrics"
"github.com/milvus-io/milvus/pkg/v2/proto/datapb" "github.com/milvus-io/milvus/pkg/v2/proto/datapb"
"github.com/milvus-io/milvus/pkg/v2/proto/indexcgopb" "github.com/milvus-io/milvus/pkg/v2/proto/indexcgopb"
"github.com/milvus-io/milvus/pkg/v2/proto/indexpb" "github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
"github.com/milvus-io/milvus/pkg/v2/proto/workerpb" "github.com/milvus-io/milvus/pkg/v2/proto/workerpb"
_ "github.com/milvus-io/milvus/pkg/v2/util/funcutil" _ "github.com/milvus-io/milvus/pkg/v2/util/funcutil"
"github.com/milvus-io/milvus/pkg/v2/util/metautil" "github.com/milvus-io/milvus/pkg/v2/util/metautil"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/timerecord" "github.com/milvus-io/milvus/pkg/v2/util/timerecord"
"github.com/milvus-io/milvus/pkg/v2/util/tsoutil" "github.com/milvus-io/milvus/pkg/v2/util/tsoutil"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil" "github.com/milvus-io/milvus/pkg/v2/util/typeutil"
@ -311,6 +313,26 @@ func (st *statsTask) Execute(ctx context.Context) error {
return err
}
}
if (st.req.EnableJsonKeyStatsInSort && st.req.GetSubJobType() == indexpb.StatsSubJob_Sort) || st.req.GetSubJobType() == indexpb.StatsSubJob_JsonKeyIndexJob {
if !st.req.GetEnableJsonKeyStats() {
return nil
}
err = st.createJSONKeyStats(ctx,
st.req.GetStorageConfig(),
st.req.GetCollectionID(),
st.req.GetPartitionID(),
st.req.GetTargetSegmentID(),
st.req.GetTaskVersion(),
st.req.GetTaskID(),
st.req.GetJsonKeyStatsTantivyMemory(),
st.req.GetJsonKeyStatsDataFormat(),
insertLogs)
if err != nil {
log.Warn("stats wrong, failed to create json index", zap.Error(err))
return err
}
}
return nil
}
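The gating at the top of the JSON-stats branch above combines three switches; restated as a standalone predicate (a sketch only, the helper name shouldBuildJSONKeyStats is invented here):

	// Mirrors the gating in Execute: build as part of the sort job only when
	// EnableJsonKeyStatsInSort is set, always for a dedicated JsonKeyIndexJob,
	// and never when the collection-level EnableJsonKeyStats switch is off.
	func shouldBuildJSONKeyStats(req *workerpb.CreateStatsRequest) bool {
		inSort := req.GetEnableJsonKeyStatsInSort() && req.GetSubJobType() == indexpb.StatsSubJob_Sort
		dedicated := req.GetSubJobType() == indexpb.StatsSubJob_JsonKeyIndexJob
		return (inSort || dedicated) && req.GetEnableJsonKeyStats()
	}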
@ -466,3 +488,108 @@ func (st *statsTask) createTextIndex(ctx context.Context,
textIndexLogs)
return nil
}
func (st *statsTask) createJSONKeyStats(ctx context.Context,
storageConfig *indexpb.StorageConfig,
collectionID int64,
partitionID int64,
segmentID int64,
version int64,
taskID int64,
tantivyMemory int64,
jsonKeyStatsDataFormat int64,
insertBinlogs []*datapb.FieldBinlog,
) error {
log := log.Ctx(ctx).With(
zap.String("clusterID", st.req.GetClusterID()),
zap.Int64("taskID", st.req.GetTaskID()),
zap.Int64("collectionID", st.req.GetCollectionID()),
zap.Int64("partitionID", st.req.GetPartitionID()),
zap.Int64("segmentID", st.req.GetSegmentID()),
zap.Any("statsJobType", st.req.GetSubJobType()),
zap.Int64("jsonKeyStatsDataFormat", jsonKeyStatsDataFormat),
)
if jsonKeyStatsDataFormat != 1 {
log.Info("create json key index failed dataformat invalid")
return nil
}
fieldBinlogs := lo.GroupBy(insertBinlogs, func(binlog *datapb.FieldBinlog) int64 {
return binlog.GetFieldID()
})
getInsertFiles := func(fieldID int64) ([]string, error) {
binlogs, ok := fieldBinlogs[fieldID]
if !ok {
return nil, fmt.Errorf("field binlog not found for field %d", fieldID)
}
result := make([]string, 0, len(binlogs))
for _, binlog := range binlogs {
for _, file := range binlog.GetBinlogs() {
result = append(result, metautil.BuildInsertLogPath(storageConfig.GetRootPath(), collectionID, partitionID, segmentID, fieldID, file.GetLogID()))
}
}
return result, nil
}
newStorageConfig, err := ParseStorageConfig(storageConfig)
if err != nil {
return err
}
jsonKeyIndexStats := make(map[int64]*datapb.JsonKeyStats)
for _, field := range st.req.GetSchema().GetFields() {
h := typeutil.CreateFieldSchemaHelper(field)
if !h.EnableJSONKeyStatsIndex() {
continue
}
log.Info("field enable json key index, ready to create json key index", zap.Int64("field id", field.GetFieldID()))
files, err := getInsertFiles(field.GetFieldID())
if err != nil {
return err
}
buildIndexParams := &indexcgopb.BuildIndexInfo{
BuildID: taskID,
CollectionID: collectionID,
PartitionID: partitionID,
SegmentID: segmentID,
IndexVersion: version,
InsertFiles: files,
FieldSchema: field,
StorageConfig: newStorageConfig,
JsonKeyStatsTantivyMemory: tantivyMemory,
}
uploaded, err := indexcgowrapper.CreateJSONKeyStats(ctx, buildIndexParams)
if err != nil {
return err
}
jsonKeyIndexStats[field.GetFieldID()] = &datapb.JsonKeyStats{
FieldID: field.GetFieldID(),
Version: version,
BuildID: taskID,
Files: lo.Keys(uploaded),
JsonKeyStatsDataFormat: jsonKeyStatsDataFormat,
}
log.Info("field enable json key index, create json key index done",
zap.Int64("field id", field.GetFieldID()),
zap.Strings("files", lo.Keys(uploaded)),
)
}
totalElapse := st.tr.RecordSpan()
st.manager.StoreJSONKeyStatsResult(st.req.GetClusterID(),
st.req.GetTaskID(),
st.req.GetCollectionID(),
st.req.GetPartitionID(),
st.req.GetTargetSegmentID(),
st.req.GetInsertChannel(),
jsonKeyIndexStats)
metrics.DataNodeBuildJSONStatsLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(totalElapse.Seconds())
log.Info("create json key index done",
zap.Int64("target segmentID", st.req.GetTargetSegmentID()),
zap.Duration("total elapse", totalElapse))
return nil
}
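The file-resolution helper above is a group-by over field IDs followed by a flatten into object-store paths; a self-contained sketch of the same shape (the IDs and root path are placeholders):

	binlogs := []*datapb.FieldBinlog{
		{FieldID: 101, Binlogs: []*datapb.Binlog{{LogID: 1}, {LogID: 2}}},
	}
	byField := lo.GroupBy(binlogs, func(b *datapb.FieldBinlog) int64 { return b.GetFieldID() })
	files := make([]string, 0)
	for _, b := range byField[101] {
		for _, f := range b.GetBinlogs() {
			// BuildInsertLogPath derives the insert-log object key from the
			// collection/partition/segment/field/log IDs
			files = append(files, metautil.BuildInsertLogPath("root-path", 1, 2, 3, 101, f.GetLogID()))
		}
	}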

@ -29,12 +29,12 @@ import "C"
import (
"github.com/cockroachdb/errors"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/v2/common" "github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/util/hardware" "github.com/milvus-io/milvus/pkg/v2/util/hardware"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
)
func getCurrentIndexVersion(v int32) int32 {

@ -462,18 +462,19 @@ func (node *DataNode) QueryJobsV2(ctx context.Context, req *workerpb.QueryJobsV2
info := node.taskManager.GetStatsTaskInfo(req.GetClusterID(), taskID)
if info != nil {
results = append(results, &workerpb.StatsResult{
TaskID: taskID,
State: info.State,
FailReason: info.FailReason,
CollectionID: info.CollID,
PartitionID: info.PartID,
SegmentID: info.SegID,
Channel: info.InsertChannel,
InsertLogs: info.InsertLogs,
StatsLogs: info.StatsLogs,
TextStatsLogs: info.TextStatsLogs,
Bm25Logs: info.Bm25Logs,
NumRows: info.NumRows,
JsonKeyStatsLogs: info.JSONKeyStatsLogs,
})
}
}

@ -540,22 +540,23 @@ func (s *IndexServiceSuite) Test_CreateStatsTask() {
s.Run("normal case", func() { s.Run("normal case", func() {
taskID := int64(100) taskID := int64(100)
req := &workerpb.CreateStatsRequest{ req := &workerpb.CreateStatsRequest{
ClusterID: "cluster2", ClusterID: "cluster2",
TaskID: taskID, TaskID: taskID,
CollectionID: s.collID, CollectionID: s.collID,
PartitionID: s.partID, PartitionID: s.partID,
InsertChannel: "ch1", InsertChannel: "ch1",
SegmentID: s.segID, SegmentID: s.segID,
InsertLogs: fieldBinlogs, InsertLogs: fieldBinlogs,
DeltaLogs: nil, DeltaLogs: nil,
StorageConfig: s.storageConfig, StorageConfig: s.storageConfig,
Schema: generateTestSchema(), Schema: generateTestSchema(),
TargetSegmentID: s.segID + 1, TargetSegmentID: s.segID + 1,
StartLogID: s.logID + 100, StartLogID: s.logID + 100,
EndLogID: s.logID + 200, EndLogID: s.logID + 200,
NumRows: s.numRows, NumRows: s.numRows,
BinlogMaxSize: 131000, BinlogMaxSize: 131000,
SubJobType: indexpb.StatsSubJob_Sort, SubJobType: indexpb.StatsSubJob_Sort,
EnableJsonKeyStats: false,
}
status, err := s.in.CreateJobV2(ctx, &workerpb.CreateJobV2Request{

@ -1240,7 +1240,7 @@ func GenSimpleRetrievePlan(collection *segcore.CCollection) (*segcore.RetrievePl
return nil, err
}
plan, err2 := segcore.NewRetrievePlan(collection, planBytes, timestamp, 100, 0)
return plan, err2
}

@ -3825,7 +3825,8 @@ func (node *Proxy) Query(ctx context.Context, request *milvuspb.QueryRequest) (*
commonpbutil.WithMsgType(commonpb.MsgType_Retrieve),
commonpbutil.WithSourceID(paramtable.GetNodeID()),
),
ReqID: paramtable.GetNodeID(),
ConsistencyLevel: request.ConsistencyLevel,
},
request: request,
qc: node.queryCoord,

@ -599,7 +599,7 @@ func (t *queryTask) queryShard(ctx context.Context, nodeID int64, qn types.Query
retrieveReq.MvccTimestamp = mvccTs
retrieveReq.GuaranteeTimestamp = mvccTs
}
retrieveReq.ConsistencyLevel = t.ConsistencyLevel
req := &querypb.QueryRequest{
Req: retrieveReq,
DmlChannels: []string{channel},

@ -969,8 +969,9 @@ func (t *searchTask) Requery(span trace.Span) error {
commonpbutil.WithMsgType(commonpb.MsgType_Retrieve),
commonpbutil.WithSourceID(paramtable.GetNodeID()),
),
ReqID: paramtable.GetNodeID(),
PartitionIDs: t.GetPartitionIDs(), // use search partitionIDs
ConsistencyLevel: t.ConsistencyLevel,
},
request: queryReq,
plan: plan,

@ -23,6 +23,7 @@ import (
"github.com/samber/lo" "github.com/samber/lo"
"go.uber.org/zap" "go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/meta"
"github.com/milvus-io/milvus/internal/querycoordv2/params" "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session" "github.com/milvus-io/milvus/internal/querycoordv2/session"
@ -31,6 +32,7 @@ import (
"github.com/milvus-io/milvus/pkg/v2/log" "github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/proto/indexpb" "github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
"github.com/milvus-io/milvus/pkg/v2/proto/querypb" "github.com/milvus-io/milvus/pkg/v2/proto/querypb"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil" "github.com/milvus-io/milvus/pkg/v2/util/typeutil"
) )
@ -89,20 +91,28 @@ func (c *IndexChecker) Check(ctx context.Context) []task.Task {
}
collection := c.meta.CollectionManager.GetCollection(ctx, collectionID)
schema := c.meta.CollectionManager.GetCollectionSchema(ctx, collectionID)
if collection == nil {
log.Warn("collection released during check index", zap.Int64("collection", collectionID))
continue
}
if schema == nil && paramtable.Get().CommonCfg.EnabledJSONKeyStats.GetAsBool() {
collectionSchema, err1 := c.broker.DescribeCollection(ctx, collectionID)
if err1 == nil {
schema = collectionSchema.GetSchema()
c.meta.PutCollectionSchema(ctx, collectionID, collectionSchema.GetSchema())
}
}
replicas := c.meta.ReplicaManager.GetByCollection(ctx, collectionID)
for _, replica := range replicas {
tasks = append(tasks, c.checkReplica(ctx, collection, replica, indexInfos, schema)...)
}
}
return tasks
}
func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collection, replica *meta.Replica, indexInfos []*indexpb.IndexInfo, schema *schemapb.CollectionSchema) []task.Task {
log := log.Ctx(ctx).With(
zap.Int64("collectionID", collection.GetCollectionID()),
)
@ -113,6 +123,9 @@ func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collec
roNodeSet := typeutil.NewUniqueSet(replica.GetRONodes()...)
targets := make(map[int64][]int64) // segmentID => FieldID
idSegmentsStats := make(map[int64]*meta.Segment)
targetsStats := make(map[int64][]int64) // segmentID => FieldID
for _, segment := range segments {
// skip update index in read only node
if roNodeSet.Contain(segment.Node) {
@ -120,9 +133,13 @@ func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collec
}
missing := c.checkSegment(segment, indexInfos)
missingStats := c.checkSegmentStats(segment, schema, collection.LoadFields)
if len(missing) > 0 {
targets[segment.GetID()] = missing
idSegments[segment.GetID()] = segment
} else if len(missingStats) > 0 {
targetsStats[segment.GetID()] = missingStats
idSegmentsStats[segment.GetID()] = segment
}
}
@ -150,6 +167,29 @@ func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collec
return c.createSegmentUpdateTask(ctx, idSegments[segmentID], replica)
})
segmentsStatsToUpdate := typeutil.NewSet[int64]()
for _, segmentIDs := range lo.Chunk(lo.Keys(idSegmentsStats), MaxSegmentNumPerGetIndexInfoRPC) {
segmentInfos, err := c.broker.GetSegmentInfo(ctx, segmentIDs...)
if err != nil {
log.Warn("failed to get SegmentInfo for segments", zap.Int64s("segmentIDs", segmentIDs), zap.Error(err))
continue
}
for _, segmentInfo := range segmentInfos {
fields := targetsStats[segmentInfo.ID]
missingFields := typeutil.NewSet(fields...)
for field := range segmentInfo.GetJsonKeyStats() {
if missingFields.Contain(field) {
segmentsStatsToUpdate.Insert(segmentInfo.ID)
}
}
}
}
tasksStats := lo.FilterMap(segmentsStatsToUpdate.Collect(), func(segmentID int64, _ int) (task.Task, bool) {
return c.createSegmentStatsUpdateTask(ctx, idSegmentsStats[segmentID], replica)
})
tasks = append(tasks, tasksStats...)
return tasks
}
@ -193,3 +233,58 @@ func (c *IndexChecker) createSegmentUpdateTask(ctx context.Context, segment *met
t.SetReason("missing index") t.SetReason("missing index")
return t, true return t, true
} }
func (c *IndexChecker) checkSegmentStats(segment *meta.Segment, schema *schemapb.CollectionSchema, loadField []int64) (missFieldIDs []int64) {
var result []int64
if paramtable.Get().CommonCfg.EnabledJSONKeyStats.GetAsBool() {
if schema == nil {
log.Warn("schema released during check index", zap.Int64("collection", segment.GetCollectionID()))
return result
}
loadFieldMap := make(map[int64]struct{})
for _, v := range loadField {
loadFieldMap[v] = struct{}{}
}
jsonStatsFieldMap := make(map[int64]struct{})
for _, v := range segment.JSONIndexField {
jsonStatsFieldMap[v] = struct{}{}
}
for _, field := range schema.GetFields() {
// Check if the field exists in both loadFieldMap and jsonStatsFieldMap
h := typeutil.CreateFieldSchemaHelper(field)
if h.EnableJSONKeyStatsIndex() {
if _, ok := loadFieldMap[field.FieldID]; ok {
if _, ok := jsonStatsFieldMap[field.FieldID]; !ok {
result = append(result, field.FieldID)
}
}
}
}
}
return result
}
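The check above boils down to an intersection-minus-membership test; a compact sketch with illustrative field IDs:

	// A field needs a JSON key stats load iff it has the index enabled in the
	// schema, is among the collection's load fields, and is not yet present in
	// the segment's loaded JSON index fields.
	loadFields := typeutil.NewSet[int64](101, 102) // collection.LoadFields
	loaded := typeutil.NewSet[int64](101)          // segment.JSONIndexField
	for _, fieldID := range []int64{101, 102} {    // schema fields with EnableJSONKeyStatsIndex
		if loadFields.Contain(fieldID) && !loaded.Contain(fieldID) {
			// fieldID 102 lands here and triggers a StatsUpdate task
		}
	}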
func (c *IndexChecker) createSegmentStatsUpdateTask(ctx context.Context, segment *meta.Segment, replica *meta.Replica) (task.Task, bool) {
action := task.NewSegmentActionWithScope(segment.Node, task.ActionTypeStatsUpdate, segment.GetInsertChannel(), segment.GetID(), querypb.DataScope_Historical, int(segment.GetNumOfRows()))
t, err := task.NewSegmentTask(
ctx,
params.Params.QueryCoordCfg.SegmentTaskTimeout.GetAsDuration(time.Millisecond),
c.ID(),
segment.GetCollectionID(),
replica,
action,
)
if err != nil {
log.Warn("create segment stats update task failed",
zap.Int64("collection", segment.GetCollectionID()),
zap.String("channel", segment.GetInsertChannel()),
zap.Int64("node", segment.Node),
zap.Error(err),
)
return nil, false
}
t.SetPriority(task.TaskPriorityLow)
t.SetReason("missing json stats")
return t, true
}

@ -24,6 +24,7 @@ import (
"github.com/stretchr/testify/mock" "github.com/stretchr/testify/mock"
"github.com/stretchr/testify/suite" "github.com/stretchr/testify/suite"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd" etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/metastore/kv/querycoord" "github.com/milvus-io/milvus/internal/metastore/kv/querycoord"
"github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/meta"
@ -97,6 +98,12 @@ func (suite *IndexCheckerSuite) TestLoadIndex() {
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
@ -133,6 +140,8 @@ func (suite *IndexCheckerSuite) TestLoadIndex() {
},
}, nil)
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Require().Len(tasks, 1)
@ -162,6 +171,12 @@ func (suite *IndexCheckerSuite) TestIndexInfoNotMatch() {
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
@ -211,7 +226,8 @@ func (suite *IndexCheckerSuite) TestIndexInfoNotMatch() {
IndexID: 1000,
},
}, nil)
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Require().Len(tasks, 0)
}
@ -223,6 +239,12 @@ func (suite *IndexCheckerSuite) TestGetIndexInfoFailed() {
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
@ -251,7 +273,8 @@ func (suite *IndexCheckerSuite) TestGetIndexInfoFailed() {
IndexID: 1000,
},
}, nil)
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Require().Len(tasks, 0)
}
@ -263,6 +286,12 @@ func (suite *IndexCheckerSuite) TestCreateNewIndex() {
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
@ -317,13 +346,234 @@ func (suite *IndexCheckerSuite) TestCreateNewIndex() {
IndexFilePaths: []string{"index"}, IndexFilePaths: []string{"index"},
}, },
}}, nil) }}, nil)
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Len(tasks, 1)
suite.Len(tasks[0].Actions(), 1)
suite.Equal(tasks[0].Actions()[0].(*task.SegmentAction).Type(), task.ActionTypeUpdate)
}
func (suite *IndexCheckerSuite) TestLoadJsonIndex() {
checker := suite.checker
ctx := context.Background()
paramtable.Get().Save(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key, "true")
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key)
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
coll.LoadFields = []int64{101}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 1,
Address: "localhost",
Hostname: "localhost",
}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 2,
Address: "localhost",
Hostname: "localhost",
}))
checker.meta.ResourceManager.HandleNodeUp(ctx, 1)
checker.meta.ResourceManager.HandleNodeUp(ctx, 2)
// dist
fieldIndexInfo := &querypb.FieldIndexInfo{
FieldID: 101,
IndexID: 1000,
EnableIndex: true,
}
indexInfo := make(map[int64]*querypb.FieldIndexInfo)
indexInfo[fieldIndexInfo.IndexID] = fieldIndexInfo
segment := utils.CreateTestSegment(1, 1, 2, 1, 1, "test-insert-channel")
segment.IndexInfo = indexInfo
checker.dist.SegmentDistManager.Update(1, segment)
// broker
suite.broker.EXPECT().ListIndexes(mock.Anything, mock.Anything).Call.Return(
func(ctx context.Context, collectionID int64) ([]*indexpb.IndexInfo, error) {
return []*indexpb.IndexInfo{
{
FieldID: 101,
IndexID: 1000,
},
}, nil
},
)
mockJSONKeyStats := map[int64]*datapb.JsonKeyStats{
101: {
FieldID: 101,
},
}
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{
{
ID: 2,
JsonKeyStats: mockJSONKeyStats,
},
}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Require().Len(tasks, 1)
t := tasks[0]
suite.Require().Len(t.Actions(), 1)
action, ok := t.Actions()[0].(*task.SegmentAction)
suite.Require().True(ok)
suite.EqualValues(200, t.ReplicaID())
suite.Equal(task.ActionTypeStatsUpdate, action.Type())
suite.EqualValues(2, action.GetSegmentID())
// test skip load json index for read only node
suite.nodeMgr.Stopping(1)
suite.nodeMgr.Stopping(2)
suite.meta.ResourceManager.HandleNodeStopping(ctx, 1)
suite.meta.ResourceManager.HandleNodeStopping(ctx, 2)
utils.RecoverAllCollection(suite.meta)
tasks = checker.Check(context.Background())
suite.Require().Len(tasks, 0)
}
func (suite *IndexCheckerSuite) TestJsonIndexNotMatch() {
checker := suite.checker
ctx := context.Background()
paramtable.Get().Save(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key, "true")
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key)
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 1,
Address: "localhost",
Hostname: "localhost",
}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 2,
Address: "localhost",
Hostname: "localhost",
}))
checker.meta.ResourceManager.HandleNodeUp(ctx, 1)
checker.meta.ResourceManager.HandleNodeUp(ctx, 2)
// dist
checker.dist.SegmentDistManager.Update(1, utils.CreateTestSegment(1, 1, 2, 1, 1, "test-insert-channel"))
// broker
suite.broker.EXPECT().ListIndexes(mock.Anything, mock.Anything).Call.Return(
func(ctx context.Context, collectionID int64) ([]*indexpb.IndexInfo, error) {
return []*indexpb.IndexInfo{
{
FieldID: 101,
IndexID: 1000,
},
}, nil
},
)
suite.broker.EXPECT().GetIndexInfo(mock.Anything, mock.Anything, mock.AnythingOfType("int64")).
Return(map[int64][]*querypb.FieldIndexInfo{2: {
{
FieldID: 101,
IndexID: 1000,
EnableIndex: false,
IndexFilePaths: []string{"index"},
},
}}, nil)
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{
{},
}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Require().Len(tasks, 0)
}
func (suite *IndexCheckerSuite) TestCreateNewJsonIndex() {
checker := suite.checker
ctx := context.Background()
paramtable.Get().Save(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key, "true")
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key)
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.LoadFields = []int64{101}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 1,
Address: "localhost",
Hostname: "localhost",
}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 2,
Address: "localhost",
Hostname: "localhost",
}))
checker.meta.ResourceManager.HandleNodeUp(ctx, 1)
checker.meta.ResourceManager.HandleNodeUp(ctx, 2)
// dist
fieldIndexInfo := &querypb.FieldIndexInfo{
FieldID: 101,
IndexID: 1000,
EnableIndex: true,
}
indexInfo := make(map[int64]*querypb.FieldIndexInfo)
indexInfo[fieldIndexInfo.IndexID] = fieldIndexInfo
segment := utils.CreateTestSegment(1, 1, 2, 1, 1, "test-insert-channel")
segment.IndexInfo = indexInfo
checker.dist.SegmentDistManager.Update(1, segment)
// broker
suite.broker.EXPECT().ListIndexes(mock.Anything, mock.Anything).Call.Return(
func(ctx context.Context, collectionID int64) ([]*indexpb.IndexInfo, error) {
return []*indexpb.IndexInfo{
{
FieldID: 101,
IndexID: 1000,
},
}, nil
},
)
mockJSONKeyStats := map[int64]*datapb.JsonKeyStats{
101: {
FieldID: 101,
},
}
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{
{
ID: 2,
JsonKeyStats: mockJSONKeyStats,
},
}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Len(tasks, 1)
suite.Len(tasks[0].Actions(), 1)
suite.Equal(tasks[0].Actions()[0].(*task.SegmentAction).Type(), task.ActionTypeStatsUpdate)
}
func TestIndexChecker(t *testing.T) {
suite.Run(t, new(IndexCheckerSuite))
}

@ -173,6 +173,7 @@ func (dh *distHandler) updateSegmentsDistribution(ctx context.Context, resp *que
Version: s.GetVersion(),
LastDeltaTimestamp: s.GetLastDeltaTimestamp(),
IndexInfo: s.GetIndexInfo(),
JSONIndexField: s.GetFieldJsonIndexStats(),
})
}

@ -222,6 +222,7 @@ func (job *LoadCollectionJob) Execute() error {
},
CreatedAt: time.Now(),
LoadSpan: sp,
Schema: job.collInfo.GetSchema(),
}
job.undo.IsNewCollection = true
err = job.meta.CollectionManager.PutCollection(job.ctx, collection, partitions...)
@ -426,6 +427,7 @@ func (job *LoadPartitionJob) Execute() error {
},
CreatedAt: time.Now(),
LoadSpan: sp,
Schema: job.collInfo.GetSchema(),
}
err = job.meta.CollectionManager.PutCollection(job.ctx, collection, partitions...)
if err != nil {

@ -50,6 +50,7 @@ type Collection struct {
mut sync.RWMutex
refreshNotifier chan struct{}
LoadSpan trace.Span
Schema *schemapb.CollectionSchema
}
func (collection *Collection) SetRefreshNotifier(notifier chan struct{}) {
@ -85,6 +86,7 @@ func (collection *Collection) Clone() *Collection {
UpdatedAt: collection.UpdatedAt,
refreshNotifier: collection.refreshNotifier,
LoadSpan: collection.LoadSpan,
Schema: collection.Schema,
}
}
@ -238,6 +240,7 @@ func (m *CollectionManager) upgradeLoadFields(ctx context.Context, collection *q
err = m.putCollection(ctx, true, &Collection{
CollectionLoadInfo: collection,
LoadPercentage: 100,
Schema: resp.GetSchema(),
})
if err != nil {
return err
@ -253,6 +256,27 @@ func (m *CollectionManager) GetCollection(ctx context.Context, collectionID type
return m.collections[collectionID]
}
func (m *CollectionManager) GetCollectionSchema(ctx context.Context, collectionID typeutil.UniqueID) *schemapb.CollectionSchema {
m.rwmutex.RLock()
defer m.rwmutex.RUnlock()
collection, ok := m.collections[collectionID]
if !ok {
return nil
}
return collection.Schema
}
func (m *CollectionManager) PutCollectionSchema(ctx context.Context, collectionID typeutil.UniqueID, schema *schemapb.CollectionSchema) {
m.rwmutex.Lock()
defer m.rwmutex.Unlock()
collection, ok := m.collections[collectionID]
if !ok {
return
}
collection.Schema = schema
}
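Callers treat this cached schema as best-effort and backfill it from the coordinator on a miss, as the index checker above does; a condensed sketch of that pattern (mgr and broker stand in for the checker's dependencies):

	schema := mgr.GetCollectionSchema(ctx, collectionID)
	if schema == nil {
		// cache miss: fetch the schema once and remember it for later checks
		if resp, err := broker.DescribeCollection(ctx, collectionID); err == nil {
			schema = resp.GetSchema()
			mgr.PutCollectionSchema(ctx, collectionID, schema)
		}
	}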
func (m *CollectionManager) GetPartition(ctx context.Context, partitionID typeutil.UniqueID) *Partition {
m.rwmutex.RLock()
defer m.rwmutex.RUnlock()

@ -125,6 +125,7 @@ type Segment struct {
Version int64 // Version is the timestamp of loading segment
LastDeltaTimestamp uint64 // The timestamp of the last delta record
IndexInfo map[int64]*querypb.FieldIndexInfo // index info of loaded segment, indexID -> FieldIndexInfo
JSONIndexField []int64 // json index info of loaded segment
}
func SegmentFromInfo(info *datapb.SegmentInfo) *Segment {

@ -33,12 +33,14 @@ const (
ActionTypeGrow ActionType = iota + 1
ActionTypeReduce
ActionTypeUpdate
ActionTypeStatsUpdate
)
var ActionTypeName = map[ActionType]string{
ActionTypeGrow: "Grow",
ActionTypeReduce: "Reduce",
ActionTypeUpdate: "Update",
ActionTypeStatsUpdate: "StatsUpdate",
}
func (t ActionType) String() string {

@ -156,7 +156,7 @@ func (ex *Executor) removeTask(task Task, step int) {
func (ex *Executor) executeSegmentAction(task *SegmentTask, step int) {
switch task.Actions()[step].Type() {
case ActionTypeGrow, ActionTypeUpdate, ActionTypeStatsUpdate:
ex.loadSegment(task, step)
case ActionTypeReduce:
@ -469,6 +469,9 @@ func (ex *Executor) executeLeaderAction(task *LeaderTask, step int) {
case ActionTypeUpdate:
ex.updatePartStatsVersions(task, step)
case ActionTypeStatsUpdate:
ex.updatePartStatsVersions(task, step)
}
}

@ -49,13 +49,15 @@ const (
TaskTypeReduce
TaskTypeMove
TaskTypeUpdate
TaskTypeStatsUpdate
)
var TaskTypeName = map[Type]string{
TaskTypeGrow: "Grow",
TaskTypeReduce: "Reduce",
TaskTypeMove: "Move",
TaskTypeUpdate: "Update",
TaskTypeStatsUpdate: "StatsUpdate",
}
type Type int32

@ -95,6 +95,8 @@ func GetTaskType(task Task) Type {
return TaskTypeReduce
case task.Actions()[0].Type() == ActionTypeUpdate:
return TaskTypeUpdate
case task.Actions()[0].Type() == ActionTypeStatsUpdate:
return TaskTypeStatsUpdate
}
return 0
}
@ -132,6 +134,10 @@ func packLoadSegmentRequest(
loadScope = querypb.LoadScope_Index
}
if action.Type() == ActionTypeStatsUpdate {
loadScope = querypb.LoadScope_Stats
}
if task.Source() == utils.LeaderChecker {
loadScope = querypb.LoadScope_Delta
}
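Taken together, these branches pick the load scope sent to the QueryNode; a sketch of the effective mapping (the Full default is assumed from the function's pre-existing behavior, and fromLeaderChecker/action stand in for the surrounding variables):

	scope := querypb.LoadScope_Full // assumed default
	switch {
	case fromLeaderChecker: // checked last in the original, so it wins
		scope = querypb.LoadScope_Delta
	case action.Type() == ActionTypeStatsUpdate: // new in this change: reload JSON key stats only
		scope = querypb.LoadScope_Stats
	case action.Type() == ActionTypeUpdate:
		scope = querypb.LoadScope_Index
	}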

@ -74,22 +74,23 @@ func PackSegmentLoadInfo(segment *datapb.SegmentInfo, channelCheckpoint *msgpb.M
zap.Duration("tsLag", tsLag)) zap.Duration("tsLag", tsLag))
} }
loadInfo := &querypb.SegmentLoadInfo{
SegmentID: segment.ID,
PartitionID: segment.PartitionID,
CollectionID: segment.CollectionID,
BinlogPaths: segment.Binlogs,
NumOfRows: segment.NumOfRows,
Statslogs: segment.Statslogs,
Deltalogs: segment.Deltalogs,
Bm25Logs: segment.Bm25Statslogs,
InsertChannel: segment.InsertChannel,
IndexInfos: indexes,
StartPosition: segment.GetStartPosition(),
DeltaPosition: channelCheckpoint,
Level: segment.GetLevel(),
StorageVersion: segment.GetStorageVersion(),
IsSorted: segment.GetIsSorted(),
TextStatsLogs: segment.GetTextStatsLogs(),
JsonKeyStatsLogs: segment.GetJsonKeyStats(),
}
return loadInfo
}

@ -175,6 +175,45 @@ func (node *QueryNode) loadIndex(ctx context.Context, req *querypb.LoadSegmentsR
return status
}
func (node *QueryNode) loadStats(ctx context.Context, req *querypb.LoadSegmentsRequest) *commonpb.Status {
log := log.Ctx(ctx).With(
zap.Int64("collectionID", req.GetCollectionID()),
zap.Int64s("segmentIDs", lo.Map(req.GetInfos(), func(info *querypb.SegmentLoadInfo, _ int) int64 { return info.GetSegmentID() })),
)
status := merr.Success()
log.Info("start to load stats")
for _, info := range req.GetInfos() {
log := log.With(zap.Int64("segmentID", info.GetSegmentID()))
segment := node.manager.Segment.GetSealed(info.GetSegmentID())
if segment == nil {
log.Warn("segment not found for load stats operation")
continue
}
localSegment, ok := segment.(*segments.LocalSegment)
if !ok {
log.Warn("segment not local for load stats opeartion")
continue
}
if localSegment.IsLazyLoad() {
localSegment.SetLoadInfo(info)
localSegment.SetNeedUpdatedVersion(req.GetVersion())
node.manager.DiskCache.MarkItemNeedReload(ctx, localSegment.ID())
return nil
}
err := node.loader.LoadJSONIndex(ctx, localSegment, info)
if err != nil {
log.Warn("failed to load stats", zap.Error(err))
status = merr.Status(err)
break
}
}
return status
}
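On the receiving end, a LoadSegments request with LoadScope_Stats is what routes into this handler; a minimal request shape, trimmed to the fields the handler reads (the IDs are illustrative):

	req := &querypb.LoadSegmentsRequest{
		CollectionID: 1,
		LoadScope:    querypb.LoadScope_Stats,
		Infos: []*querypb.SegmentLoadInfo{{
			SegmentID:        2,
			JsonKeyStatsLogs: map[int64]*datapb.JsonKeyStats{101: {FieldID: 101}},
		}},
	}
	status := node.loadStats(ctx, req)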
func (node *QueryNode) queryChannel(ctx context.Context, req *querypb.QueryRequest, channel string) (*internalpb.RetrieveResults, error) {
msgID := req.Req.Base.GetMsgID()
traceID := trace.SpanFromContext(ctx).SpanContext().TraceID()
