milvus/internal/core/src/common/FieldMeta.cpp
Buqian Zheng f7b811450d
feat: add enable_tokenizer params to VarChar field (#36480)
issue: #35922

add an enable_tokenizer param to varchar field: it must be set to true so
that a varchar field can set enable_match or be used as input of a BM25 function

---------

Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
2024-10-10 20:33:21 +08:00

146 lines
4.6 KiB
C++

// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "common/FieldMeta.h"
#include "common/SystemProperty.h"
#include "common/protobuf_utils.h"
#include <boost/lexical_cast.hpp>
#include "Consts.h"
namespace milvus {
// Extracts the "tokenizer_params" entry from a field's type params and
// flattens the JSON document it holds into a string-to-string map.
//
// Returns an empty result when the "tokenizer_params" key is absent.
// For every item of the parsed JSON, a string value is stored verbatim and
// any other value is stored as its serialized JSON text (dump()).
// The JSON text is parsed without local error handling, so a parse failure
// is reported by nlohmann::json::parse itself to the caller.
TokenizerParams
ParseTokenizerParams(const TypeParams& params) {
    auto iter = params.find("tokenizer_params");
    if (iter == params.end()) {
        return {};
    }
    nlohmann::json j = nlohmann::json::parse(iter->second);
    std::map<std::string, std::string> ret;
    for (const auto& [k, v] : j.items()) {
        // Decide on the JSON type up front instead of relying on a thrown
        // and caught exception for every non-string value.
        ret[k] = v.is_string() ? v.get<std::string>() : v.dump();
    }
    return ret;
}
bool
FieldMeta::enable_match() const {
if (!IsStringDataType(type_)) {
return false;
}
if (!string_info_.has_value()) {
return false;
}
return string_info_->enable_match;
}
// Reports the enable_tokenizer flag of this field. Fields that are not of a
// string data type, or that have no string info attached, always report
// false.
bool
FieldMeta::enable_tokenizer() const {
    if (IsStringDataType(type_) && string_info_.has_value()) {
        return string_info_->enable_tokenizer;
    }
    return false;
}
// Returns the tokenizer parameters parsed from this field's stored type
// params.
//
// Precondition (asserted): enable_tokenizer() is true, which also implies
// that string_info_ holds a value and can be dereferenced below.
TokenizerParams
FieldMeta::get_tokenizer_params() const {
    Assert(enable_tokenizer());
    // Hand the stored map straight to the parser; it only needs a const
    // reference, so no intermediate copy is made.
    return ParseTokenizerParams(string_info_->params);
}
// Builds a FieldMeta from a protobuf FieldSchema.
//
// The schema's data type selects which FieldMeta constructor is used:
//   - vector types: reads "dim" from type_params (skipped for sparse float
//     vectors, which keep dim == 0) and an optional "metric_type" from
//     index_params;
//   - string types: requires MAX_LENGTH in type_params and reads the
//     optional boolean flags "enable_tokenizer" and "enable_match";
//   - array types: forwards the schema's element type;
//   - everything else: name, id, data type and nullable only.
//
// Field ids below 100 are verified against SystemProperty and asserted to
// be valid system fields. Missing "dim" / MAX_LENGTH entries trigger
// AssertInfo; numeric values are converted with boost::lexical_cast, whose
// conversion failures are not handled here.
FieldMeta
FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
    auto field_id = FieldId(schema_proto.fieldid());
    auto name = FieldName(schema_proto.name());
    auto nullable = schema_proto.nullable();
    if (field_id.get() < 100) {
        // system field id
        auto is_system =
            SystemProperty::Instance().SystemFieldVerify(name, field_id);
        AssertInfo(is_system,
                   "invalid system type: name(" + name.get() + "), id(" +
                       std::to_string(field_id.get()) + ")");
    }

    auto data_type = DataType(schema_proto.data_type());

    if (IsVectorDataType(data_type)) {
        auto type_map = RepeatedKeyValToMap(schema_proto.type_params());
        auto index_map = RepeatedKeyValToMap(schema_proto.index_params());
        int64_t dim = 0;
        if (!IsSparseFloatVectorDataType(data_type)) {
            AssertInfo(type_map.count("dim"), "dim not found");
            dim = boost::lexical_cast<int64_t>(type_map.at("dim"));
        }
        // NOTE(review): the trailing `false` is presumably the nullable
        // flag, i.e. vector fields ignore the schema's nullable setting --
        // confirm against the FieldMeta constructor.
        if (!index_map.count("metric_type")) {
            return FieldMeta{
                name, field_id, data_type, dim, std::nullopt, false};
        }
        auto metric_type = index_map.at("metric_type");
        return FieldMeta{name, field_id, data_type, dim, metric_type, false};
    }

    if (IsStringDataType(data_type)) {
        auto type_map = RepeatedKeyValToMap(schema_proto.type_params());
        AssertInfo(type_map.count(MAX_LENGTH), "max_length not found");
        auto max_len = boost::lexical_cast<int64_t>(type_map.at(MAX_LENGTH));

        // Reads an optional boolean type param, case-insensitively.
        // Returns false when the key is absent or the value cannot be
        // read as a boolalpha bool.
        auto get_bool_value = [&](const std::string& key) -> bool {
            auto it = type_map.find(key);
            if (it == type_map.end()) {
                return false;
            }
            auto param_str = it->second;
            // Go through unsigned char: calling tolower with a negative
            // plain-char value is undefined behaviour.
            std::transform(param_str.begin(),
                           param_str.end(),
                           param_str.begin(),
                           [](unsigned char c) {
                               return static_cast<char>(::tolower(c));
                           });
            std::istringstream ss(param_str);
            // Must start out false: for an empty value the extraction
            // never stores anything, and an uninitialized bool would be
            // returned.
            bool b = false;
            ss >> std::boolalpha >> b;
            return b;
        };

        bool enable_tokenizer = get_bool_value("enable_tokenizer");
        bool enable_match = get_bool_value("enable_match");
        return FieldMeta{name,
                         field_id,
                         data_type,
                         max_len,
                         nullable,
                         enable_match,
                         enable_tokenizer,
                         type_map};
    }

    if (IsArrayDataType(data_type)) {
        return FieldMeta{name,
                         field_id,
                         data_type,
                         DataType(schema_proto.element_type()),
                         nullable};
    }

    return FieldMeta{name, field_id, data_type, nullable};
}
} // namespace milvus