Issue: #22837

Move and rename the following C++ APIs:

datatype_sizeof() ==> GetDataTypeSize()
datatype_name() ==> GetDataTypeName()
datatype_is_vector() / IsVectorType() ==> IsVectorDataType()
datatype_is_variable() ==> IsVariableDataType()
datatype_is_sparse_vector() ==> IsSparseFloatVectorDataType()
datatype_is_string() / IsString() ==> IsDataTypeString()
datatype_is_floating() / IsFloat() ==> IsDataTypeFloat()
datatype_is_binary() ==> IsDataTypeBinary()
datatype_is_json() ==> IsDataTypeJson()
datatype_is_array() ==> IsDataTypeArray()
datatype_is_variable() ==> IsDataTypeVariable()
datatype_is_integer() / IsIntegral() ==> IsDataTypeInteger()

Signed-off-by: Cai Yudong <yudong.cai@zilliz.com>
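A minimal sketch of what the rename means at a call site (illustrative caller code, not part of this patch; NeedsVectorIndex is a hypothetical helper):

#include "common/Types.h"  // assumed location of the renamed helpers

// Hypothetical caller updated by this patch.
bool
NeedsVectorIndex(milvus::DataType data_type) {
    // old: return datatype_is_vector(data_type);
    return milvus::IsVectorDataType(data_type);
}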
325 lines
12 KiB
C++
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#include "SegmentInterface.h"

#include <cstdint>

#include "Utils.h"
#include "common/EasyAssert.h"
#include "common/SystemProperty.h"
#include "common/Tracer.h"
#include "common/Types.h"
#include "query/generated/ExecPlanNodeVisitor.h"
namespace milvus::segcore {

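// Fill results.primary_keys_ with the primary-key values of the rows
// selected by results.seg_offsets_.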
void
SegmentInternalInterface::FillPrimaryKeys(const query::Plan* plan,
                                          SearchResult& results) const {
    std::shared_lock lck(mutex_);
    AssertInfo(plan, "empty plan");
    auto size = results.distances_.size();
    AssertInfo(results.seg_offsets_.size() == size,
               "Size of result distances is not equal to size of ids");
    Assert(results.primary_keys_.size() == 0);
    results.primary_keys_.resize(size);

    auto pk_field_id_opt = get_schema().get_primary_field_id();
    AssertInfo(pk_field_id_opt.has_value(),
               "Cannot get primary key offset from schema");
    auto pk_field_id = pk_field_id_opt.value();
    AssertInfo(IsPrimaryKeyDataType(get_schema()[pk_field_id].get_data_type()),
               "Primary key field is not INT64 or VARCHAR type");

    auto field_data =
        bulk_subscript(pk_field_id, results.seg_offsets_.data(), size);
    results.pk_type_ = DataType(field_data->type());

    ParsePksFromFieldData(results.primary_keys_, *field_data.get());
}

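// Fill every output field requested by the plan (plan->target_entries_)
// at the search result offsets.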
void
SegmentInternalInterface::FillTargetEntry(const query::Plan* plan,
                                          SearchResult& results) const {
    std::shared_lock lck(mutex_);
    AssertInfo(plan, "empty plan");
    auto size = results.distances_.size();
    AssertInfo(results.seg_offsets_.size() == size,
               "Size of result distances is not equal to size of ids");

    // fill other entries except primary key by result_offset
    for (auto field_id : plan->target_entries_) {
        auto field_data =
            bulk_subscript(field_id, results.seg_offsets_.data(), size);
        results.output_fields_data_[field_id] = std::move(field_data);
    }
}

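// Run a search plan under a shared lock and hand back the result produced
// by the plan-node visitor.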
std::unique_ptr<SearchResult>
SegmentInternalInterface::Search(
    const query::Plan* plan,
    const query::PlaceholderGroup* placeholder_group,
    Timestamp timestamp) const {
    std::shared_lock lck(mutex_);
    milvus::tracer::AddEvent("obtained_segment_lock_mutex");
    check_search(plan);
    query::ExecPlanNodeVisitor visitor(*this, timestamp, placeholder_group);
    auto results = std::make_unique<SearchResult>();
    *results = visitor.get_moved_result(*plan->plan_node_);
    results->segment_ = (void*)this;
    return results;
}

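// Run a retrieve plan, reject results whose estimated size exceeds
// limit_size, then serialize offsets, field columns and primary-key ids
// into a proto RetrieveResults.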
std::unique_ptr<proto::segcore::RetrieveResults>
SegmentInternalInterface::Retrieve(const query::RetrievePlan* plan,
                                   Timestamp timestamp,
                                   int64_t limit_size) const {
    std::shared_lock lck(mutex_);
    auto results = std::make_unique<proto::segcore::RetrieveResults>();
    query::ExecPlanNodeVisitor visitor(*this, timestamp);
    auto retrieve_results = visitor.get_retrieve_result(*plan->plan_node_);
    retrieve_results.segment_ = (void*)this;

    auto result_rows = retrieve_results.result_offsets_.size();
    int64_t output_data_size = 0;
    for (auto field_id : plan->field_ids_) {
        output_data_size += get_field_avg_size(field_id) * result_rows;
    }
    if (output_data_size > limit_size) {
        throw SegcoreError(
            RetrieveError,
            fmt::format("query results exceed the limit size {}", limit_size));
    }

    results->set_all_retrieve_count(retrieve_results.total_data_cnt_);
    if (plan->plan_node_->is_count_) {
        AssertInfo(retrieve_results.field_data_.size() == 1,
                   "count result should only have one column");
        *results->add_fields_data() = retrieve_results.field_data_[0];
        return results;
    }

    results->mutable_offset()->Add(retrieve_results.result_offsets_.begin(),
                                   retrieve_results.result_offsets_.end());

    auto fields_data = results->mutable_fields_data();
    auto ids = results->mutable_ids();
    auto pk_field_id = plan->schema_.get_primary_field_id();
    for (auto field_id : plan->field_ids_) {
        if (SystemProperty::Instance().IsSystem(field_id)) {
            auto system_type =
                SystemProperty::Instance().GetSystemFieldType(field_id);

            auto size = retrieve_results.result_offsets_.size();
            FixedVector<int64_t> output(size);
            bulk_subscript(system_type,
                           retrieve_results.result_offsets_.data(),
                           size,
                           output.data());

            auto data_array = std::make_unique<DataArray>();
            data_array->set_field_id(field_id.get());
            data_array->set_type(milvus::proto::schema::DataType::Int64);

            auto scalar_array = data_array->mutable_scalars();
            auto data = reinterpret_cast<const int64_t*>(output.data());
            auto obj = scalar_array->mutable_long_data();
            obj->mutable_data()->Add(data, data + size);
            fields_data->AddAllocated(data_array.release());
            continue;
        }

        auto& field_meta = plan->schema_[field_id];

        auto col = bulk_subscript(field_id,
                                  retrieve_results.result_offsets_.data(),
                                  retrieve_results.result_offsets_.size());
        if (field_meta.get_data_type() == DataType::ARRAY) {
            col->mutable_scalars()->mutable_array_data()->set_element_type(
                proto::schema::DataType(field_meta.get_element_type()));
        }
        auto col_data = col.release();
        fields_data->AddAllocated(col_data);
        if (pk_field_id.has_value() && pk_field_id.value() == field_id) {
            switch (field_meta.get_data_type()) {
                case DataType::INT64: {
                    auto int_ids = ids->mutable_int_id();
                    auto& src_data = col_data->scalars().long_data();
                    int_ids->mutable_data()->Add(src_data.data().begin(),
                                                 src_data.data().end());
                    break;
                }
                case DataType::VARCHAR: {
                    auto str_ids = ids->mutable_str_id();
                    auto& src_data = col_data->scalars().string_data();
                    for (auto i = 0; i < src_data.data_size(); ++i) {
                        *(str_ids->mutable_data()->Add()) = src_data.data(i);
                    }
                    break;
                }
                default: {
                    PanicInfo(DataTypeInvalid,
                              fmt::format("unsupported datatype {}",
                                          field_meta.get_data_type()));
                }
            }
        }
    }
    return results;
}

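// Count the rows visible at MAX_TIMESTAMP by running a count-only
// retrieve plan against this segment.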
int64_t
SegmentInternalInterface::get_real_count() const {
#if 0
    auto insert_cnt = get_row_count();
    BitsetType bitset_holder;
    bitset_holder.resize(insert_cnt, false);
    mask_with_delete(bitset_holder, insert_cnt, MAX_TIMESTAMP);
    return bitset_holder.size() - bitset_holder.count();
#endif
    auto plan = std::make_unique<query::RetrievePlan>(get_schema());
    plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
    plan->plan_node_->is_count_ = true;
    auto res = Retrieve(plan.get(), MAX_TIMESTAMP, INT64_MAX);
    AssertInfo(res->fields_data().size() == 1,
               "count result should only have one column");
    AssertInfo(res->fields_data()[0].has_scalars(),
               "count result should match scalar");
    AssertInfo(res->fields_data()[0].scalars().has_long_data(),
               "count result should match long data");
    AssertInfo(res->fields_data()[0].scalars().long_data().data_size() == 1,
               "count result should only have one row");
    return res->fields_data()[0].scalars().long_data().data(0);
}

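// Return the average per-row size (in bytes) of a field: the fixed schema
// size for fixed-width types, or the cached running average for
// variable-length types (0 if nothing has been recorded yet).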
int64_t
SegmentInternalInterface::get_field_avg_size(FieldId field_id) const {
    AssertInfo(field_id.get() >= 0,
               "invalid field id, should be greater than or equal to 0");
    if (SystemProperty::Instance().IsSystem(field_id)) {
        if (field_id == TimestampFieldID || field_id == RowFieldID) {
            return sizeof(int64_t);
        }

        throw SegcoreError(FieldIDInvalid, "unsupported system field id");
    }

    auto schema = get_schema();
    auto& field_meta = schema[field_id];
    auto data_type = field_meta.get_data_type();

    std::shared_lock lck(mutex_);
    if (IsVariableDataType(data_type)) {
        if (variable_fields_avg_size_.find(field_id) ==
            variable_fields_avg_size_.end()) {
            return 0;
        }

        return variable_fields_avg_size_.at(field_id).second;
    } else {
        return field_meta.get_sizeof();
    }
}

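// Fold a new batch of num_rows rows totalling field_size bytes into the
// cached (row count, average size) pair for a variable-length field.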
void
SegmentInternalInterface::set_field_avg_size(FieldId field_id,
                                             int64_t num_rows,
                                             int64_t field_size) {
    AssertInfo(field_id.get() >= 0,
               "invalid field id, should be greater than or equal to 0");
    auto schema = get_schema();
    auto& field_meta = schema[field_id];
    auto data_type = field_meta.get_data_type();

    std::unique_lock lck(mutex_);
    if (IsVariableDataType(data_type)) {
        AssertInfo(num_rows > 0,
                   "The num rows of field data should be greater than 0");
        if (variable_fields_avg_size_.find(field_id) ==
            variable_fields_avg_size_.end()) {
            variable_fields_avg_size_.emplace(field_id, std::make_pair(0, 0));
        }

        auto& field_info = variable_fields_avg_size_.at(field_id);
        auto size = field_info.first * field_info.second + field_size;
        field_info.first = field_info.first + num_rows;
        field_info.second = size / field_info.first;
    }
}

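// Filter rows newer than the query timestamp: timestamps are sorted, so
// binary-search for the first offset to drop, then clear every set bit
// from there on.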
void
SegmentInternalInterface::timestamp_filter(BitsetType& bitset,
                                           Timestamp timestamp) const {
    auto& timestamps = get_timestamps();
    auto cnt = bitset.size();
    if (timestamps[cnt - 1] <= timestamp) {
        // no need to filter out anything.
        return;
    }

    auto pilot = upper_bound(timestamps, 0, cnt, timestamp);
    // offset bigger than pilot should be filtered out.
    auto offset = pilot;
    while (offset < cnt) {
        bitset[offset] = false;

        const auto next_offset = bitset.find_next(offset);
        if (!next_offset.has_value()) {
            return;
        }
        offset = next_offset.value();
    }
}

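// Overload for an explicit offset list: probe each offset directly, which
// is cheaper than a binary search when the list is small.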
void
SegmentInternalInterface::timestamp_filter(BitsetType& bitset,
                                           const std::vector<int64_t>& offsets,
                                           Timestamp timestamp) const {
    auto& timestamps = get_timestamps();
    auto cnt = bitset.size();
    if (timestamps[cnt - 1] <= timestamp) {
        // no need to filter out anything.
        return;
    }

    // point query, faster than binary search.
    for (auto& offset : offsets) {
        if (timestamps[offset] > timestamp) {
            bitset.set(offset, true);
        }
    }
}

const SkipIndex&
SegmentInternalInterface::GetSkipIndex() const {
    return skip_index_;
}

void
SegmentInternalInterface::LoadPrimitiveSkipIndex(milvus::FieldId field_id,
                                                 int64_t chunk_id,
                                                 milvus::DataType data_type,
                                                 const void* chunk_data,
                                                 int64_t count) {
    skip_index_.LoadPrimitive(field_id, chunk_id, data_type, chunk_data, count);
}

void
SegmentInternalInterface::LoadStringSkipIndex(
    milvus::FieldId field_id,
    int64_t chunk_id,
    const milvus::VariableColumn<std::string>& var_column) {
    skip_index_.LoadString(field_id, chunk_id, var_column);
}

}  // namespace milvus::segcore