Reverse data from scalar index (#17145)

Signed-off-by: xige-16 <xi.ge@zilliz.com>
This commit is contained in:
xige-16 2022-05-26 14:58:01 +08:00 committed by GitHub
parent 3928da6493
commit 56778787be
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 1403 additions and 204 deletions

View File

@ -24,6 +24,7 @@ endif ()
target_link_libraries(milvus_index
milvus_proto
milvus_exceptions
knowhere
${PLATFORM_LIBS}
)

View File

@ -38,6 +38,9 @@ class ScalarIndex : public IndexBase {
virtual const TargetBitmapPtr
Range(T lower_bound_value, bool lb_inclusive, T upper_bound_value, bool ub_inclusive) = 0;
virtual T
Reverse_Lookup(size_t offset) const = 0;
const TargetBitmapPtr
Query(const DatasetPtr& dataset) override;
};

View File

@ -42,6 +42,7 @@ template <typename T>
inline void
ScalarIndexSort<T>::Build(const size_t n, const T* values) {
data_.reserve(n);
idx_to_offsets_.resize(n);
T* p = const_cast<T*>(values);
for (size_t i = 0; i < n; ++i) {
data_.emplace_back(IndexStructure(*p++, i));
@ -59,15 +60,16 @@ ScalarIndexSort<T>::build() {
throw std::invalid_argument("ScalarIndexSort cannot build null values!");
}
std::sort(data_.begin(), data_.end());
for (size_t i = 0; i < data_.size(); ++i) {
idx_to_offsets_[data_[i].idx_] = i;
}
is_built_ = true;
}
template <typename T>
inline BinarySet
ScalarIndexSort<T>::Serialize(const Config& config) {
if (!is_built_) {
build();
}
AssertInfo(is_built_, "index has not been built");
auto index_data_size = data_.size() * sizeof(IndexStructure<T>);
std::shared_ptr<uint8_t[]> index_data(new uint8_t[index_data_size]);
@ -92,16 +94,18 @@ ScalarIndexSort<T>::Load(const BinarySet& index_binary) {
auto index_data = index_binary.GetByName("index_data");
data_.resize(index_size);
idx_to_offsets_.resize(index_size);
memcpy(data_.data(), index_data->data.get(), (size_t)index_data->size);
for (size_t i = 0; i < data_.size(); ++i) {
idx_to_offsets_[data_[i].idx_] = i;
}
is_built_ = true;
}
template <typename T>
inline const TargetBitmapPtr
ScalarIndexSort<T>::In(const size_t n, const T* values) {
if (!is_built_) {
build();
}
AssertInfo(is_built_, "index has not been built");
TargetBitmapPtr bitset = std::make_unique<TargetBitmap>(data_.size());
for (size_t i = 0; i < n; ++i) {
auto lb = std::lower_bound(data_.begin(), data_.end(), IndexStructure<T>(*(values + i)));
@ -120,9 +124,7 @@ ScalarIndexSort<T>::In(const size_t n, const T* values) {
template <typename T>
inline const TargetBitmapPtr
ScalarIndexSort<T>::NotIn(const size_t n, const T* values) {
if (!is_built_) {
build();
}
AssertInfo(is_built_, "index has not been built");
TargetBitmapPtr bitset = std::make_unique<TargetBitmap>(data_.size());
bitset->set();
for (size_t i = 0; i < n; ++i) {
@ -142,9 +144,7 @@ ScalarIndexSort<T>::NotIn(const size_t n, const T* values) {
template <typename T>
inline const TargetBitmapPtr
ScalarIndexSort<T>::Range(const T value, const OpType op) {
if (!is_built_) {
build();
}
AssertInfo(is_built_, "index has not been built");
TargetBitmapPtr bitset = std::make_unique<TargetBitmap>(data_.size());
auto lb = data_.begin();
auto ub = data_.end();
@ -173,13 +173,11 @@ ScalarIndexSort<T>::Range(const T value, const OpType op) {
template <typename T>
inline const TargetBitmapPtr
ScalarIndexSort<T>::Range(T lower_bound_value, bool lb_inclusive, T upper_bound_value, bool ub_inclusive) {
if (!is_built_) {
build();
}
AssertInfo(is_built_, "index has not been built");
TargetBitmapPtr bitset = std::make_unique<TargetBitmap>(data_.size());
if (lower_bound_value > upper_bound_value) {
std::swap(lower_bound_value, upper_bound_value);
std::swap(lb_inclusive, ub_inclusive);
if (lower_bound_value > upper_bound_value ||
(lower_bound_value == upper_bound_value && !(lb_inclusive && ub_inclusive))) {
return bitset;
}
auto lb = data_.begin();
auto ub = data_.end();
@ -199,4 +197,14 @@ ScalarIndexSort<T>::Range(T lower_bound_value, bool lb_inclusive, T upper_bound_
return bitset;
}
template <typename T>
inline T
ScalarIndexSort<T>::Reverse_Lookup(size_t idx) const {
    // Check build state first: idx_to_offsets_ is only populated during
    // build()/Load(), so a bounds check against it on an unbuilt index would
    // fire with a misleading "out of range" message instead of "not built".
    AssertInfo(is_built_, "index has not been built");
    AssertInfo(idx < idx_to_offsets_.size(), "out of range of total count");
    // idx_to_offsets_ maps the original insertion index back to the row's
    // position in the sorted data_ array; a_ holds the raw value.
    auto offset = idx_to_offsets_[idx];
    return data_[offset].a_;
}
} // namespace milvus::scalar

View File

@ -15,9 +15,9 @@
#include <memory>
#include <utility>
#include <vector>
#include <string>
#include "knowhere/common/Exception.h"
#include "index/IndexStructure.h"
#include <string>
#include "index/ScalarIndex.h"
namespace milvus::scalar {
@ -60,6 +60,9 @@ class ScalarIndexSort : public ScalarIndex<T> {
const TargetBitmapPtr
Range(T lower_bound_value, bool lb_inclusive, T upper_bound_value, bool ub_inclusive) override;
T
Reverse_Lookup(size_t offset) const override;
public:
const std::vector<IndexStructure<T>>&
GetData() {
@ -78,6 +81,7 @@ class ScalarIndexSort : public ScalarIndex<T> {
private:
bool is_built_;
std::vector<size_t> idx_to_offsets_; // used to retrieve.
std::vector<IndexStructure<T>> data_;
};

View File

@ -9,10 +9,6 @@
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "index/StringIndexMarisa.h"
#include "index/Utils.h"
#include "index/Index.h"
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_io.hpp>
#include <boost/uuid/uuid_generators.hpp>
@ -20,7 +16,11 @@
#include <stdio.h>
#include <fcntl.h>
#include <knowhere/common/Utils.h>
#include "exceptions/EasyAssert.h"
#include "index/StringIndexMarisa.h"
#include "index/Utils.h"
#include "index/Index.h"
#include "common/Utils.h"
namespace milvus::scalar {
@ -150,7 +150,35 @@ StringIndexMarisa::NotIn(size_t n, const std::string* values) {
const TargetBitmapPtr
StringIndexMarisa::Range(std::string value, OpType op) {
    // Evaluate a unary range predicate (<, <=, >, >=) against every row by
    // reverse-looking-up each row's raw string from the marisa trie.
    // NOTE: the stale `throw std::runtime_error("todo: unsupported now")`
    // left over from the previous stub made this whole body unreachable;
    // it has been removed.
    auto count = Count();
    TargetBitmapPtr bitset = std::make_unique<TargetBitmap>(count);
    marisa::Agent agent;
    for (size_t offset = 0; offset < count; ++offset) {
        // str_ids_[offset] is the trie key id stored for this row; recover
        // the original string via marisa reverse lookup.
        agent.set_query(str_ids_[offset]);
        trie_.reverse_lookup(agent);
        std::string raw_data(agent.key().ptr(), agent.key().length());
        bool set = false;
        switch (op) {
            case OpType::LessThan:
                set = raw_data.compare(value) < 0;
                break;
            case OpType::LessEqual:
                set = raw_data.compare(value) <= 0;
                break;
            case OpType::GreaterThan:
                set = raw_data.compare(value) > 0;
                break;
            case OpType::GreaterEqual:
                set = raw_data.compare(value) >= 0;
                break;
            default:
                throw std::invalid_argument(std::string("Invalid OperatorType: ") + std::to_string((int)op) + "!");
        }
        if (set) {
            bitset->set(offset);
        }
    }
    return bitset;
}
const TargetBitmapPtr
StringIndexMarisa::Range(std::string lower_bound_value,
                         bool lb_inclusive,
                         std::string upper_bound_value,
                         bool ub_inclusive) {
    // Evaluate a two-sided range predicate against every row by
    // reverse-looking-up each row's raw string from the marisa trie.
    // NOTE: the stale `throw std::runtime_error("todo: unsupported now")`
    // left over from the previous stub made this whole body unreachable;
    // it has been removed.
    auto count = Count();
    TargetBitmapPtr bitset = std::make_unique<TargetBitmap>(count);
    // Empty interval: lower > upper, or lower == upper with at least one
    // exclusive bound — return an all-zero bitset without scanning.
    if (lower_bound_value.compare(upper_bound_value) > 0 ||
        (lower_bound_value.compare(upper_bound_value) == 0 && !(lb_inclusive && ub_inclusive))) {
        return bitset;
    }
    marisa::Agent agent;
    for (size_t offset = 0; offset < count; ++offset) {
        agent.set_query(str_ids_[offset]);
        trie_.reverse_lookup(agent);
        std::string raw_data(agent.key().ptr(), agent.key().length());
        bool set = true;
        if (lb_inclusive) {
            set &= raw_data.compare(lower_bound_value) >= 0;
        } else {
            set &= raw_data.compare(lower_bound_value) > 0;
        }
        if (ub_inclusive) {
            set &= raw_data.compare(upper_bound_value) <= 0;
        } else {
            set &= raw_data.compare(upper_bound_value) < 0;
        }
        if (set) {
            bitset->set(offset);
        }
    }
    return bitset;
}
const TargetBitmapPtr
@ -215,6 +269,15 @@ StringIndexMarisa::prefix_match(const std::string& prefix) {
return ret;
}
std::string
StringIndexMarisa::Reverse_Lookup(size_t offset) const {
AssertInfo(offset < str_ids_.size(), "out of range of total count");
marisa::Agent agent;
agent.set_query(str_ids_[offset]);
trie_.reverse_lookup(agent);
return std::string(agent.key().ptr(), agent.key().length());
}
#endif
} // namespace milvus::scalar

View File

@ -58,6 +58,9 @@ class StringIndexMarisa : public StringIndex {
const TargetBitmapPtr
PrefixMatch(std::string prefix) override;
std::string
Reverse_Lookup(size_t offset) const override;
private:
void
fill_str_ids(size_t n, const std::string* values);

View File

@ -29,6 +29,8 @@
namespace milvus::query {
using optype = proto::plan::OpType;
class ExprVisitor;
// Base of all Exprs

View File

@ -64,9 +64,9 @@ class ExecExprVisitor : public ExprVisitor {
auto
ExecRangeVisitorImpl(FieldId field_id, IndexFunc func, ElementFunc element_func) -> BitsetType;
template <typename T, typename ElementFunc>
template <typename T, typename IndexFunc, typename ElementFunc>
auto
ExecDataRangeVisitorImpl(FieldId field_id, ElementFunc element_func) -> BitsetType;
ExecDataRangeVisitorImpl(FieldId field_id, IndexFunc index_func, ElementFunc element_func) -> BitsetType;
template <typename T>
auto

View File

@ -179,16 +179,25 @@ ExecExprVisitor::ExecRangeVisitorImpl(FieldId field_id, IndexFunc index_func, El
return final_result;
}
template <typename T, typename ElementFunc>
template <typename T, typename IndexFunc, typename ElementFunc>
auto
ExecExprVisitor::ExecDataRangeVisitorImpl(FieldId field_id, ElementFunc element_func) -> BitsetType {
ExecExprVisitor::ExecDataRangeVisitorImpl(FieldId field_id, IndexFunc index_func, ElementFunc element_func)
-> BitsetType {
auto& schema = segment_.get_schema();
auto& field_meta = schema[field_id];
auto size_per_chunk = segment_.size_per_chunk();
auto num_chunk = upper_div(row_count_, size_per_chunk);
auto indexing_barrier = segment_.num_chunk_index(field_id);
auto data_barrier = segment_.num_chunk_data(field_id);
AssertInfo(std::max(data_barrier, indexing_barrier) == num_chunk,
"max(data_barrier, index_barrier) not equal to num_chunk");
std::deque<BitsetType> results;
for (auto chunk_id = 0; chunk_id < num_chunk; ++chunk_id) {
// for growing segment, indexing_barrier will always less than data_barrier
// so growing segment will always execute expr plan using raw data
// if sealed segment has loaded raw data on this field, then index_barrier = 0 and data_barrier = 1
// in this case, sealed segment execute expr plan using raw data
for (auto chunk_id = 0; chunk_id < data_barrier; ++chunk_id) {
auto this_size = chunk_id == num_chunk - 1 ? row_count_ - chunk_id * size_per_chunk : size_per_chunk;
BitsetType result(this_size);
auto chunk = segment_.chunk_data<T>(field_id, chunk_id);
@ -199,6 +208,20 @@ ExecExprVisitor::ExecDataRangeVisitorImpl(FieldId field_id, ElementFunc element_
AssertInfo(result.size() == this_size, "[ExecExprVisitor]Chunk result size not equal to expected size");
results.emplace_back(std::move(result));
}
// if sealed segment has loaded scalar index for this field, then index_barrier = 1 and data_barrier = 0
// in this case, sealed segment execute expr plan using scalar index
using Index = scalar::ScalarIndex<T>;
for (auto chunk_id = data_barrier; chunk_id < indexing_barrier; ++chunk_id) {
auto& indexing = segment_.chunk_scalar_index<T>(field_id, chunk_id);
auto this_size = const_cast<Index*>(&indexing)->Count();
BitsetType result(this_size);
for (int offset = 0; offset < this_size; ++offset) {
result[offset] = index_func(const_cast<Index*>(&indexing), offset);
}
results.emplace_back(std::move(result));
}
auto final_result = Assemble(results);
AssertInfo(final_result.size() == row_count_, "[ExecExprVisitor]Final result size not equal to row count");
return final_result;
@ -278,26 +301,46 @@ ExecExprVisitor::ExecBinaryArithOpEvalRangeVisitorDispatcher(BinaryArithOpEvalRa
case OpType::Equal: {
switch (arith_op) {
case ArithOpType::Add: {
auto index_func = [val, right_operand](Index* index, size_t offset) {
auto x = index->Reverse_Lookup(offset);
return (x + right_operand) == val;
};
auto elem_func = [val, right_operand](T x) { return ((x + right_operand) == val); };
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case ArithOpType::Sub: {
auto index_func = [val, right_operand](Index* index, size_t offset) {
auto x = index->Reverse_Lookup(offset);
return (x - right_operand) == val;
};
auto elem_func = [val, right_operand](T x) { return ((x - right_operand) == val); };
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case ArithOpType::Mul: {
auto index_func = [val, right_operand](Index* index, size_t offset) {
auto x = index->Reverse_Lookup(offset);
return (x * right_operand) == val;
};
auto elem_func = [val, right_operand](T x) { return ((x * right_operand) == val); };
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case ArithOpType::Div: {
auto index_func = [val, right_operand](Index* index, size_t offset) {
auto x = index->Reverse_Lookup(offset);
return (x / right_operand) == val;
};
auto elem_func = [val, right_operand](T x) { return ((x / right_operand) == val); };
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case ArithOpType::Mod: {
auto index_func = [val, right_operand](Index* index, size_t offset) {
auto x = index->Reverse_Lookup(offset);
return static_cast<T>(fmod(x, right_operand)) == val;
};
auto elem_func = [val, right_operand](T x) {
return (static_cast<T>(fmod(x, right_operand)) == val);
};
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
default: {
PanicInfo("unsupported arithmetic operation");
@ -307,26 +350,46 @@ ExecExprVisitor::ExecBinaryArithOpEvalRangeVisitorDispatcher(BinaryArithOpEvalRa
case OpType::NotEqual: {
switch (arith_op) {
case ArithOpType::Add: {
auto index_func = [val, right_operand](Index* index, size_t offset) {
auto x = index->Reverse_Lookup(offset);
return (x + right_operand) != val;
};
auto elem_func = [val, right_operand](T x) { return ((x + right_operand) != val); };
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case ArithOpType::Sub: {
auto index_func = [val, right_operand](Index* index, size_t offset) {
auto x = index->Reverse_Lookup(offset);
return (x - right_operand) != val;
};
auto elem_func = [val, right_operand](T x) { return ((x - right_operand) != val); };
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case ArithOpType::Mul: {
auto index_func = [val, right_operand](Index* index, size_t offset) {
auto x = index->Reverse_Lookup(offset);
return (x * right_operand) != val;
};
auto elem_func = [val, right_operand](T x) { return ((x * right_operand) != val); };
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case ArithOpType::Div: {
auto index_func = [val, right_operand](Index* index, size_t offset) {
auto x = index->Reverse_Lookup(offset);
return (x / right_operand) != val;
};
auto elem_func = [val, right_operand](T x) { return ((x / right_operand) != val); };
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case ArithOpType::Mod: {
auto index_func = [val, right_operand](Index* index, size_t offset) {
auto x = index->Reverse_Lookup(offset);
return static_cast<T>(fmod(x, right_operand)) != val;
};
auto elem_func = [val, right_operand](T x) {
return (static_cast<T>(fmod(x, right_operand)) != val);
};
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
default: {
PanicInfo("unsupported arithmetic operation");
@ -351,11 +414,7 @@ ExecExprVisitor::ExecBinaryRangeVisitorDispatcher(BinaryRangeExpr& expr_raw) ->
bool upper_inclusive = expr.upper_inclusive_;
T val1 = expr.lower_value_;
T val2 = expr.upper_value_;
// TODO: disable check?
if (val1 > val2 || (val1 == val2 && !(lower_inclusive && upper_inclusive))) {
BitsetType res(row_count_, false);
return res;
}
auto index_func = [=](Index* index) { return index->Range(val1, lower_inclusive, val2, upper_inclusive); };
if (lower_inclusive && upper_inclusive) {
auto elem_func = [val1, val2](T x) { return (val1 <= x && x <= val2); };
@ -524,48 +583,109 @@ ExecExprVisitor::ExecCompareExprDispatcher(CompareExpr& expr, Op op) -> BitsetTy
auto size_per_chunk = segment_.size_per_chunk();
auto num_chunk = upper_div(row_count_, size_per_chunk);
std::deque<BitsetType> bitsets;
// check for sealed segment, load either raw field data or index
auto left_indexing_barrier = segment_.num_chunk_index(expr.left_field_id_);
auto left_data_barrier = segment_.num_chunk_data(expr.left_field_id_);
AssertInfo(std::max(left_data_barrier, left_indexing_barrier) == num_chunk,
"max(left_data_barrier, left_indexing_barrier) not equal to num_chunk");
auto right_indexing_barrier = segment_.num_chunk_index(expr.right_field_id_);
auto right_data_barrier = segment_.num_chunk_data(expr.right_field_id_);
AssertInfo(std::max(right_data_barrier, right_indexing_barrier) == num_chunk,
"max(right_data_barrier, right_indexing_barrier) not equal to num_chunk");
for (int64_t chunk_id = 0; chunk_id < num_chunk; ++chunk_id) {
auto size = chunk_id == num_chunk - 1 ? row_count_ - chunk_id * size_per_chunk : size_per_chunk;
auto getChunkData = [&, chunk_id](DataType type, FieldId field_id) -> std::function<const number(int)> {
auto getChunkData = [&, chunk_id](DataType type, FieldId field_id,
int64_t data_barrier) -> std::function<const number(int)> {
switch (type) {
case DataType::BOOL: {
auto chunk_data = segment_.chunk_data<bool>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
if (chunk_id < data_barrier) {
auto chunk_data = segment_.chunk_data<bool>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
} else {
// for case, sealed segment has loaded index for scalar field instead of raw data
auto& indexing = segment_.chunk_scalar_index<bool>(field_id, chunk_id);
return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); };
}
}
case DataType::INT8: {
auto chunk_data = segment_.chunk_data<int8_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
if (chunk_id < data_barrier) {
auto chunk_data = segment_.chunk_data<int8_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
} else {
// for case, sealed segment has loaded index for scalar field instead of raw data
auto& indexing = segment_.chunk_scalar_index<int8_t>(field_id, chunk_id);
return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); };
}
}
case DataType::INT16: {
auto chunk_data = segment_.chunk_data<int16_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
if (chunk_id < data_barrier) {
auto chunk_data = segment_.chunk_data<int16_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
} else {
// for case, sealed segment has loaded index for scalar field instead of raw data
auto& indexing = segment_.chunk_scalar_index<int16_t>(field_id, chunk_id);
return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); };
}
}
case DataType::INT32: {
auto chunk_data = segment_.chunk_data<int32_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
if (chunk_id < data_barrier) {
auto chunk_data = segment_.chunk_data<int32_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
} else {
// for case, sealed segment has loaded index for scalar field instead of raw data
auto& indexing = segment_.chunk_scalar_index<int32_t>(field_id, chunk_id);
return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); };
}
}
case DataType::INT64: {
auto chunk_data = segment_.chunk_data<int64_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
if (chunk_id < data_barrier) {
auto chunk_data = segment_.chunk_data<int64_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
} else {
// for case, sealed segment has loaded index for scalar field instead of raw data
auto& indexing = segment_.chunk_scalar_index<int64_t>(field_id, chunk_id);
return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); };
}
}
case DataType::FLOAT: {
auto chunk_data = segment_.chunk_data<float>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
if (chunk_id < data_barrier) {
auto chunk_data = segment_.chunk_data<float>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
} else {
// for case, sealed segment has loaded index for scalar field instead of raw data
auto& indexing = segment_.chunk_scalar_index<float>(field_id, chunk_id);
return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); };
}
}
case DataType::DOUBLE: {
auto chunk_data = segment_.chunk_data<double>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
if (chunk_id < data_barrier) {
auto chunk_data = segment_.chunk_data<double>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
} else {
// for case, sealed segment has loaded index for scalar field instead of raw data
auto& indexing = segment_.chunk_scalar_index<double>(field_id, chunk_id);
return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); };
}
}
case DataType::VARCHAR: {
auto chunk_data = segment_.chunk_data<std::string>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
if (chunk_id < data_barrier) {
auto chunk_data = segment_.chunk_data<std::string>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
} else {
// for case, sealed segment has loaded index for scalar field instead of raw data
auto& indexing = segment_.chunk_scalar_index<std::string>(field_id, chunk_id);
return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); };
}
}
default:
PanicInfo("unsupported datatype");
}
};
auto left = getChunkData(expr.left_data_type_, expr.left_field_id_);
auto right = getChunkData(expr.right_data_type_, expr.right_field_id_);
auto left = getChunkData(expr.left_data_type_, expr.left_field_id_, left_data_barrier);
auto right = getChunkData(expr.right_data_type_, expr.right_field_id_, right_data_barrier);
BitsetType bitset(size);
for (int i = 0; i < size; ++i) {

View File

@ -97,6 +97,13 @@ class SegmentGrowingImpl : public SegmentGrowing {
return indexing_record_.get_finished_ack();
}
// count of chunk that has raw data
int64_t
num_chunk_data(FieldId field_id) const final {
    // Rows acked so far, split into fixed-size chunks (rounded up).
    const auto acked_rows = get_insert_record().ack_responder_.GetAck();
    const auto chunk_rows = segcore_config_.get_chunk_rows();
    return upper_div(acked_rows, chunk_rows);
}
// deprecated
const knowhere::Index*
chunk_index_impl(FieldId field_id, int64_t chunk_id) const final {

View File

@ -133,6 +133,10 @@ class SegmentInternalInterface : public SegmentInterface {
virtual int64_t
num_chunk_index(FieldId field_id) const = 0;
// count of chunk that has raw data
virtual int64_t
num_chunk_data(FieldId field_id) const = 0;
virtual void
mask_with_timestamps(BitsetType& bitset_chunk, Timestamp timestamp) const = 0;

View File

@ -80,6 +80,7 @@ void
SegmentSealedImpl::LoadVecIndex(const LoadIndexInfo& info) {
// NOTE: lock only when data is ready to avoid starvation
auto field_id = FieldId(info.field_id);
auto& field_meta = schema_->operator[](field_id);
auto index = std::dynamic_pointer_cast<knowhere::VecIndex>(info.index);
AssertInfo(info.index_params.count("metric_type"), "Can't get metric_type in index_params");
@ -88,8 +89,11 @@ SegmentSealedImpl::LoadVecIndex(const LoadIndexInfo& info) {
AssertInfo(row_count > 0, "Index count is 0");
std::unique_lock lck(mutex_);
AssertInfo(!get_bit(vecindex_ready_bitset_, field_id),
"Can't get bitset element at " + std::to_string(field_id.get()));
// Don't allow vector raw data and index exist at the same time
AssertInfo(!get_bit(field_data_ready_bitset_, field_id),
"vector index can't be loaded when raw data exists at field " + std::to_string(field_id.get()));
AssertInfo(!get_bit(index_ready_bitset_, field_id),
"vector index has been exist at " + std::to_string(field_id.get()));
if (row_count_opt_.has_value()) {
AssertInfo(row_count_opt_.value() == row_count, "load data has different row count from other columns");
} else {
@ -98,7 +102,8 @@ SegmentSealedImpl::LoadVecIndex(const LoadIndexInfo& info) {
AssertInfo(!vector_indexings_.is_ready(field_id), "vec index is not ready");
vector_indexings_.append_field_indexing(field_id, GetMetricType(metric_type_str), index);
set_bit(vecindex_ready_bitset_, field_id, true);
set_bit(index_ready_bitset_, field_id, true);
update_row_count(row_count);
lck.unlock();
}
@ -106,22 +111,52 @@ void
SegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) {
// NOTE: lock only when data is ready to avoid starvation
auto field_id = FieldId(info.field_id);
auto& field_meta = schema_->operator[](field_id);
auto index = std::dynamic_pointer_cast<scalar::IndexBase>(info.index);
auto row_count = index->Count();
AssertInfo(row_count > 0, "Index count is 0");
std::unique_lock lck(mutex_);
// Don't allow scalar raw data and index exist at the same time
AssertInfo(!get_bit(field_data_ready_bitset_, field_id),
"scalar index can't be loaded when raw data exists at field " + std::to_string(field_id.get()));
AssertInfo(!get_bit(index_ready_bitset_, field_id),
"scalar index has been exist at " + std::to_string(field_id.get()));
if (row_count_opt_.has_value()) {
AssertInfo(row_count_opt_.value() == row_count, "load data has different row count from other columns");
} else {
row_count_opt_ = row_count;
}
scalar_indexings_[field_id] = std::move(index);
scalar_indexings_[field_id] = index;
// reverse pk from scalar index and set pks to offset
if (schema_->get_primary_field_id() == field_id) {
AssertInfo(field_id.get() != -1, "Primary key is -1");
AssertInfo(pk2offset_.empty(), "already exists");
switch (field_meta.get_data_type()) {
case DataType::INT64: {
auto int64_index = std::dynamic_pointer_cast<scalar::ScalarIndex<int64_t>>(info.index);
for (int i = 0; i < row_count; ++i) {
pk2offset_.insert(std::make_pair(int64_index->Reverse_Lookup(i), i));
}
break;
}
case DataType::VARCHAR: {
auto string_index = std::dynamic_pointer_cast<scalar::ScalarIndex<std::string>>(info.index);
for (int i = 0; i < row_count; ++i) {
pk2offset_.insert(std::make_pair(string_index->Reverse_Lookup(i), i));
}
break;
}
default: {
PanicInfo("unsupported primary key type");
}
}
}
set_bit(field_data_ready_bitset_, field_id, true);
set_bit(index_ready_bitset_, field_id, true);
update_row_count(row_count);
lck.unlock();
}
@ -167,12 +202,15 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
auto data_type = field_meta.get_data_type();
AssertInfo(data_type == DataType(info.field_data->type()),
"field type of load data is inconsistent with the schema");
auto field_data = insert_record_.get_field_data_base(field_id);
AssertInfo(field_data->empty(), "already exists");
// write data under lock
std::unique_lock lck(mutex_);
// Don't allow raw data and index exist at the same time
AssertInfo(!get_bit(index_ready_bitset_, field_id), "field data can't be loaded when indexing exists");
auto field_data = insert_record_.get_field_data_base(field_id);
AssertInfo(field_data->empty(), "already exists");
// insert data to insertRecord
field_data->fill_chunk_data(size, info.field_data, field_meta);
AssertInfo(field_data->num_chunk() == 1, "num chunk not equal to 1 for sealed segment");
@ -188,15 +226,6 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
}
}
if (field_meta.is_vector()) {
AssertInfo(!vector_indexings_.is_ready(field_id), "field data can't be loaded when indexing exists");
} else if (!scalar_indexings_.count(field_id)) {
// generate scalar index
std::unique_ptr<knowhere::Index> index;
index = query::generate_scalar_index(field_data->get_span_base(0), data_type);
scalar_indexings_[field_id] = std::move(index);
}
set_bit(field_data_ready_bitset_, field_id, true);
}
update_row_count(info.row_count);
@ -224,9 +253,22 @@ SegmentSealedImpl::LoadDeletedRecord(const LoadDeletedRecordInfo& info) {
deleted_record_.record_size_ = size;
}
// internal API: support scalar index only
int64_t
SegmentSealedImpl::num_chunk_index(FieldId field_id) const {
    // NOTE: the stale `return 1;` left over from the previous stub made the
    // body below unreachable; it has been removed.
    auto& field_meta = schema_->operator[](field_id);
    if (field_meta.is_vector()) {
        // 0 or 1, depending on whether the vector index has been loaded.
        return int64_t(vector_indexings_.is_ready(field_id));
    }
    // Scalar field: one index chunk iff a scalar index was loaded.
    return scalar_indexings_.count(field_id);
}
int64_t
SegmentSealedImpl::num_chunk_data(FieldId field_id) const {
    // Number of raw-data chunks held in the insert record for this field.
    auto* base = insert_record_.get_field_data_base(field_id);
    AssertInfo(base != nullptr, "null field data ptr");
    return base->num_chunk();
}
int64_t
@ -253,8 +295,6 @@ SegmentSealedImpl::chunk_data_impl(FieldId field_id, int64_t chunk_id) const {
const knowhere::Index*
SegmentSealedImpl::chunk_index_impl(FieldId field_id, int64_t chunk_id) const {
AssertInfo(chunk_id == 0, "Chunk_id is not equal to 0");
// TODO: support scalar index
auto ptr = scalar_indexings_.at(field_id).get();
AssertInfo(ptr, "Scalar index of " + std::to_string(field_id.get()) + " is null");
return ptr;
@ -308,7 +348,7 @@ SegmentSealedImpl::vector_search(int64_t vec_count,
auto& field_meta = schema_->operator[](field_id);
AssertInfo(field_meta.is_vector(), "The meta type of vector field is not vector type");
if (get_bit(vecindex_ready_bitset_, field_id)) {
if (get_bit(index_ready_bitset_, field_id)) {
AssertInfo(vector_indexings_.is_ready(field_id),
"vector indexes isn't ready for field " + std::to_string(field_id.get()));
query::SearchOnSealed(*schema_, vector_indexings_, search_info, query_data, query_count, bitset, output, id_);
@ -383,7 +423,7 @@ SegmentSealedImpl::DropIndex(const FieldId field_id) {
std::unique_lock lck(mutex_);
vector_indexings_.drop_field_indexing(field_id);
set_bit(vecindex_ready_bitset_, field_id, false);
set_bit(index_ready_bitset_, field_id, false);
}
void
@ -396,7 +436,7 @@ SegmentSealedImpl::check_search(const query::Plan* plan) const {
}
auto& request_fields = plan->extra_info_opt_.value().involved_fields_;
auto field_ready_bitset = field_data_ready_bitset_ | vecindex_ready_bitset_;
auto field_ready_bitset = field_data_ready_bitset_ | index_ready_bitset_;
AssertInfo(request_fields.size() == field_ready_bitset.size(),
"Request fields size not equal to field ready bitset size when check search");
auto absent_fields = request_fields - field_ready_bitset;
@ -412,7 +452,7 @@ SegmentSealedImpl::SegmentSealedImpl(SchemaPtr schema, int64_t segment_id)
: schema_(schema),
insert_record_(*schema, MAX_ROW_COUNT),
field_data_ready_bitset_(schema->size()),
vecindex_ready_bitset_(schema->size()),
index_ready_bitset_(schema->size()),
scalar_indexings_(schema->size()),
id_(segment_id) {
}
@ -509,13 +549,26 @@ SegmentSealedImpl::fill_with_empty(FieldId field_id, int64_t count) const {
std::unique_ptr<DataArray>
SegmentSealedImpl::bulk_subscript(FieldId field_id, const int64_t* seg_offsets, int64_t count) const {
if (!HasFieldData(field_id)) {
auto& field_meta = schema_->operator[](field_id);
// if count == 0, return empty data array
if (count == 0) {
return fill_with_empty(field_id, count);
}
if (HasIndex(field_id)) {
// if the field has a loaded scalar index, reverse the raw data from the index
if (!datatype_is_vector(field_meta.get_data_type())) {
AssertInfo(num_chunk() == 1, "num chunk not equal to 1 for sealed segment");
auto index = chunk_index_impl(field_id, 0);
return ReverseDataFromIndex(index, seg_offsets, count, field_meta);
}
// TODO: knowhere support reverse data from vector index
// Now, real data will be filled in data array using chunk manager
return fill_with_empty(field_id, count);
}
Assert(get_bit(field_data_ready_bitset_, field_id));
auto& field_meta = schema_->operator[](field_id);
auto field_data = insert_record_.get_field_data_base(field_id);
AssertInfo(field_data->num_chunk() == 1, std::string("num chunk not equal to 1 for sealed segment, num_chunk: ") +
std::to_string(field_data->num_chunk()));
@ -580,7 +633,7 @@ SegmentSealedImpl::HasIndex(FieldId field_id) const {
std::shared_lock lck(mutex_);
AssertInfo(!SystemProperty::Instance().IsSystem(field_id),
"Field id:" + std::to_string(field_id.get()) + " isn't one of system type when drop index");
return get_bit(vecindex_ready_bitset_, field_id);
return get_bit(index_ready_bitset_, field_id);
}
bool

View File

@ -64,6 +64,10 @@ class SegmentSealedImpl : public SegmentSealed {
int64_t
num_chunk_index(FieldId field_id) const override;
// count of chunk that has raw data
int64_t
num_chunk_data(FieldId field_id) const override;
int64_t
num_chunk() const override;
@ -168,7 +172,7 @@ class SegmentSealedImpl : public SegmentSealed {
private:
// segment loading state
BitsetType field_data_ready_bitset_;
BitsetType vecindex_ready_bitset_;
BitsetType index_ready_bitset_;
std::atomic<int> system_ready_count_ = 0;
// segment datas
@ -188,7 +192,6 @@ class SegmentSealedImpl : public SegmentSealed {
// pks to row offset
Pk2OffsetType pk2offset_;
// std::unique_ptr<ScalarIndexBase> primary_key_index_;
SchemaPtr schema_;
int64_t id_;

View File

@ -9,7 +9,8 @@
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "Utils.h"
#include "segcore/Utils.h"
#include "index/ScalarIndex.h"
namespace milvus::segcore {
@ -256,6 +257,115 @@ MergeDataArray(std::vector<std::pair<milvus::SearchResult*, int64_t>>& result_of
return data_array;
}
// TODO: split scalar IndexBase with knowhere::Index

// Helper for ReverseDataFromIndex: down-cast the generic knowhere index to
// ScalarIndex<T> and reverse-lookup the raw value for each segment offset.
// Asserts on a failed cast instead of dereferencing a null pointer, which
// would otherwise crash when the loaded index type mismatches the field type.
template <typename T>
static std::vector<T>
ReverseRawData(const knowhere::Index* index, const int64_t* seg_offsets, int64_t count) {
    auto ptr = dynamic_cast<const scalar::ScalarIndex<T>*>(index);
    AssertInfo(ptr != nullptr, "failed to cast index to ScalarIndex of the field data type");
    std::vector<T> raw_data(count);
    for (int64_t i = 0; i < count; ++i) {
        raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]);
    }
    return raw_data;
}

// Rebuild a DataArray of raw scalar values for `count` rows (addressed by
// `seg_offsets`) by reverse-looking them up in the field's scalar index.
// Supports all scalar data types; panics on vector/unsupported types.
std::unique_ptr<DataArray>
ReverseDataFromIndex(const knowhere::Index* index,
                     const int64_t* seg_offsets,
                     int64_t count,
                     const FieldMeta& field_meta) {
    auto data_type = field_meta.get_data_type();
    auto data_array = std::make_unique<DataArray>();
    data_array->set_field_id(field_meta.get_id().get());
    data_array->set_type(milvus::proto::schema::DataType(field_meta.get_data_type()));

    auto scalar_array = data_array->mutable_scalars();
    switch (data_type) {
        case DataType::BOOL: {
            auto raw_data = ReverseRawData<bool>(index, seg_offsets, count);
            auto obj = scalar_array->mutable_bool_data();
            *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
            break;
        }
        case DataType::INT8: {
            auto raw_data = ReverseRawData<int8_t>(index, seg_offsets, count);
            auto obj = scalar_array->mutable_int_data();
            *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
            break;
        }
        case DataType::INT16: {
            auto raw_data = ReverseRawData<int16_t>(index, seg_offsets, count);
            auto obj = scalar_array->mutable_int_data();
            *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
            break;
        }
        case DataType::INT32: {
            auto raw_data = ReverseRawData<int32_t>(index, seg_offsets, count);
            auto obj = scalar_array->mutable_int_data();
            *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
            break;
        }
        case DataType::INT64: {
            auto raw_data = ReverseRawData<int64_t>(index, seg_offsets, count);
            // 64-bit integers go into the dedicated long_data field
            auto obj = scalar_array->mutable_long_data();
            *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
            break;
        }
        case DataType::FLOAT: {
            auto raw_data = ReverseRawData<float>(index, seg_offsets, count);
            auto obj = scalar_array->mutable_float_data();
            *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
            break;
        }
        case DataType::DOUBLE: {
            auto raw_data = ReverseRawData<double>(index, seg_offsets, count);
            auto obj = scalar_array->mutable_double_data();
            *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
            break;
        }
        case DataType::VARCHAR: {
            auto raw_data = ReverseRawData<std::string>(index, seg_offsets, count);
            auto obj = scalar_array->mutable_string_data();
            *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
            break;
        }
        default: {
            PanicInfo("unsupported datatype");
        }
    }
    return data_array;
}
// insert_barrier means num row of insert data in a segment
// del_barrier means that if the pk of the insert data is in delete record[0 : del_barrier]
// then the data corresponding to this pk may be ignored when searching/querying

View File

@ -17,9 +17,10 @@
#include <vector>
#include <stdexcept>
#include <knowhere/common/MetricType.h>
#include "knowhere/index/Index.h"
#include "common/QueryResult.h"
#include "DeletedRecord.h"
#include "InsertRecord.h"
#include "segcore/DeletedRecord.h"
#include "segcore/InsertRecord.h"
namespace milvus::segcore {
@ -89,4 +90,10 @@ get_deleted_bitmap(int64_t del_barrier,
const Pk2OffsetType& pk2offset,
Timestamp query_timestamp);
std::unique_ptr<DataArray>
ReverseDataFromIndex(const knowhere::Index* index,
const int64_t* seg_offsets,
int64_t count,
const FieldMeta& field_meta);
} // namespace milvus::segcore

View File

@ -95,14 +95,14 @@ Search_Sealed(benchmark::State& state) {
auto dataset_ = DataGen(schema, N);
return dataset_;
}();
SealedLoader(dataset_, *segment);
SealedLoadFieldData(dataset_, *segment);
auto choice = state.range(0);
if (choice == 0) {
// Brute Force
} else if (choice == 1) {
// ivf
auto vec = dataset_.get_col<float>(milvus::FieldId(100));
auto indexing = GenIndexing(N, dim, vec.data());
auto indexing = GenVecIndexing(N, dim, vec.data());
LoadIndexInfo info;
info.index = indexing;
info.field_id = (*schema)[FieldName("fakevec")].get_id().get();

View File

@ -66,22 +66,6 @@ get_default_schema_config() {
return conf.c_str();
}
std::vector<char>
translate_text_plan_to_binary_plan(const char* text_plan) {
proto::plan::PlanNode plan_node;
auto ok = google::protobuf::TextFormat::ParseFromString(text_plan, &plan_node);
AssertInfo(ok, "Failed to parse");
std::string binary_plan;
plan_node.SerializeToString(&binary_plan);
std::vector<char> ret;
ret.resize(binary_plan.size());
std::memcpy(ret.data(), binary_plan.c_str(), binary_plan.size());
return ret;
}
auto
generate_data(int N) {
std::vector<char> raw_data;
@ -1417,7 +1401,11 @@ TEST(CApiTest, Indexing_Without_Predicate) {
AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -1538,7 +1526,11 @@ TEST(CApiTest, Indexing_Expr_Without_Predicate) {
AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -1677,7 +1669,11 @@ TEST(CApiTest, Indexing_With_float_Predicate_Range) {
AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -1830,7 +1826,11 @@ TEST(CApiTest, Indexing_Expr_With_float_Predicate_Range) {
AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -1967,7 +1967,11 @@ TEST(CApiTest, Indexing_With_float_Predicate_Term) {
AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -2113,7 +2117,11 @@ TEST(CApiTest, Indexing_Expr_With_float_Predicate_Term) {
AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -2252,7 +2260,11 @@ TEST(CApiTest, Indexing_With_binary_Predicate_Range) {
AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -2404,7 +2416,11 @@ TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Range) {
AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -2543,7 +2559,11 @@ TEST(CApiTest, Indexing_With_binary_Predicate_Term) {
AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -2704,7 +2724,11 @@ TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Term) {
AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -2912,7 +2936,11 @@ TEST(CApiTest, SealedSegment_search_float_Predicate_Range) {
status = LoadFieldData(segment, c_ts_field_data);
assert(status.error_code == Success);
auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
// load index for vec field, load raw data for scalar field
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(FieldId(100));
sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index =
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -3141,17 +3169,6 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) {
auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ);
// gen query dataset
auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
auto ids = result_on_index->Get<int64_t*>(knowhere::meta::IDS);
auto dis = result_on_index->Get<float*>(knowhere::meta::DISTANCE);
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
vec_dis.push_back(dis[j] * -1);
}
auto binary_set = indexing->Serialize(conf);
void* c_load_index_info = nullptr;
status = NewLoadIndexInfo(&c_load_index_info);
@ -3169,13 +3186,17 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) {
AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
AppendIndex(c_load_index_info, (CBinarySet)&binary_set);
// load vec index
auto load_index_info = (LoadIndexInfo*)c_load_index_info;
auto query_dataset2 = knowhere::GenDataset(num_queries, DIM, query_ptr);
auto index = std::dynamic_pointer_cast<knowhere::VecIndex>(load_index_info->index);
auto result_on_index2 = index->Query(query_dataset2, conf, nullptr);
auto ids2 = result_on_index2->Get<int64_t*>(knowhere::meta::IDS);
auto dis2 = result_on_index2->Get<float*>(knowhere::meta::DISTANCE);
status = UpdateSealedSegmentIndex(segment, c_load_index_info);
assert(status.error_code == Success);
// load raw data
auto c_counter_field_data = CLoadFieldDataInfo{
101,
counter_data.data(),
@ -3203,24 +3224,16 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) {
status = LoadFieldData(segment, c_ts_field_data);
assert(status.error_code == Success);
status = UpdateSealedSegmentIndex(segment, c_load_index_info);
assert(status.error_code == Success);
auto counter_index = GenScalarIndexing(N, counter_col.data());
auto counter_index_binary_set = counter_index->Serialize(conf);
CLoadIndexInfo counter_index_info = nullptr;
status = NewLoadIndexInfo(&counter_index_info);
assert(status.error_code == Success);
status = AppendFieldInfo(counter_index_info, 101, CDataType::Int64);
assert(status.error_code == Success);
std::string counter_index_type_key = "index_type";
std::string counter_index_type_value = "sort";
status = AppendIndexParam(counter_index_info, counter_index_type_key.c_str(), counter_index_type_value.c_str());
assert(status.error_code == Success);
status = AppendIndex(counter_index_info, (CBinarySet)&counter_index_binary_set);
assert(status.error_code == Success);
status = UpdateSealedSegmentIndex(segment, counter_index_info);
assert(status.error_code == Success);
// gen query dataset
auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
auto ids = result_on_index->Get<int64_t*>(knowhere::meta::IDS);
auto dis = result_on_index->Get<float*>(knowhere::meta::DISTANCE);
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
vec_dis.push_back(dis[j] * -1);
}
CSearchResult c_search_result_on_bigIndex;
auto res_after_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
@ -3239,3 +3252,157 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) {
DeleteCollection(collection);
DeleteSegment(segment);
}
// Sealed segment with ONLY scalar indexes loaded (no raw scalar columns):
// Retrieve() must reverse the raw values back out of each scalar index.
TEST(CApiTest, RetriveScalarFieldFromSealedSegmentWithIndex) {
    auto schema = std::make_shared<Schema>();
    auto i8_fid = schema->AddDebugField("age8", DataType::INT8);
    auto i16_fid = schema->AddDebugField("age16", DataType::INT16);
    auto i32_fid = schema->AddDebugField("age32", DataType::INT32);
    auto i64_fid = schema->AddDebugField("age64", DataType::INT64);
    auto float_fid = schema->AddDebugField("age_float", DataType::FLOAT);
    auto double_fid = schema->AddDebugField("age_double", DataType::DOUBLE);
    schema->set_primary_field_id(i64_fid);
    auto segment = CreateSealedSegment(schema).release();

    int N = ROW_COUNT;
    auto raw_data = DataGen(schema, N);
    LoadIndexInfo load_index_info;

    // load timestamp field (system field, required before retrieval)
    FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
    auto ts_array = CreateScalarDataArrayFrom(raw_data.timestamps_.data(), N, ts_field_meta);
    auto ts_data = serialize(ts_array.get());
    auto load_info = CLoadFieldDataInfo{TimestampFieldID.get(), ts_data.data(), ts_data.size(), N};
    auto res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    auto count = GetRowCount(segment);
    assert(count == N);

    // load rowid field (system field)
    FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64);
    auto row_id_array = CreateScalarDataArrayFrom(raw_data.row_ids_.data(), N, row_id_field_meta);
    auto row_id_data = serialize(row_id_array.get());
    load_info = CLoadFieldDataInfo{RowFieldID.get(), row_id_data.data(), row_id_data.size(), N};
    res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    count = GetRowCount(segment);
    assert(count == N);

    // load index for int8 field
    auto age8_col = raw_data.get_col<int8_t>(i8_fid);
    auto age8_index = milvus::scalar::CreateScalarIndexSort<int8_t>();
    age8_index->Build(N, age8_col.data());
    load_index_info.field_id = i8_fid.get();
    load_index_info.field_type = Int8;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int8_t>>(age8_index.release());
    segment->LoadIndex(load_index_info);

    // load index for int16 field
    auto age16_col = raw_data.get_col<int16_t>(i16_fid);
    auto age16_index = milvus::scalar::CreateScalarIndexSort<int16_t>();
    age16_index->Build(N, age16_col.data());
    load_index_info.field_id = i16_fid.get();
    load_index_info.field_type = Int16;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int16_t>>(age16_index.release());
    segment->LoadIndex(load_index_info);

    // load index for int32 field
    auto age32_col = raw_data.get_col<int32_t>(i32_fid);
    auto age32_index = milvus::scalar::CreateScalarIndexSort<int32_t>();
    age32_index->Build(N, age32_col.data());
    load_index_info.field_id = i32_fid.get();
    load_index_info.field_type = Int32;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int32_t>>(age32_index.release());
    segment->LoadIndex(load_index_info);

    // load index for int64 field
    auto age64_col = raw_data.get_col<int64_t>(i64_fid);
    auto age64_index = milvus::scalar::CreateScalarIndexSort<int64_t>();
    age64_index->Build(N, age64_col.data());
    load_index_info.field_id = i64_fid.get();
    load_index_info.field_type = Int64;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int64_t>>(age64_index.release());
    segment->LoadIndex(load_index_info);

    // load index for float field
    auto age_float_col = raw_data.get_col<float>(float_fid);
    auto age_float_index = milvus::scalar::CreateScalarIndexSort<float>();
    age_float_index->Build(N, age_float_col.data());
    load_index_info.field_id = float_fid.get();
    load_index_info.field_type = Float;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<float>>(age_float_index.release());
    segment->LoadIndex(load_index_info);

    // load index for double field
    auto age_double_col = raw_data.get_col<double>(double_fid);
    auto age_double_index = milvus::scalar::CreateScalarIndexSort<double>();
    age_double_index->Build(N, age_double_col.data());
    load_index_info.field_id = double_fid.get();
    // fixed: was `Float` (copy-paste), which mismatched the double field
    load_index_info.field_type = Double;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<double>>(age_double_index.release());
    segment->LoadIndex(load_index_info);

    // create retrieve plan: select row 0 by its primary key value
    auto plan = std::make_unique<query::RetrievePlan>(*schema);
    plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
    std::vector<int64_t> retrieve_row_ids = {age64_col[0]};
    auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(i64_fid, DataType::INT64, retrieve_row_ids);
    plan->plan_node_->predicate_ = std::move(term_expr);
    std::vector<FieldId> target_field_ids;

    // retrieve value
    target_field_ids = {i8_fid, i16_fid, i32_fid, i64_fid, float_fid, double_fid};
    plan->field_ids_ = target_field_ids;

    CRetrieveResult retrieve_result;
    res = Retrieve(segment, plan.get(), raw_data.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    auto query_result = std::make_unique<proto::segcore::RetrieveResults>();
    auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->fields_data().size(), 6);

    // every retrieved value must equal the generated raw value for row 0,
    // proving the reverse-lookup through each index is faithful
    auto fields_data = query_result->fields_data();
    for (auto iter = fields_data.begin(); iter < fields_data.end(); ++iter) {
        switch (iter->type()) {
            case proto::schema::DataType::Int8: {
                ASSERT_EQ(iter->scalars().int_data().data(0), age8_col[0]);
                break;
            }
            case proto::schema::DataType::Int16: {
                ASSERT_EQ(iter->scalars().int_data().data(0), age16_col[0]);
                break;
            }
            case proto::schema::DataType::Int32: {
                ASSERT_EQ(iter->scalars().int_data().data(0), age32_col[0]);
                break;
            }
            case proto::schema::DataType::Int64: {
                ASSERT_EQ(iter->scalars().long_data().data(0), age64_col[0]);
                break;
            }
            case proto::schema::DataType::Float: {
                ASSERT_EQ(iter->scalars().float_data().data(0), age_float_col[0]);
                break;
            }
            case proto::schema::DataType::Double: {
                ASSERT_EQ(iter->scalars().double_data().data(0), age_double_col[0]);
                break;
            }
            default: {
                PanicInfo("not supported type");
            }
        }
    }

    DeleteRetrievePlan(plan.release());
    DeleteRetrieveResult(&retrieve_result);
    DeleteSegment(segment);
}

View File

@ -20,6 +20,7 @@
#include "query/generated/ExecExprVisitor.h"
#include "segcore/SegmentGrowingImpl.h"
#include "test_utils/DataGen.h"
#include "index/IndexFactory.h"
using namespace milvus;
@ -583,6 +584,183 @@ TEST(Expr, TestCompare) {
}
}
// Evaluates a Compare expression (int32 column <op> int64 column) on a sealed
// segment where BOTH columns are served from scalar sort indexes instead of
// raw column data, and checks every row against a reference predicate.
TEST(Expr, TestCompareWithScalarIndex) {
    using namespace milvus::query;
    using namespace milvus::segcore;
    // (operator name used in the plan text, reference implementation of it)
    std::vector<std::tuple<std::string, std::function<bool(int, int64_t)>>> testcases = {
        {R"(LessThan)", [](int a, int64_t b) { return a < b; }},
        {R"(LessEqual)", [](int a, int64_t b) { return a <= b; }},
        {R"(GreaterThan)", [](int a, int64_t b) { return a > b; }},
        {R"(GreaterEqual)", [](int a, int64_t b) { return a >= b; }},
        {R"(Equal)", [](int a, int64_t b) { return a == b; }},
        {R"(NotEqual)", [](int a, int64_t b) { return a != b; }},
    };

    // Text-format plan template; the %N% placeholders (field ids, data types
    // and compare op) are substituted per testcase via boost::format below.
    std::string serialized_expr_plan = R"(vector_anns: <
                                            field_id: %1%
                                            predicates: <
                                              compare_expr: <
                                                left_column_info: <
                                                  field_id: %3%
                                                  data_type: %4%
                                                >
                                                right_column_info: <
                                                  field_id: %5%
                                                  data_type: %6%
                                                >
                                                op: %2%
                                              >
                                            >
                                            query_info: <
                                              topk: 10
                                              round_decimal: 3
                                              metric_type: "L2"
                                              search_params: "{\"nprobe\": 10}"
                                            >
                                            placeholder_tag: "$0"
     >)";

    auto schema = std::make_shared<Schema>();
    auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
    auto i32_fid = schema->AddDebugField("age32", DataType::INT32);
    auto i64_fid = schema->AddDebugField("age64", DataType::INT64);
    schema->set_primary_field_id(i64_fid);

    auto seg = CreateSealedSegment(schema);
    int N = 1000;
    auto raw_data = DataGen(schema, N);
    LoadIndexInfo load_index_info;

    // load index for int32 field
    auto age32_col = raw_data.get_col<int32_t>(i32_fid);
    age32_col[0] = 1000;  // pin row 0 to a known value
    GenScalarIndexing(N, age32_col.data());  // NOTE(review): return value discarded — presumably dead; confirm
    auto age32_index = milvus::scalar::CreateScalarIndexSort<int32_t>();
    age32_index->Build(N, age32_col.data());
    load_index_info.field_id = i32_fid.get();
    load_index_info.field_type = Int32;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int32_t>>(age32_index.release());
    seg->LoadIndex(load_index_info);

    // load index for int64 field
    auto age64_col = raw_data.get_col<int64_t>(i64_fid);
    age64_col[0] = 2000;  // pin row 0 to a known value
    GenScalarIndexing(N, age64_col.data());  // NOTE(review): return value discarded — presumably dead; confirm
    auto age64_index = milvus::scalar::CreateScalarIndexSort<int64_t>();
    age64_index->Build(N, age64_col.data());
    load_index_info.field_id = i64_fid.get();
    load_index_info.field_type = Int64;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int64_t>>(age64_index.release());
    seg->LoadIndex(load_index_info);

    ExecExprVisitor visitor(*seg, seg->get_row_count(), MAX_TIMESTAMP);
    for (auto [clause, ref_func] : testcases) {
        auto dsl_string = boost::format(serialized_expr_plan) % vec_fid.get() % clause % i32_fid.get() %
                          proto::schema::DataType_Name(int(DataType::INT32)) % i64_fid.get() %
                          proto::schema::DataType_Name(int(DataType::INT64));
        auto binary_plan = translate_text_plan_to_binary_plan(dsl_string.str().data());
        auto plan = CreateSearchPlanByExpr(*schema, binary_plan.data(), binary_plan.size());
        //        std::cout << ShowPlanNodeVisitor().call_child(*plan->plan_node_) << std::endl;
        auto final = visitor.call_child(*plan->plan_node_->predicate_.value());
        EXPECT_EQ(final.size(), N);

        // each bit of the result bitset must match the reference predicate
        for (int i = 0; i < N; ++i) {
            auto ans = final[i];
            auto val1 = age32_col[i];
            auto val2 = age64_col[i];
            auto ref = ref_func(val1, val2);
            ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << boost::format("[%1%, %2%]") % val1 % val2;
        }
    }
}
// Same as TestCompareWithScalarIndex but for VARCHAR columns: a Compare
// expression (string1 <op> string2) is evaluated on a sealed segment whose
// two string columns are served from scalar sort indexes.
TEST(Expr, TestCompareWithScalarIndexMaris) {
    using namespace milvus::query;
    using namespace milvus::segcore;
    // (operator name used in the plan text, reference implementation of it)
    std::vector<std::tuple<std::string, std::function<bool(std::string, std::string)>>> testcases = {
        {R"(LessThan)", [](std::string a, std::string b) { return a.compare(b) < 0; }},
        {R"(LessEqual)", [](std::string a, std::string b) { return a.compare(b) <= 0; }},
        {R"(GreaterThan)", [](std::string a, std::string b) { return a.compare(b) > 0; }},
        {R"(GreaterEqual)", [](std::string a, std::string b) { return a.compare(b) >= 0; }},
        {R"(Equal)", [](std::string a, std::string b) { return a.compare(b) == 0; }},
        {R"(NotEqual)", [](std::string a, std::string b) { return a.compare(b) != 0; }},
    };

    // Text-format plan template; %N% placeholders are filled per testcase
    // (vector field id, compare op, the two string field ids) below.
    const char* serialized_expr_plan = R"(vector_anns: <
                                            field_id: %1%
                                            predicates: <
                                              compare_expr: <
                                                left_column_info: <
                                                  field_id: %3%
                                                  data_type: VarChar
                                                >
                                                right_column_info: <
                                                  field_id: %4%
                                                  data_type: VarChar
                                                >
                                                op: %2%
                                              >
                                            >
                                            query_info: <
                                              topk: 10
                                              round_decimal: 3
                                              metric_type: "L2"
                                              search_params: "{\"nprobe\": 10}"
                                            >
                                            placeholder_tag: "$0"
     >)";

    auto schema = std::make_shared<Schema>();
    auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
    auto str1_fid = schema->AddDebugField("string1", DataType::VARCHAR);
    auto str2_fid = schema->AddDebugField("string2", DataType::VARCHAR);
    schema->set_primary_field_id(str1_fid);

    auto seg = CreateSealedSegment(schema);
    int N = 1000;
    auto raw_data = DataGen(schema, N);
    LoadIndexInfo load_index_info;

    // load index for string1 (VARCHAR) field
    auto str1_col = raw_data.get_col<std::string>(str1_fid);
    GenScalarIndexing(N, str1_col.data());  // NOTE(review): return value discarded — presumably dead; confirm
    auto str1_index = milvus::scalar::CreateScalarIndexSort<std::string>();
    str1_index->Build(N, str1_col.data());
    load_index_info.field_id = str1_fid.get();
    load_index_info.field_type = VarChar;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<std::string>>(str1_index.release());
    seg->LoadIndex(load_index_info);

    // load index for string2 (VARCHAR) field
    auto str2_col = raw_data.get_col<std::string>(str2_fid);
    GenScalarIndexing(N, str2_col.data());  // NOTE(review): return value discarded — presumably dead; confirm
    auto str2_index = milvus::scalar::CreateScalarIndexSort<std::string>();
    str2_index->Build(N, str2_col.data());
    load_index_info.field_id = str2_fid.get();
    load_index_info.field_type = VarChar;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<std::string>>(str2_index.release());
    seg->LoadIndex(load_index_info);

    ExecExprVisitor visitor(*seg, seg->get_row_count(), MAX_TIMESTAMP);
    for (auto [clause, ref_func] : testcases) {
        auto dsl_string =
            boost::format(serialized_expr_plan) % vec_fid.get() % clause % str1_fid.get() % str2_fid.get();
        auto binary_plan = translate_text_plan_to_binary_plan(dsl_string.str().data());
        auto plan = CreateSearchPlanByExpr(*schema, binary_plan.data(), binary_plan.size());
        //        std::cout << ShowPlanNodeVisitor().call_child(*plan->plan_node_) << std::endl;
        auto final = visitor.call_child(*plan->plan_node_->predicate_.value());
        EXPECT_EQ(final.size(), N);

        // each bit of the result bitset must match the reference predicate
        for (int i = 0; i < N; ++i) {
            auto ans = final[i];
            auto val1 = str1_col[i];
            auto val2 = str2_col[i];
            auto ref = ref_func(val1, val2);
            ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << boost::format("[%1%, %2%]") % val1 % val2;
        }
    }
}
TEST(Expr, TestBinaryArithOpEvalRange) {
using namespace milvus::query;
using namespace milvus::segcore;
@ -960,3 +1138,302 @@ TEST(Expr, TestBinaryArithOpEvalRangeExceptions) {
}
}
}
TEST(Expr, TestBinaryArithOpEvalRangeWithScalarSortIndex) {
    using namespace milvus::query;
    using namespace milvus::segcore;
    // Each entry: the BinaryArithOpEvalRangeExpr fragment (proto text format),
    // a reference predicate that mirrors that fragment, and the data type of
    // the column it targets. The reference is evaluated against the raw column
    // and must agree with the index-backed expression evaluation.
    std::vector<std::tuple<std::string, std::function<bool(int)>, DataType>> testcases = {
        // Add test cases for BinaryArithOpEvalRangeExpr EQ of various data types
        {R"(arith_op: Add
            right_operand: <
                int64_val: 4
            >
            op: Equal
            value: <
                int64_val: 8
            >)",
         [](int8_t v) { return (v + 4) == 8; }, DataType::INT8},
        {R"(arith_op: Sub
            right_operand: <
                int64_val: 500
            >
            op: Equal
            value: <
                int64_val: 1500
            >)",
         [](int16_t v) { return (v - 500) == 1500; }, DataType::INT16},
        {R"(arith_op: Mul
            right_operand: <
                int64_val: 2
            >
            op: Equal
            value: <
                int64_val: 4000
            >)",
         [](int32_t v) { return (v * 2) == 4000; }, DataType::INT32},
        {R"(arith_op: Div
            right_operand: <
                int64_val: 2
            >
            op: Equal
            value: <
                int64_val: 1000
            >)",
         [](int64_t v) { return (v / 2) == 1000; }, DataType::INT64},
        {R"(arith_op: Mod
            right_operand: <
                int64_val: 100
            >
            op: Equal
            value: <
                int64_val: 0
            >)",
         [](int32_t v) { return (v % 100) == 0; }, DataType::INT32},
        {R"(arith_op: Add
            right_operand: <
                float_val: 500
            >
            op: Equal
            value: <
                float_val: 2500
            >)",
         [](float v) { return (v + 500) == 2500; }, DataType::FLOAT},
        {R"(arith_op: Add
            right_operand: <
                float_val: 500
            >
            op: Equal
            value: <
                float_val: 2500
            >)",
         [](double v) { return (v + 500) == 2500; }, DataType::DOUBLE},
        {R"(arith_op: Add
            right_operand: <
                float_val: 500
            >
            op: NotEqual
            value: <
                float_val: 2000
            >)",
         [](float v) { return (v + 500) != 2000; }, DataType::FLOAT},
        {R"(arith_op: Sub
            right_operand: <
                float_val: 500
            >
            op: NotEqual
            value: <
                float_val: 2500
            >)",
         // NOTE(review): reference fixed to compare against 2500 so it mirrors
         // the serialized plan above (it previously checked != 2000).
         [](double v) { return (v - 500) != 2500; }, DataType::DOUBLE},
        {R"(arith_op: Mul
            right_operand: <
                int64_val: 2
            >
            op: NotEqual
            value: <
                int64_val: 2
            >)",
         [](int8_t v) { return (v * 2) != 2; }, DataType::INT8},
        {R"(arith_op: Div
            right_operand: <
                int64_val: 2
            >
            op: NotEqual
            value: <
                int64_val: 2000
            >)",
         [](int16_t v) { return (v / 2) != 2000; }, DataType::INT16},
        {R"(arith_op: Mod
            right_operand: <
                int64_val: 100
            >
            op: NotEqual
            value: <
                int64_val: 1
            >)",
         [](int32_t v) { return (v % 100) != 1; }, DataType::INT32},
        {R"(arith_op: Add
            right_operand: <
                int64_val: 500
            >
            op: NotEqual
            value: <
                int64_val: 2000
            >)",
         [](int64_t v) { return (v + 500) != 2000; }, DataType::INT64},
    };

    // "@@@@@" is replaced by the column-qualified arith expression below,
    // whose own "@@@@" placeholder is in turn replaced by each test clause.
    std::string serialized_expr_plan = R"(vector_anns: <
        field_id: %1%
        predicates: <
            binary_arith_op_eval_range_expr: <
                @@@@@
            >
        >
        query_info: <
            topk: 10
            round_decimal: 3
            metric_type: "L2"
            search_params: "{\"nprobe\": 10}"
        >
        placeholder_tag: "$0"
    >)";

    std::string arith_expr = R"(
    column_info: <
        field_id: %2%
        data_type: %3%
    >
    @@@@)";

    auto schema = std::make_shared<Schema>();
    auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
    auto i8_fid = schema->AddDebugField("age8", DataType::INT8);
    auto i16_fid = schema->AddDebugField("age16", DataType::INT16);
    auto i32_fid = schema->AddDebugField("age32", DataType::INT32);
    auto i64_fid = schema->AddDebugField("age64", DataType::INT64);
    auto float_fid = schema->AddDebugField("age_float", DataType::FLOAT);
    auto double_fid = schema->AddDebugField("age_double", DataType::DOUBLE);
    schema->set_primary_field_id(i64_fid);

    auto seg = CreateSealedSegment(schema);
    int N = 1000;
    auto raw_data = DataGen(schema, N);
    LoadIndexInfo load_index_info;

    // load index for int8 field
    auto age8_col = raw_data.get_col<int8_t>(i8_fid);
    age8_col[0] = 4;
    GenScalarIndexing(N, age8_col.data());
    auto age8_index = milvus::scalar::CreateScalarIndexSort<int8_t>();
    age8_index->Build(N, age8_col.data());
    load_index_info.field_id = i8_fid.get();
    load_index_info.field_type = Int8;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int8_t>>(age8_index.release());
    seg->LoadIndex(load_index_info);

    // load index for int16 field
    auto age16_col = raw_data.get_col<int16_t>(i16_fid);
    age16_col[0] = 2000;
    GenScalarIndexing(N, age16_col.data());
    auto age16_index = milvus::scalar::CreateScalarIndexSort<int16_t>();
    age16_index->Build(N, age16_col.data());
    load_index_info.field_id = i16_fid.get();
    load_index_info.field_type = Int16;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int16_t>>(age16_index.release());
    seg->LoadIndex(load_index_info);

    // load index for int32 field
    auto age32_col = raw_data.get_col<int32_t>(i32_fid);
    age32_col[0] = 2000;
    GenScalarIndexing(N, age32_col.data());
    auto age32_index = milvus::scalar::CreateScalarIndexSort<int32_t>();
    age32_index->Build(N, age32_col.data());
    load_index_info.field_id = i32_fid.get();
    load_index_info.field_type = Int32;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int32_t>>(age32_index.release());
    seg->LoadIndex(load_index_info);

    // load index for int64 field
    auto age64_col = raw_data.get_col<int64_t>(i64_fid);
    age64_col[0] = 2000;
    GenScalarIndexing(N, age64_col.data());
    auto age64_index = milvus::scalar::CreateScalarIndexSort<int64_t>();
    age64_index->Build(N, age64_col.data());
    load_index_info.field_id = i64_fid.get();
    load_index_info.field_type = Int64;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int64_t>>(age64_index.release());
    seg->LoadIndex(load_index_info);

    // load index for float field
    auto age_float_col = raw_data.get_col<float>(float_fid);
    age_float_col[0] = 2000;
    GenScalarIndexing(N, age_float_col.data());
    auto age_float_index = milvus::scalar::CreateScalarIndexSort<float>();
    age_float_index->Build(N, age_float_col.data());
    load_index_info.field_id = float_fid.get();
    load_index_info.field_type = Float;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<float>>(age_float_index.release());
    seg->LoadIndex(load_index_info);

    // load index for double field
    auto age_double_col = raw_data.get_col<double>(double_fid);
    age_double_col[0] = 2000;
    GenScalarIndexing(N, age_double_col.data());
    auto age_double_index = milvus::scalar::CreateScalarIndexSort<double>();
    age_double_index->Build(N, age_double_col.data());
    load_index_info.field_id = double_fid.get();
    // was `Float`: a double column must be registered with the Double type
    load_index_info.field_type = Double;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<double>>(age_double_index.release());
    seg->LoadIndex(load_index_info);

    auto seg_promote = dynamic_cast<SegmentSealedImpl*>(seg.get());
    ExecExprVisitor visitor(*seg_promote, seg_promote->get_row_count(), MAX_TIMESTAMP);
    for (auto [clause, ref_func, dtype] : testcases) {
        auto loc = serialized_expr_plan.find("@@@@@");
        auto expr_plan = serialized_expr_plan;
        expr_plan.replace(loc, 5, arith_expr);
        loc = expr_plan.find("@@@@");
        expr_plan.replace(loc, 4, clause);
        boost::format expr;
        if (dtype == DataType::INT8) {
            expr = boost::format(expr_plan) % vec_fid.get() % i8_fid.get() %
                   proto::schema::DataType_Name(int(DataType::INT8));
        } else if (dtype == DataType::INT16) {
            expr = boost::format(expr_plan) % vec_fid.get() % i16_fid.get() %
                   proto::schema::DataType_Name(int(DataType::INT16));
        } else if (dtype == DataType::INT32) {
            expr = boost::format(expr_plan) % vec_fid.get() % i32_fid.get() %
                   proto::schema::DataType_Name(int(DataType::INT32));
        } else if (dtype == DataType::INT64) {
            expr = boost::format(expr_plan) % vec_fid.get() % i64_fid.get() %
                   proto::schema::DataType_Name(int(DataType::INT64));
        } else if (dtype == DataType::FLOAT) {
            expr = boost::format(expr_plan) % vec_fid.get() % float_fid.get() %
                   proto::schema::DataType_Name(int(DataType::FLOAT));
        } else if (dtype == DataType::DOUBLE) {
            expr = boost::format(expr_plan) % vec_fid.get() % double_fid.get() %
                   proto::schema::DataType_Name(int(DataType::DOUBLE));
        } else {
            ASSERT_TRUE(false) << "No test case defined for this data type";
        }
        auto binary_plan = translate_text_plan_to_binary_plan(expr.str().data());
        auto plan = CreateSearchPlanByExpr(*schema, binary_plan.data(), binary_plan.size());
        auto final = visitor.call_child(*plan->plan_node_->predicate_.value());
        EXPECT_EQ(final.size(), N);
        // Compare the index-backed evaluation against the raw-column reference.
        for (int i = 0; i < N; ++i) {
            auto ans = final[i];
            if (dtype == DataType::INT8) {
                auto val = age8_col[i];
                auto ref = ref_func(val);
                ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
            } else if (dtype == DataType::INT16) {
                auto val = age16_col[i];
                auto ref = ref_func(val);
                ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
            } else if (dtype == DataType::INT32) {
                auto val = age32_col[i];
                auto ref = ref_func(val);
                ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
            } else if (dtype == DataType::INT64) {
                auto val = age64_col[i];
                auto ref = ref_func(val);
                ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
            } else if (dtype == DataType::FLOAT) {
                auto val = age_float_col[i];
                auto ref = ref_func(val);
                ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
            } else if (dtype == DataType::DOUBLE) {
                auto val = age_double_col[i];
                auto ref = ref_func(val);
                ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
            } else {
                ASSERT_TRUE(false) << "No test case defined for this data type";
            }
        }
    }
}

View File

@ -660,8 +660,8 @@ TEST(Query, FillSegment) {
}());
segments.emplace_back([&] {
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
// auto indexing = GenIndexing(N, dim, std_vfloat_vec.data());
SealedLoadFieldData(dataset, *segment);
// auto indexing = GenVecIndexing(N, dim, std_vfloat_vec.data());
// LoadIndexInfo info;
// auto field_offset = schema->get_offset(FieldName("fakevec"));

View File

@ -57,7 +57,7 @@ TEST(Retrieve, AutoID) {
auto dataset = DataGen(schema, N);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
SealedLoadFieldData(dataset, *segment);
auto i64_col = dataset.get_col<int64_t>(fid_64);
auto plan = std::make_unique<query::RetrievePlan>(*schema);
@ -107,7 +107,7 @@ TEST(Retrieve, AutoID2) {
auto dataset = DataGen(schema, N);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
SealedLoadFieldData(dataset, *segment);
auto i64_col = dataset.get_col<int64_t>(fid_64);
auto plan = std::make_unique<query::RetrievePlan>(*schema);
@ -153,7 +153,7 @@ TEST(Retrieve, NotExist) {
auto dataset = DataGen(schema, N);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
SealedLoadFieldData(dataset, *segment);
auto i64_col = dataset.get_col<int64_t>(fid_64);
auto plan = std::make_unique<query::RetrievePlan>(*schema);
@ -236,7 +236,7 @@ TEST(Retrieve, LargeTimestamp) {
uint64_t ts_offset = 100;
auto dataset = DataGen(schema, N, 42, ts_offset + 1);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
SealedLoadFieldData(dataset, *segment);
auto i64_col = dataset.get_col<int64_t>(fid_64);
auto plan = std::make_unique<query::RetrievePlan>(*schema);
@ -285,7 +285,7 @@ TEST(Retrieve, Delete) {
auto dataset = DataGen(schema, N);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
SealedLoadFieldData(dataset, *segment);
auto i64_col = dataset.get_col<int64_t>(fid_64);
auto plan = std::make_unique<query::RetrievePlan>(*schema);

View File

@ -88,6 +88,18 @@ TYPED_TEST_P(TypedScalarIndexTest, NotIn) {
}
}
TYPED_TEST_P(TypedScalarIndexTest, Reverse) {
    using T = TypeParam;
    // For every supported index type, build from a generated array and verify
    // that Reverse_Lookup(offset) returns the value stored at that offset.
    // (Removed an unused local that queried the milvus DType of T.)
    auto index_types = GetIndexTypes<T>();
    for (const auto& index_type : index_types) {
        auto index = milvus::scalar::IndexFactory::GetInstance().CreateIndex<T>(index_type);
        auto arr = GenArr<T>(nb);
        index->Build(nb, arr.data());
        assert_reverse<T>(index, arr);
    }
}
TYPED_TEST_P(TypedScalarIndexTest, Range) {
using T = TypeParam;
auto dtype = milvus::GetDType<T>();
@ -123,6 +135,6 @@ TYPED_TEST_P(TypedScalarIndexTest, Codec) {
// TODO: it's easy to overflow for int8_t. Design more reasonable ut.
using ScalarT = ::testing::Types<int8_t, int16_t, int32_t, int64_t, float, double>;
REGISTER_TYPED_TEST_CASE_P(TypedScalarIndexTest, Dummy, Constructor, Count, In, NotIn, Range, Codec);
REGISTER_TYPED_TEST_CASE_P(TypedScalarIndexTest, Dummy, Constructor, Count, In, NotIn, Range, Codec, Reverse);
INSTANTIATE_TYPED_TEST_CASE_P(ArithmeticCheck, TypedScalarIndexTest, ScalarT);

View File

@ -110,7 +110,11 @@ TEST(Sealed, without_predicate) {
load_info.index = indexing;
load_info.index_params["metric_type"] = "L2";
auto sealed_segment = SealedCreator(schema, dataset, load_info);
// load index for vec field, load raw data for scalar filed
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(fake_id);
sealed_segment->LoadIndex(load_info);
sr = sealed_segment->Search(plan.get(), *ph_group, time);
auto post_result = SearchResultToJson(*sr);
@ -202,7 +206,11 @@ TEST(Sealed, with_predicate) {
load_info.index = indexing;
load_info.index_params["metric_type"] = "L2";
auto sealed_segment = SealedCreator(schema, dataset, load_info);
// load index for vec field, load raw data for scalar filed
auto sealed_segment = SealedCreator(schema, dataset);
sealed_segment->DropFieldData(fake_id);
sealed_segment->LoadIndex(load_info);
sr = sealed_segment->Search(plan.get(), *ph_group, time);
for (int i = 0; i < num_queries; ++i) {
@ -229,7 +237,7 @@ TEST(Sealed, LoadFieldData) {
auto fakevec = dataset.get_col<float>(fakevec_id);
auto indexing = GenIndexing(N, dim, fakevec.data());
auto indexing = GenVecIndexing(N, dim, fakevec.data());
auto segment = CreateSealedSegment(schema);
std::string dsl = R"({
@ -268,7 +276,7 @@ TEST(Sealed, LoadFieldData) {
ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time));
SealedLoader(dataset, *segment);
SealedLoadFieldData(dataset, *segment);
segment->DropFieldData(nothing_id);
segment->Search(plan.get(), *ph_group, time);
@ -282,8 +290,8 @@ TEST(Sealed, LoadFieldData) {
segment->LoadIndex(vec_info);
ASSERT_EQ(segment->num_chunk(), 1);
ASSERT_EQ(segment->num_chunk_index(double_id), 1);
ASSERT_EQ(segment->num_chunk_index(str_id), 1);
ASSERT_EQ(segment->num_chunk_index(double_id), 0);
ASSERT_EQ(segment->num_chunk_index(str_id), 0);
auto chunk_span1 = segment->chunk_data<int64_t>(counter_id, 0);
auto chunk_span2 = segment->chunk_data<double>(double_id, 0);
auto chunk_span3 = segment->chunk_data<std::string>(str_id, 0);
@ -349,7 +357,7 @@ TEST(Sealed, LoadScalarIndex) {
auto fakevec = dataset.get_col<float>(fakevec_id);
auto indexing = GenIndexing(N, dim, fakevec.data());
auto indexing = GenVecIndexing(N, dim, fakevec.data());
auto segment = CreateSealedSegment(schema);
std::string dsl = R"({
@ -386,7 +394,21 @@ TEST(Sealed, LoadScalarIndex) {
auto ph_group_raw = CreatePlaceholderGroup(num_queries, 16, 1024);
auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
SealedLoader(dataset, *segment);
LoadFieldDataInfo row_id_info;
FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64);
auto array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta);
row_id_info.field_data = array.release();
row_id_info.row_count = dataset.row_ids_.size();
row_id_info.field_id = RowFieldID.get(); // field id for RowId
segment->LoadFieldData(row_id_info);
LoadFieldDataInfo ts_info;
FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta);
ts_info.field_data = array.release();
ts_info.row_count = dataset.timestamps_.size();
ts_info.field_id = TimestampFieldID.get();
segment->LoadFieldData(ts_info);
LoadIndexInfo vec_info;
vec_info.field_id = fakevec_id.get();
@ -477,7 +499,7 @@ TEST(Sealed, Delete) {
ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time));
SealedLoader(dataset, *segment);
SealedLoadFieldData(dataset, *segment);
int64_t row_count = 5;
std::vector<idx_t> pks{1, 2, 3, 4, 5};

View File

@ -87,10 +87,52 @@ TEST_F(StringIndexMarisaTest, NotIn) {
// Builds a marisa string index over nb single-digit strings and checks that
// Range() returns a complete bitset for bounds covering the whole value set.
TEST_F(StringIndexMarisaTest, Range) {
    auto index = milvus::scalar::CreateStringIndexMarisa();
    // All values are single digits "0".."9": every string is >= "0", <= "9",
    // and lexicographically below "90".
    std::vector<std::string> strings(nb);
    for (int i = 0; i < nb; ++i) {
        strings[i] = std::to_string(std::rand() % 10);
    }
    *str_arr.mutable_data() = {strings.begin(), strings.end()};
    str_ds = GenDsFromPB(str_arr);
    index->BuildWithDataset(str_ds);
    // NOTE(review): these two ASSERT_ANY_THROW lines look inconsistent with
    // the successful Range() calls just below on the same built index —
    // possibly stale remnants of an earlier version of this test; confirm
    // they are still expected to throw.
    ASSERT_ANY_THROW(index->Range("not important", milvus::OpType::LessEqual));
    ASSERT_ANY_THROW(index->Range("not important", true, "not important", true));
    {
        // every value >= "0"
        auto bitset = index->Range("0", milvus::OpType::GreaterEqual);
        ASSERT_EQ(bitset->size(), nb);
        ASSERT_EQ(bitset->count(), nb);
    }
    {
        // every single-digit string sorts below "90"
        auto bitset = index->Range("90", milvus::OpType::LessThan);
        ASSERT_EQ(bitset->size(), nb);
        ASSERT_EQ(bitset->count(), nb);
    }
    {
        // every value <= "9"
        auto bitset = index->Range("9", milvus::OpType::LessEqual);
        ASSERT_EQ(bitset->size(), nb);
        ASSERT_EQ(bitset->count(), nb);
    }
    {
        // closed interval ["0", "9"] covers everything
        auto bitset = index->Range("0", true, "9", true);
        ASSERT_EQ(bitset->size(), nb);
        ASSERT_EQ(bitset->count(), nb);
    }
    {
        // half-open interval ["0", "90") also covers everything
        auto bitset = index->Range("0", true, "90", false);
        ASSERT_EQ(bitset->size(), nb);
        ASSERT_EQ(bitset->count(), nb);
    }
}
TEST_F(StringIndexMarisaTest, Reverse) {
    // Build each available string index type from the fixture dataset and
    // verify that every offset maps back to the original string.
    for (const auto& type : GetIndexTypes<std::string>()) {
        auto idx = milvus::scalar::IndexFactory::GetInstance().CreateIndex<std::string>(type);
        idx->BuildWithDataset(str_ds);
        assert_reverse<std::string>(idx, strs);
    }
}
TEST_F(StringIndexMarisaTest, PrefixMatch) {
@ -126,18 +168,21 @@ TEST_F(StringIndexMarisaTest, Query) {
{
auto ds = std::make_shared<knowhere::Dataset>();
ds->Set<milvus::OpType>(milvus::scalar::OPERATOR_TYPE, milvus::OpType::GreaterEqual);
ds->Set<std::string>(milvus::scalar::RANGE_VALUE, "range");
ASSERT_ANY_THROW(index->Query(ds));
ds->Set<std::string>(milvus::scalar::RANGE_VALUE, "0");
auto bitset = index->Query(ds);
ASSERT_EQ(bitset->size(), strs.size());
ASSERT_EQ(bitset->count(), strs.size());
}
{
auto ds = std::make_shared<knowhere::Dataset>();
ds->Set<milvus::OpType>(milvus::scalar::OPERATOR_TYPE, milvus::OpType::Range);
ds->Set<std::string>(milvus::scalar::LOWER_BOUND_VALUE, "range");
ds->Set<std::string>(milvus::scalar::LOWER_BOUND_VALUE, "0");
ds->Set<std::string>(milvus::scalar::UPPER_BOUND_VALUE, "range");
ds->Set<bool>(milvus::scalar::LOWER_BOUND_INCLUSIVE, true);
ds->Set<bool>(milvus::scalar::UPPER_BOUND_INCLUSIVE, true);
ASSERT_ANY_THROW(index->Query(ds));
auto bitset = index->Query(ds);
ASSERT_TRUE(bitset->any());
}
{
@ -154,6 +199,12 @@ TEST_F(StringIndexMarisaTest, Query) {
TEST_F(StringIndexMarisaTest, Codec) {
auto index = milvus::scalar::CreateStringIndexMarisa();
std::vector<std::string> strings(nb);
for (int i = 0; i < nb; ++i) {
strings[i] = std::to_string(std::rand() % 10);
}
*str_arr.mutable_data() = {strings.begin(), strings.end()};
str_ds = GenDsFromPB(str_arr);
index->BuildWithDataset(str_ds);
auto copy_index = milvus::scalar::CreateStringIndexMarisa();
@ -164,27 +215,52 @@ TEST_F(StringIndexMarisaTest, Codec) {
}
{
auto bitset = copy_index->In(strs.size(), strs.data());
ASSERT_EQ(bitset->size(), strs.size());
auto bitset = copy_index->In(nb, strings.data());
ASSERT_EQ(bitset->size(), nb);
ASSERT_TRUE(bitset->any());
}
{
auto bitset = copy_index->NotIn(strs.size(), strs.data());
ASSERT_EQ(bitset->size(), strs.size());
auto bitset = copy_index->NotIn(nb, strings.data());
ASSERT_EQ(bitset->size(), nb);
ASSERT_TRUE(bitset->none());
}
{
ASSERT_ANY_THROW(copy_index->Range("not important", milvus::OpType::LessEqual));
ASSERT_ANY_THROW(copy_index->Range("not important", true, "not important", true));
auto bitset = copy_index->Range("0", milvus::OpType::GreaterEqual);
ASSERT_EQ(bitset->size(), nb);
ASSERT_EQ(bitset->count(), nb);
}
{
for (size_t i = 0; i < strs.size(); i++) {
auto str = strs[i];
auto bitset = copy_index->Range("90", milvus::OpType::LessThan);
ASSERT_EQ(bitset->size(), nb);
ASSERT_EQ(bitset->count(), nb);
}
{
auto bitset = copy_index->Range("9", milvus::OpType::LessEqual);
ASSERT_EQ(bitset->size(), nb);
ASSERT_EQ(bitset->count(), nb);
}
{
auto bitset = copy_index->Range("0", true, "9", true);
ASSERT_EQ(bitset->size(), nb);
ASSERT_EQ(bitset->count(), nb);
}
{
auto bitset = copy_index->Range("0", true, "90", false);
ASSERT_EQ(bitset->size(), nb);
ASSERT_EQ(bitset->count(), nb);
}
{
for (size_t i = 0; i < nb; i++) {
auto str = strings[i];
auto bitset = copy_index->PrefixMatch(str);
ASSERT_EQ(bitset->size(), strs.size());
ASSERT_EQ(bitset->size(), nb);
ASSERT_TRUE(bitset->test(i));
}
}

View File

@ -18,6 +18,20 @@
using milvus::scalar::ScalarIndexPtr;
namespace {
// Approximate equality for floats: true when |x - y| < epsilon.
// Avoids exact operator== comparisons on floating-point results.
bool
compare_float(float x, float y, float epsilon = 0.000001f) {
    return fabs(x - y) < epsilon;
}
// Approximate equality for doubles: true when |x - y| < epsilon.
// Default epsilon is now a double literal (was 0.000001f, a float literal
// silently converted to double).
bool
compare_double(double x, double y, double epsilon = 0.000001) {
    return fabs(x - y) < epsilon;
}
template <typename T>
inline void
assert_in(const ScalarIndexPtr<T>& index, const std::vector<T>& arr) {
@ -74,6 +88,38 @@ assert_range(const ScalarIndexPtr<T>& index, const std::vector<T>& arr) {
ASSERT_TRUE(bitset5->any());
}
// Checks that Reverse_Lookup(i) reproduces arr[i] for every stored offset.
template <typename T>
inline void
assert_reverse(const ScalarIndexPtr<T>& index, const std::vector<T>& arr) {
    const auto n = arr.size();
    for (size_t i = 0; i < n; ++i) {
        ASSERT_EQ(index->Reverse_Lookup(i), arr[i]);
    }
}
// float specialization: exact equality is unreliable, so compare with an
// epsilon via compare_float.
template <>
inline void
assert_reverse(const ScalarIndexPtr<float>& index, const std::vector<float>& arr) {
    const auto n = arr.size();
    for (size_t i = 0; i < n; ++i) {
        ASSERT_TRUE(compare_float(index->Reverse_Lookup(i), arr[i]));
    }
}
// double specialization: exact equality is unreliable, so compare with an
// epsilon via compare_double.
template <>
inline void
assert_reverse(const ScalarIndexPtr<double>& index, const std::vector<double>& arr) {
    const auto n = arr.size();
    for (size_t i = 0; i < n; ++i) {
        ASSERT_TRUE(compare_double(index->Reverse_Lookup(i), arr[i]));
    }
}
// std::string specialization. Uses ASSERT_EQ with operator== instead of
// `compare(...) == 0`: equivalent comparison, but failures print both
// strings and the style matches the generic template above.
template <>
inline void
assert_reverse(const ScalarIndexPtr<std::string>& index, const std::vector<std::string>& arr) {
    for (size_t offset = 0; offset < arr.size(); ++offset) {
        ASSERT_EQ(index->Reverse_Lookup(offset), arr[offset]);
    }
}
template <>
inline void
assert_in(const ScalarIndexPtr<std::string>& index, const std::vector<std::string>& arr) {

View File

@ -15,6 +15,7 @@
#include <cstring>
#include <memory>
#include <random>
#include "google/protobuf/text_format.h"
#include <knowhere/index/vector_index/VecIndex.h>
#include <knowhere/index/vector_index/adapter/VectorAdapter.h>
#include <knowhere/index/vector_index/VecIndexFactory.h>
@ -97,8 +98,10 @@ struct GeneratedData {
break;
}
case DataType::VARCHAR: {
auto src_data = reinterpret_cast<const T*>(target_field_data.scalars().string_data().data().data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
auto ret_data = reinterpret_cast<std::string*>(ret.data());
auto src_data = target_field_data.scalars().string_data().data();
std::copy(src_data.begin(), src_data.end(), ret_data);
break;
}
default: {
@ -354,8 +357,7 @@ SearchResultToJson(const SearchResult& sr) {
};
inline void
SealedLoader(const GeneratedData& dataset, SegmentSealed& seg) {
// TODO
SealedLoadFieldData(const GeneratedData& dataset, SegmentSealed& seg) {
auto row_count = dataset.row_ids_.size();
{
LoadFieldDataInfo info;
@ -385,15 +387,14 @@ SealedLoader(const GeneratedData& dataset, SegmentSealed& seg) {
}
inline std::unique_ptr<SegmentSealed>
SealedCreator(SchemaPtr schema, const GeneratedData& dataset, const LoadIndexInfo& index_info) {
SealedCreator(SchemaPtr schema, const GeneratedData& dataset) {
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
segment->LoadIndex(index_info);
SealedLoadFieldData(dataset, *segment);
return segment;
}
inline knowhere::VecIndexPtr
GenIndexing(int64_t N, int64_t dim, const float* vec) {
GenVecIndexing(int64_t N, int64_t dim, const float* vec) {
// {knowhere::IndexParams::nprobe, 10},
auto conf = knowhere::Config{{knowhere::meta::DIM, dim},
{knowhere::IndexParams::nlist, 1024},
@ -420,4 +421,20 @@ GenScalarIndexing(int64_t N, const T* data) {
}
}
// Parses a search plan written in protobuf text format and re-serializes it
// to the binary wire format expected by CreateSearchPlanByExpr.
// Aborts (AssertInfo) if the text cannot be parsed as a plan::PlanNode.
inline std::vector<char>
translate_text_plan_to_binary_plan(const char* text_plan) {
    proto::plan::PlanNode plan_node;
    auto ok = google::protobuf::TextFormat::ParseFromString(text_plan, &plan_node);
    AssertInfo(ok, "Failed to parse");

    std::string binary_plan;
    plan_node.SerializeToString(&binary_plan);
    // Range constructor copies the bytes in one step (was resize + memcpy).
    return std::vector<char>(binary_plan.begin(), binary_plan.end());
}
} // namespace milvus::segcore

View File

@ -828,7 +828,7 @@ func (s *Segment) segmentLoadIndexData(bytesIndex [][]byte, indexInfo *querypb.F
return err
}
log.Info("updateSegmentIndex done", zap.Int64("segmentID", s.ID()))
log.Info("updateSegmentIndex done", zap.Int64("segmentID", s.ID()), zap.Int64("fieldID", indexInfo.FieldID))
return nil
}

View File

@ -253,12 +253,6 @@ func (loader *segmentLoader) loadSegmentInternal(segment *Segment,
indexInfo: indexInfo,
}
indexedFieldInfos[fieldID] = fieldInfo
if fieldBinlog.FieldID == pkFieldID {
// pk field data should always be loaded.
// segCore need a map (pk data -> offset)
fieldBinlogs = append(fieldBinlogs, fieldBinlog)
}
} else {
fieldBinlogs = append(fieldBinlogs, fieldBinlog)
}
@ -402,7 +396,7 @@ func (loader *segmentLoader) loadIndexedFieldData(segment *Segment, vecFieldInfo
if err != nil {
return err
}
log.Debug("load vector field's index data done", zap.Int64("segmentID", segment.ID()), zap.Int64("fieldID", fieldID))
log.Debug("load field's index data done", zap.Int64("segmentID", segment.ID()), zap.Int64("fieldID", fieldID))
}
segment.setIndexedFieldInfo(fieldID, fieldInfo)
}