diff --git a/internal/core/src/index/CMakeLists.txt b/internal/core/src/index/CMakeLists.txt index 489cc9475d..c0029808e6 100644 --- a/internal/core/src/index/CMakeLists.txt +++ b/internal/core/src/index/CMakeLists.txt @@ -24,6 +24,7 @@ endif () target_link_libraries(milvus_index milvus_proto + milvus_exceptions knowhere ${PLATFORM_LIBS} ) diff --git a/internal/core/src/index/ScalarIndex.h b/internal/core/src/index/ScalarIndex.h index 3147e5b594..6aa67b7a7a 100644 --- a/internal/core/src/index/ScalarIndex.h +++ b/internal/core/src/index/ScalarIndex.h @@ -38,6 +38,9 @@ class ScalarIndex : public IndexBase { virtual const TargetBitmapPtr Range(T lower_bound_value, bool lb_inclusive, T upper_bound_value, bool ub_inclusive) = 0; + virtual T + Reverse_Lookup(size_t offset) const = 0; + const TargetBitmapPtr Query(const DatasetPtr& dataset) override; }; diff --git a/internal/core/src/index/ScalarIndexSort-inl.h b/internal/core/src/index/ScalarIndexSort-inl.h index 4a5ffcbc45..b25659c079 100644 --- a/internal/core/src/index/ScalarIndexSort-inl.h +++ b/internal/core/src/index/ScalarIndexSort-inl.h @@ -42,6 +42,7 @@ template inline void ScalarIndexSort::Build(const size_t n, const T* values) { data_.reserve(n); + idx_to_offsets_.resize(n); T* p = const_cast(values); for (size_t i = 0; i < n; ++i) { data_.emplace_back(IndexStructure(*p++, i)); @@ -59,15 +60,16 @@ ScalarIndexSort::build() { throw std::invalid_argument("ScalarIndexSort cannot build null values!"); } std::sort(data_.begin(), data_.end()); + for (size_t i = 0; i < data_.size(); ++i) { + idx_to_offsets_[data_[i].idx_] = i; + } is_built_ = true; } template inline BinarySet ScalarIndexSort::Serialize(const Config& config) { - if (!is_built_) { - build(); - } + AssertInfo(is_built_, "index has not been built"); auto index_data_size = data_.size() * sizeof(IndexStructure); std::shared_ptr index_data(new uint8_t[index_data_size]); @@ -92,16 +94,18 @@ ScalarIndexSort::Load(const BinarySet& index_binary) { auto index_data = index_binary.GetByName("index_data"); data_.resize(index_size); + idx_to_offsets_.resize(index_size); memcpy(data_.data(), index_data->data.get(), (size_t)index_data->size); + for (size_t i = 0; i < data_.size(); ++i) { + idx_to_offsets_[data_[i].idx_] = i; + } is_built_ = true; } template inline const TargetBitmapPtr ScalarIndexSort::In(const size_t n, const T* values) { - if (!is_built_) { - build(); - } + AssertInfo(is_built_, "index has not been built"); TargetBitmapPtr bitset = std::make_unique(data_.size()); for (size_t i = 0; i < n; ++i) { auto lb = std::lower_bound(data_.begin(), data_.end(), IndexStructure(*(values + i))); @@ -120,9 +124,7 @@ ScalarIndexSort::In(const size_t n, const T* values) { template inline const TargetBitmapPtr ScalarIndexSort::NotIn(const size_t n, const T* values) { - if (!is_built_) { - build(); - } + AssertInfo(is_built_, "index has not been built"); TargetBitmapPtr bitset = std::make_unique(data_.size()); bitset->set(); for (size_t i = 0; i < n; ++i) { @@ -142,9 +144,7 @@ ScalarIndexSort::NotIn(const size_t n, const T* values) { template inline const TargetBitmapPtr ScalarIndexSort::Range(const T value, const OpType op) { - if (!is_built_) { - build(); - } + AssertInfo(is_built_, "index has not been built"); TargetBitmapPtr bitset = std::make_unique(data_.size()); auto lb = data_.begin(); auto ub = data_.end(); @@ -173,13 +173,11 @@ ScalarIndexSort::Range(const T value, const OpType op) { template inline const TargetBitmapPtr ScalarIndexSort::Range(T lower_bound_value, bool lb_inclusive, T upper_bound_value, bool ub_inclusive) { - if (!is_built_) { - build(); - } + AssertInfo(is_built_, "index has not been built"); TargetBitmapPtr bitset = std::make_unique(data_.size()); - if (lower_bound_value > upper_bound_value) { - std::swap(lower_bound_value, upper_bound_value); - std::swap(lb_inclusive, ub_inclusive); + if (lower_bound_value > upper_bound_value || + (lower_bound_value == upper_bound_value && !(lb_inclusive && ub_inclusive))) { + return bitset; } auto lb = data_.begin(); auto ub = data_.end(); @@ -199,4 +197,14 @@ ScalarIndexSort::Range(T lower_bound_value, bool lb_inclusive, T upper_bound_ return bitset; } +template +inline T +ScalarIndexSort::Reverse_Lookup(size_t idx) const { + AssertInfo(idx < idx_to_offsets_.size(), "out of range of total count"); + AssertInfo(is_built_, "index has not been built"); + + auto offset = idx_to_offsets_[idx]; + return data_[offset].a_; +} + } // namespace milvus::scalar diff --git a/internal/core/src/index/ScalarIndexSort.h b/internal/core/src/index/ScalarIndexSort.h index eea44c84cb..0d1f5355e1 100644 --- a/internal/core/src/index/ScalarIndexSort.h +++ b/internal/core/src/index/ScalarIndexSort.h @@ -15,9 +15,9 @@ #include #include #include +#include #include "knowhere/common/Exception.h" #include "index/IndexStructure.h" -#include #include "index/ScalarIndex.h" namespace milvus::scalar { @@ -60,6 +60,9 @@ class ScalarIndexSort : public ScalarIndex { const TargetBitmapPtr Range(T lower_bound_value, bool lb_inclusive, T upper_bound_value, bool ub_inclusive) override; + T + Reverse_Lookup(size_t offset) const override; + public: const std::vector>& GetData() { @@ -78,6 +81,7 @@ class ScalarIndexSort : public ScalarIndex { private: bool is_built_; + std::vector idx_to_offsets_; // used to retrieve. std::vector> data_; }; diff --git a/internal/core/src/index/StringIndexMarisa.cpp b/internal/core/src/index/StringIndexMarisa.cpp index 85e4ca1555..f29f78d220 100644 --- a/internal/core/src/index/StringIndexMarisa.cpp +++ b/internal/core/src/index/StringIndexMarisa.cpp @@ -9,10 +9,6 @@ // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License -#include "index/StringIndexMarisa.h" -#include "index/Utils.h" -#include "index/Index.h" - #include #include #include @@ -20,7 +16,11 @@ #include #include #include -#include "exceptions/EasyAssert.h" + +#include "index/StringIndexMarisa.h" +#include "index/Utils.h" +#include "index/Index.h" +#include "common/Utils.h" namespace milvus::scalar { @@ -150,7 +150,35 @@ StringIndexMarisa::NotIn(size_t n, const std::string* values) { const TargetBitmapPtr StringIndexMarisa::Range(std::string value, OpType op) { - throw std::runtime_error("todo: unsupported now"); + auto count = Count(); + TargetBitmapPtr bitset = std::make_unique(count); + marisa::Agent agent; + for (size_t offset = 0; offset < count; ++offset) { + agent.set_query(str_ids_[offset]); + trie_.reverse_lookup(agent); + std::string raw_data(agent.key().ptr(), agent.key().length()); + bool set = false; + switch (op) { + case OpType::LessThan: + set = raw_data.compare(value) < 0; + break; + case OpType::LessEqual: + set = raw_data.compare(value) <= 0; + break; + case OpType::GreaterThan: + set = raw_data.compare(value) > 0; + break; + case OpType::GreaterEqual: + set = raw_data.compare(value) >= 0; + break; + default: + throw std::invalid_argument(std::string("Invalid OperatorType: ") + std::to_string((int)op) + "!"); + } + if (set) { + bitset->set(offset); + } + } + return bitset; } const TargetBitmapPtr @@ -158,7 +186,33 @@ StringIndexMarisa::Range(std::string lower_bound_value, bool lb_inclusive, std::string upper_bound_value, bool ub_inclusive) { - throw std::runtime_error("todo: unsupported now"); + auto count = Count(); + TargetBitmapPtr bitset = std::make_unique(count); + if (lower_bound_value.compare(upper_bound_value) > 0 || + (lower_bound_value.compare(upper_bound_value) == 0 && !(lb_inclusive && ub_inclusive))) { + return bitset; + } + marisa::Agent agent; + for (size_t offset = 0; offset < count; ++offset) { + agent.set_query(str_ids_[offset]); + trie_.reverse_lookup(agent); + std::string raw_data(agent.key().ptr(), agent.key().length()); + bool set = true; + if (lb_inclusive) { + set &= raw_data.compare(lower_bound_value) >= 0; + } else { + set &= raw_data.compare(lower_bound_value) > 0; + } + if (ub_inclusive) { + set &= raw_data.compare(upper_bound_value) <= 0; + } else { + set &= raw_data.compare(upper_bound_value) < 0; + } + if (set) { + bitset->set(offset); + } + } + return bitset; } const TargetBitmapPtr @@ -215,6 +269,15 @@ StringIndexMarisa::prefix_match(const std::string& prefix) { return ret; } +std::string +StringIndexMarisa::Reverse_Lookup(size_t offset) const { + AssertInfo(offset < str_ids_.size(), "out of range of total count"); + marisa::Agent agent; + agent.set_query(str_ids_[offset]); + trie_.reverse_lookup(agent); + return std::string(agent.key().ptr(), agent.key().length()); +} + #endif } // namespace milvus::scalar diff --git a/internal/core/src/index/StringIndexMarisa.h b/internal/core/src/index/StringIndexMarisa.h index 662c3aa192..eb0f175d69 100644 --- a/internal/core/src/index/StringIndexMarisa.h +++ b/internal/core/src/index/StringIndexMarisa.h @@ -58,6 +58,9 @@ class StringIndexMarisa : public StringIndex { const TargetBitmapPtr PrefixMatch(std::string prefix) override; + std::string + Reverse_Lookup(size_t offset) const override; + private: void fill_str_ids(size_t n, const std::string* values); diff --git a/internal/core/src/query/Expr.h b/internal/core/src/query/Expr.h index fdbb620b4f..4dafb5ef3e 100644 --- a/internal/core/src/query/Expr.h +++ b/internal/core/src/query/Expr.h @@ -29,6 +29,8 @@ namespace milvus::query { +using optype = proto::plan::OpType; + class ExprVisitor; // Base of all Exprs diff --git a/internal/core/src/query/generated/ExecExprVisitor.h b/internal/core/src/query/generated/ExecExprVisitor.h index 7853761183..65bf6ef39f 100644 --- a/internal/core/src/query/generated/ExecExprVisitor.h +++ b/internal/core/src/query/generated/ExecExprVisitor.h @@ -64,9 +64,9 @@ class ExecExprVisitor : public ExprVisitor { auto ExecRangeVisitorImpl(FieldId field_id, IndexFunc func, ElementFunc element_func) -> BitsetType; - template + template auto - ExecDataRangeVisitorImpl(FieldId field_id, ElementFunc element_func) -> BitsetType; + ExecDataRangeVisitorImpl(FieldId field_id, IndexFunc index_func, ElementFunc element_func) -> BitsetType; template auto diff --git a/internal/core/src/query/visitors/ExecExprVisitor.cpp b/internal/core/src/query/visitors/ExecExprVisitor.cpp index d485d87eb2..f825df4e8e 100644 --- a/internal/core/src/query/visitors/ExecExprVisitor.cpp +++ b/internal/core/src/query/visitors/ExecExprVisitor.cpp @@ -179,16 +179,25 @@ ExecExprVisitor::ExecRangeVisitorImpl(FieldId field_id, IndexFunc index_func, El return final_result; } -template +template auto -ExecExprVisitor::ExecDataRangeVisitorImpl(FieldId field_id, ElementFunc element_func) -> BitsetType { +ExecExprVisitor::ExecDataRangeVisitorImpl(FieldId field_id, IndexFunc index_func, ElementFunc element_func) + -> BitsetType { auto& schema = segment_.get_schema(); auto& field_meta = schema[field_id]; auto size_per_chunk = segment_.size_per_chunk(); auto num_chunk = upper_div(row_count_, size_per_chunk); + auto indexing_barrier = segment_.num_chunk_index(field_id); + auto data_barrier = segment_.num_chunk_data(field_id); + AssertInfo(std::max(data_barrier, indexing_barrier) == num_chunk, + "max(data_barrier, index_barrier) not equal to num_chunk"); std::deque results; - for (auto chunk_id = 0; chunk_id < num_chunk; ++chunk_id) { + // for growing segment, indexing_barrier will always less than data_barrier + // so growing segment will always execute expr plan using raw data + // if sealed segment has loaded raw data on this field, then index_barrier = 0 and data_barrier = 1 + // in this case, sealed segment execute expr plan using raw data + for (auto chunk_id = 0; chunk_id < data_barrier; ++chunk_id) { auto this_size = chunk_id == num_chunk - 1 ? row_count_ - chunk_id * size_per_chunk : size_per_chunk; BitsetType result(this_size); auto chunk = segment_.chunk_data(field_id, chunk_id); @@ -199,6 +208,20 @@ ExecExprVisitor::ExecDataRangeVisitorImpl(FieldId field_id, ElementFunc element_ AssertInfo(result.size() == this_size, "[ExecExprVisitor]Chunk result size not equal to expected size"); results.emplace_back(std::move(result)); } + + // if sealed segment has loaded scalar index for this field, then index_barrier = 1 and data_barrier = 0 + // in this case, sealed segment execute expr plan using scalar index + using Index = scalar::ScalarIndex; + for (auto chunk_id = data_barrier; chunk_id < indexing_barrier; ++chunk_id) { + auto& indexing = segment_.chunk_scalar_index(field_id, chunk_id); + auto this_size = const_cast(&indexing)->Count(); + BitsetType result(this_size); + for (int offset = 0; offset < this_size; ++offset) { + result[offset] = index_func(const_cast(&indexing), offset); + } + results.emplace_back(std::move(result)); + } + auto final_result = Assemble(results); AssertInfo(final_result.size() == row_count_, "[ExecExprVisitor]Final result size not equal to row count"); return final_result; @@ -278,26 +301,46 @@ ExecExprVisitor::ExecBinaryArithOpEvalRangeVisitorDispatcher(BinaryArithOpEvalRa case OpType::Equal: { switch (arith_op) { case ArithOpType::Add: { + auto index_func = [val, right_operand](Index* index, size_t offset) { + auto x = index->Reverse_Lookup(offset); + return (x + right_operand) == val; + }; auto elem_func = [val, right_operand](T x) { return ((x + right_operand) == val); }; - return ExecDataRangeVisitorImpl(expr.field_id_, elem_func); + return ExecDataRangeVisitorImpl(expr.field_id_, index_func, elem_func); } case ArithOpType::Sub: { + auto index_func = [val, right_operand](Index* index, size_t offset) { + auto x = index->Reverse_Lookup(offset); + return (x - right_operand) == val; + }; auto elem_func = [val, right_operand](T x) { return ((x - right_operand) == val); }; - return ExecDataRangeVisitorImpl(expr.field_id_, elem_func); + return ExecDataRangeVisitorImpl(expr.field_id_, index_func, elem_func); } case ArithOpType::Mul: { + auto index_func = [val, right_operand](Index* index, size_t offset) { + auto x = index->Reverse_Lookup(offset); + return (x * right_operand) == val; + }; auto elem_func = [val, right_operand](T x) { return ((x * right_operand) == val); }; - return ExecDataRangeVisitorImpl(expr.field_id_, elem_func); + return ExecDataRangeVisitorImpl(expr.field_id_, index_func, elem_func); } case ArithOpType::Div: { + auto index_func = [val, right_operand](Index* index, size_t offset) { + auto x = index->Reverse_Lookup(offset); + return (x / right_operand) == val; + }; auto elem_func = [val, right_operand](T x) { return ((x / right_operand) == val); }; - return ExecDataRangeVisitorImpl(expr.field_id_, elem_func); + return ExecDataRangeVisitorImpl(expr.field_id_, index_func, elem_func); } case ArithOpType::Mod: { + auto index_func = [val, right_operand](Index* index, size_t offset) { + auto x = index->Reverse_Lookup(offset); + return static_cast(fmod(x, right_operand)) == val; + }; auto elem_func = [val, right_operand](T x) { return (static_cast(fmod(x, right_operand)) == val); }; - return ExecDataRangeVisitorImpl(expr.field_id_, elem_func); + return ExecDataRangeVisitorImpl(expr.field_id_, index_func, elem_func); } default: { PanicInfo("unsupported arithmetic operation"); @@ -307,26 +350,46 @@ ExecExprVisitor::ExecBinaryArithOpEvalRangeVisitorDispatcher(BinaryArithOpEvalRa case OpType::NotEqual: { switch (arith_op) { case ArithOpType::Add: { + auto index_func = [val, right_operand](Index* index, size_t offset) { + auto x = index->Reverse_Lookup(offset); + return (x + right_operand) != val; + }; auto elem_func = [val, right_operand](T x) { return ((x + right_operand) != val); }; - return ExecDataRangeVisitorImpl(expr.field_id_, elem_func); + return ExecDataRangeVisitorImpl(expr.field_id_, index_func, elem_func); } case ArithOpType::Sub: { + auto index_func = [val, right_operand](Index* index, size_t offset) { + auto x = index->Reverse_Lookup(offset); + return (x - right_operand) != val; + }; auto elem_func = [val, right_operand](T x) { return ((x - right_operand) != val); }; - return ExecDataRangeVisitorImpl(expr.field_id_, elem_func); + return ExecDataRangeVisitorImpl(expr.field_id_, index_func, elem_func); } case ArithOpType::Mul: { + auto index_func = [val, right_operand](Index* index, size_t offset) { + auto x = index->Reverse_Lookup(offset); + return (x * right_operand) != val; + }; auto elem_func = [val, right_operand](T x) { return ((x * right_operand) != val); }; - return ExecDataRangeVisitorImpl(expr.field_id_, elem_func); + return ExecDataRangeVisitorImpl(expr.field_id_, index_func, elem_func); } case ArithOpType::Div: { + auto index_func = [val, right_operand](Index* index, size_t offset) { + auto x = index->Reverse_Lookup(offset); + return (x / right_operand) != val; + }; auto elem_func = [val, right_operand](T x) { return ((x / right_operand) != val); }; - return ExecDataRangeVisitorImpl(expr.field_id_, elem_func); + return ExecDataRangeVisitorImpl(expr.field_id_, index_func, elem_func); } case ArithOpType::Mod: { + auto index_func = [val, right_operand](Index* index, size_t offset) { + auto x = index->Reverse_Lookup(offset); + return static_cast(fmod(x, right_operand)) != val; + }; auto elem_func = [val, right_operand](T x) { return (static_cast(fmod(x, right_operand)) != val); }; - return ExecDataRangeVisitorImpl(expr.field_id_, elem_func); + return ExecDataRangeVisitorImpl(expr.field_id_, index_func, elem_func); } default: { PanicInfo("unsupported arithmetic operation"); @@ -351,11 +414,7 @@ ExecExprVisitor::ExecBinaryRangeVisitorDispatcher(BinaryRangeExpr& expr_raw) -> bool upper_inclusive = expr.upper_inclusive_; T val1 = expr.lower_value_; T val2 = expr.upper_value_; - // TODO: disable check? - if (val1 > val2 || (val1 == val2 && !(lower_inclusive && upper_inclusive))) { - BitsetType res(row_count_, false); - return res; - } + auto index_func = [=](Index* index) { return index->Range(val1, lower_inclusive, val2, upper_inclusive); }; if (lower_inclusive && upper_inclusive) { auto elem_func = [val1, val2](T x) { return (val1 <= x && x <= val2); }; @@ -524,48 +583,109 @@ ExecExprVisitor::ExecCompareExprDispatcher(CompareExpr& expr, Op op) -> BitsetTy auto size_per_chunk = segment_.size_per_chunk(); auto num_chunk = upper_div(row_count_, size_per_chunk); std::deque bitsets; + + // check for sealed segment, load either raw field data or index + auto left_indexing_barrier = segment_.num_chunk_index(expr.left_field_id_); + auto left_data_barrier = segment_.num_chunk_data(expr.left_field_id_); + AssertInfo(std::max(left_data_barrier, left_indexing_barrier) == num_chunk, + "max(left_data_barrier, left_indexing_barrier) not equal to num_chunk"); + + auto right_indexing_barrier = segment_.num_chunk_index(expr.right_field_id_); + auto right_data_barrier = segment_.num_chunk_data(expr.right_field_id_); + AssertInfo(std::max(right_data_barrier, right_indexing_barrier) == num_chunk, + "max(right_data_barrier, right_indexing_barrier) not equal to num_chunk"); + for (int64_t chunk_id = 0; chunk_id < num_chunk; ++chunk_id) { auto size = chunk_id == num_chunk - 1 ? row_count_ - chunk_id * size_per_chunk : size_per_chunk; - auto getChunkData = [&, chunk_id](DataType type, FieldId field_id) -> std::function { + auto getChunkData = [&, chunk_id](DataType type, FieldId field_id, + int64_t data_barrier) -> std::function { switch (type) { case DataType::BOOL: { - auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); - return [chunk_data](int i) -> const number { return chunk_data[i]; }; + if (chunk_id < data_barrier) { + auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); + return [chunk_data](int i) -> const number { return chunk_data[i]; }; + } else { + // for case, sealed segment has loaded index for scalar field instead of raw data + auto& indexing = segment_.chunk_scalar_index(field_id, chunk_id); + return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); }; + } } case DataType::INT8: { - auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); - return [chunk_data](int i) -> const number { return chunk_data[i]; }; + if (chunk_id < data_barrier) { + auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); + return [chunk_data](int i) -> const number { return chunk_data[i]; }; + } else { + // for case, sealed segment has loaded index for scalar field instead of raw data + auto& indexing = segment_.chunk_scalar_index(field_id, chunk_id); + return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); }; + } } case DataType::INT16: { - auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); - return [chunk_data](int i) -> const number { return chunk_data[i]; }; + if (chunk_id < data_barrier) { + auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); + return [chunk_data](int i) -> const number { return chunk_data[i]; }; + } else { + // for case, sealed segment has loaded index for scalar field instead of raw data + auto& indexing = segment_.chunk_scalar_index(field_id, chunk_id); + return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); }; + } } case DataType::INT32: { - auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); - return [chunk_data](int i) -> const number { return chunk_data[i]; }; + if (chunk_id < data_barrier) { + auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); + return [chunk_data](int i) -> const number { return chunk_data[i]; }; + } else { + // for case, sealed segment has loaded index for scalar field instead of raw data + auto& indexing = segment_.chunk_scalar_index(field_id, chunk_id); + return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); }; + } } case DataType::INT64: { - auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); - return [chunk_data](int i) -> const number { return chunk_data[i]; }; + if (chunk_id < data_barrier) { + auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); + return [chunk_data](int i) -> const number { return chunk_data[i]; }; + } else { + // for case, sealed segment has loaded index for scalar field instead of raw data + auto& indexing = segment_.chunk_scalar_index(field_id, chunk_id); + return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); }; + } } case DataType::FLOAT: { - auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); - return [chunk_data](int i) -> const number { return chunk_data[i]; }; + if (chunk_id < data_barrier) { + auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); + return [chunk_data](int i) -> const number { return chunk_data[i]; }; + } else { + // for case, sealed segment has loaded index for scalar field instead of raw data + auto& indexing = segment_.chunk_scalar_index(field_id, chunk_id); + return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); }; + } } case DataType::DOUBLE: { - auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); - return [chunk_data](int i) -> const number { return chunk_data[i]; }; + if (chunk_id < data_barrier) { + auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); + return [chunk_data](int i) -> const number { return chunk_data[i]; }; + } else { + // for case, sealed segment has loaded index for scalar field instead of raw data + auto& indexing = segment_.chunk_scalar_index(field_id, chunk_id); + return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); }; + } } case DataType::VARCHAR: { - auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); - return [chunk_data](int i) -> const number { return chunk_data[i]; }; + if (chunk_id < data_barrier) { + auto chunk_data = segment_.chunk_data(field_id, chunk_id).data(); + return [chunk_data](int i) -> const number { return chunk_data[i]; }; + } else { + // for case, sealed segment has loaded index for scalar field instead of raw data + auto& indexing = segment_.chunk_scalar_index(field_id, chunk_id); + return [&indexing](int i) -> const number { return indexing.Reverse_Lookup(i); }; + } } default: PanicInfo("unsupported datatype"); } }; - auto left = getChunkData(expr.left_data_type_, expr.left_field_id_); - auto right = getChunkData(expr.right_data_type_, expr.right_field_id_); + auto left = getChunkData(expr.left_data_type_, expr.left_field_id_, left_data_barrier); + auto right = getChunkData(expr.right_data_type_, expr.right_field_id_, right_data_barrier); BitsetType bitset(size); for (int i = 0; i < size; ++i) { diff --git a/internal/core/src/segcore/SegmentGrowingImpl.h b/internal/core/src/segcore/SegmentGrowingImpl.h index 67fe99371e..88d050155a 100644 --- a/internal/core/src/segcore/SegmentGrowingImpl.h +++ b/internal/core/src/segcore/SegmentGrowingImpl.h @@ -97,6 +97,13 @@ class SegmentGrowingImpl : public SegmentGrowing { return indexing_record_.get_finished_ack(); } + // count of chunk that has raw data + int64_t + num_chunk_data(FieldId field_id) const final { + auto size = get_insert_record().ack_responder_.GetAck(); + return upper_div(size, segcore_config_.get_chunk_rows()); + } + // deprecated const knowhere::Index* chunk_index_impl(FieldId field_id, int64_t chunk_id) const final { diff --git a/internal/core/src/segcore/SegmentInterface.h b/internal/core/src/segcore/SegmentInterface.h index a7e5db7815..6f97645b2b 100644 --- a/internal/core/src/segcore/SegmentInterface.h +++ b/internal/core/src/segcore/SegmentInterface.h @@ -133,6 +133,10 @@ class SegmentInternalInterface : public SegmentInterface { virtual int64_t num_chunk_index(FieldId field_id) const = 0; + // count of chunk that has raw data + virtual int64_t + num_chunk_data(FieldId field_id) const = 0; + virtual void mask_with_timestamps(BitsetType& bitset_chunk, Timestamp timestamp) const = 0; diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index ab2eb2fa5f..17970ff3cd 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -80,6 +80,7 @@ void SegmentSealedImpl::LoadVecIndex(const LoadIndexInfo& info) { // NOTE: lock only when data is ready to avoid starvation auto field_id = FieldId(info.field_id); + auto& field_meta = schema_->operator[](field_id); auto index = std::dynamic_pointer_cast(info.index); AssertInfo(info.index_params.count("metric_type"), "Can't get metric_type in index_params"); @@ -88,8 +89,11 @@ SegmentSealedImpl::LoadVecIndex(const LoadIndexInfo& info) { AssertInfo(row_count > 0, "Index count is 0"); std::unique_lock lck(mutex_); - AssertInfo(!get_bit(vecindex_ready_bitset_, field_id), - "Can't get bitset element at " + std::to_string(field_id.get())); + // Don't allow vector raw data and index exist at the same time + AssertInfo(!get_bit(field_data_ready_bitset_, field_id), + "vector index can't be loaded when raw data exists at field " + std::to_string(field_id.get())); + AssertInfo(!get_bit(index_ready_bitset_, field_id), + "vector index has been exist at " + std::to_string(field_id.get())); if (row_count_opt_.has_value()) { AssertInfo(row_count_opt_.value() == row_count, "load data has different row count from other columns"); } else { @@ -98,7 +102,8 @@ SegmentSealedImpl::LoadVecIndex(const LoadIndexInfo& info) { AssertInfo(!vector_indexings_.is_ready(field_id), "vec index is not ready"); vector_indexings_.append_field_indexing(field_id, GetMetricType(metric_type_str), index); - set_bit(vecindex_ready_bitset_, field_id, true); + set_bit(index_ready_bitset_, field_id, true); + update_row_count(row_count); lck.unlock(); } @@ -106,22 +111,52 @@ void SegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) { // NOTE: lock only when data is ready to avoid starvation auto field_id = FieldId(info.field_id); + auto& field_meta = schema_->operator[](field_id); auto index = std::dynamic_pointer_cast(info.index); auto row_count = index->Count(); AssertInfo(row_count > 0, "Index count is 0"); std::unique_lock lck(mutex_); - + // Don't allow scalar raw data and index exist at the same time + AssertInfo(!get_bit(field_data_ready_bitset_, field_id), + "scalar index can't be loaded when raw data exists at field " + std::to_string(field_id.get())); + AssertInfo(!get_bit(index_ready_bitset_, field_id), + "scalar index has been exist at " + std::to_string(field_id.get())); if (row_count_opt_.has_value()) { AssertInfo(row_count_opt_.value() == row_count, "load data has different row count from other columns"); } else { row_count_opt_ = row_count; } - scalar_indexings_[field_id] = std::move(index); + scalar_indexings_[field_id] = index; + // reverse pk from scalar index and set pks to offset + if (schema_->get_primary_field_id() == field_id) { + AssertInfo(field_id.get() != -1, "Primary key is -1"); + AssertInfo(pk2offset_.empty(), "already exists"); + switch (field_meta.get_data_type()) { + case DataType::INT64: { + auto int64_index = std::dynamic_pointer_cast>(info.index); + for (int i = 0; i < row_count; ++i) { + pk2offset_.insert(std::make_pair(int64_index->Reverse_Lookup(i), i)); + } + break; + } + case DataType::VARCHAR: { + auto string_index = std::dynamic_pointer_cast>(info.index); + for (int i = 0; i < row_count; ++i) { + pk2offset_.insert(std::make_pair(string_index->Reverse_Lookup(i), i)); + } + break; + } + default: { + PanicInfo("unsupported primary key type"); + } + } + } - set_bit(field_data_ready_bitset_, field_id, true); + set_bit(index_ready_bitset_, field_id, true); + update_row_count(row_count); lck.unlock(); } @@ -167,12 +202,15 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) { auto data_type = field_meta.get_data_type(); AssertInfo(data_type == DataType(info.field_data->type()), "field type of load data is inconsistent with the schema"); - auto field_data = insert_record_.get_field_data_base(field_id); - AssertInfo(field_data->empty(), "already exists"); // write data under lock std::unique_lock lck(mutex_); + // Don't allow raw data and index exist at the same time + AssertInfo(!get_bit(index_ready_bitset_, field_id), "field data can't be loaded when indexing exists"); + auto field_data = insert_record_.get_field_data_base(field_id); + AssertInfo(field_data->empty(), "already exists"); + // insert data to insertRecord field_data->fill_chunk_data(size, info.field_data, field_meta); AssertInfo(field_data->num_chunk() == 1, "num chunk not equal to 1 for sealed segment"); @@ -188,15 +226,6 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) { } } - if (field_meta.is_vector()) { - AssertInfo(!vector_indexings_.is_ready(field_id), "field data can't be loaded when indexing exists"); - } else if (!scalar_indexings_.count(field_id)) { - // generate scalar index - std::unique_ptr index; - index = query::generate_scalar_index(field_data->get_span_base(0), data_type); - scalar_indexings_[field_id] = std::move(index); - } - set_bit(field_data_ready_bitset_, field_id, true); } update_row_count(info.row_count); @@ -224,9 +253,22 @@ SegmentSealedImpl::LoadDeletedRecord(const LoadDeletedRecordInfo& info) { deleted_record_.record_size_ = size; } +// internal API: support scalar index only int64_t SegmentSealedImpl::num_chunk_index(FieldId field_id) const { - return 1; + auto& field_meta = schema_->operator[](field_id); + if (field_meta.is_vector()) { + return int64_t(vector_indexings_.is_ready(field_id)); + } + + return scalar_indexings_.count(field_id); +} + +int64_t +SegmentSealedImpl::num_chunk_data(FieldId field_id) const { + auto field_data = insert_record_.get_field_data_base(field_id); + AssertInfo(field_data != nullptr, "null field data ptr"); + return field_data->num_chunk(); } int64_t @@ -253,8 +295,6 @@ SegmentSealedImpl::chunk_data_impl(FieldId field_id, int64_t chunk_id) const { const knowhere::Index* SegmentSealedImpl::chunk_index_impl(FieldId field_id, int64_t chunk_id) const { - AssertInfo(chunk_id == 0, "Chunk_id is not equal to 0"); - // TODO: support scalar index auto ptr = scalar_indexings_.at(field_id).get(); AssertInfo(ptr, "Scalar index of " + std::to_string(field_id.get()) + " is null"); return ptr; @@ -308,7 +348,7 @@ SegmentSealedImpl::vector_search(int64_t vec_count, auto& field_meta = schema_->operator[](field_id); AssertInfo(field_meta.is_vector(), "The meta type of vector field is not vector type"); - if (get_bit(vecindex_ready_bitset_, field_id)) { + if (get_bit(index_ready_bitset_, field_id)) { AssertInfo(vector_indexings_.is_ready(field_id), "vector indexes isn't ready for field " + std::to_string(field_id.get())); query::SearchOnSealed(*schema_, vector_indexings_, search_info, query_data, query_count, bitset, output, id_); @@ -383,7 +423,7 @@ SegmentSealedImpl::DropIndex(const FieldId field_id) { std::unique_lock lck(mutex_); vector_indexings_.drop_field_indexing(field_id); - set_bit(vecindex_ready_bitset_, field_id, false); + set_bit(index_ready_bitset_, field_id, false); } void @@ -396,7 +436,7 @@ SegmentSealedImpl::check_search(const query::Plan* plan) const { } auto& request_fields = plan->extra_info_opt_.value().involved_fields_; - auto field_ready_bitset = field_data_ready_bitset_ | vecindex_ready_bitset_; + auto field_ready_bitset = field_data_ready_bitset_ | index_ready_bitset_; AssertInfo(request_fields.size() == field_ready_bitset.size(), "Request fields size not equal to field ready bitset size when check search"); auto absent_fields = request_fields - field_ready_bitset; @@ -412,7 +452,7 @@ SegmentSealedImpl::SegmentSealedImpl(SchemaPtr schema, int64_t segment_id) : schema_(schema), insert_record_(*schema, MAX_ROW_COUNT), field_data_ready_bitset_(schema->size()), - vecindex_ready_bitset_(schema->size()), + index_ready_bitset_(schema->size()), scalar_indexings_(schema->size()), id_(segment_id) { } @@ -509,13 +549,26 @@ SegmentSealedImpl::fill_with_empty(FieldId field_id, int64_t count) const { std::unique_ptr SegmentSealedImpl::bulk_subscript(FieldId field_id, const int64_t* seg_offsets, int64_t count) const { - if (!HasFieldData(field_id)) { + auto& field_meta = schema_->operator[](field_id); + // if count == 0, return empty data array + if (count == 0) { + return fill_with_empty(field_id, count); + } + + if (HasIndex(field_id)) { + // if field has load scalar index, reverse raw data from index + if (!datatype_is_vector(field_meta.get_data_type())) { + AssertInfo(num_chunk() == 1, "num chunk not equal to 1 for sealed segment"); + auto index = chunk_index_impl(field_id, 0); + return ReverseDataFromIndex(index, seg_offsets, count, field_meta); + } + + // TODO: knowhere support reverse data from vector index + // Now, real data will be filled in data array using chunk manager return fill_with_empty(field_id, count); } Assert(get_bit(field_data_ready_bitset_, field_id)); - - auto& field_meta = schema_->operator[](field_id); auto field_data = insert_record_.get_field_data_base(field_id); AssertInfo(field_data->num_chunk() == 1, std::string("num chunk not equal to 1 for sealed segment, num_chunk: ") + std::to_string(field_data->num_chunk())); @@ -580,7 +633,7 @@ SegmentSealedImpl::HasIndex(FieldId field_id) const { std::shared_lock lck(mutex_); AssertInfo(!SystemProperty::Instance().IsSystem(field_id), "Field id:" + std::to_string(field_id.get()) + " isn't one of system type when drop index"); - return get_bit(vecindex_ready_bitset_, field_id); + return get_bit(index_ready_bitset_, field_id); } bool diff --git a/internal/core/src/segcore/SegmentSealedImpl.h b/internal/core/src/segcore/SegmentSealedImpl.h index 9575d8b14b..f97d7034cc 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.h +++ b/internal/core/src/segcore/SegmentSealedImpl.h @@ -64,6 +64,10 @@ class SegmentSealedImpl : public SegmentSealed { int64_t num_chunk_index(FieldId field_id) const override; + // count of chunk that has raw data + int64_t + num_chunk_data(FieldId field_id) const override; + int64_t num_chunk() const override; @@ -168,7 +172,7 @@ class SegmentSealedImpl : public SegmentSealed { private: // segment loading state BitsetType field_data_ready_bitset_; - BitsetType vecindex_ready_bitset_; + BitsetType index_ready_bitset_; std::atomic system_ready_count_ = 0; // segment datas @@ -188,7 +192,6 @@ class SegmentSealedImpl : public SegmentSealed { // pks to row offset Pk2OffsetType pk2offset_; - // std::unique_ptr primary_key_index_; SchemaPtr schema_; int64_t id_; diff --git a/internal/core/src/segcore/Utils.cpp b/internal/core/src/segcore/Utils.cpp index c2e15df058..7ab3cc2a42 100644 --- a/internal/core/src/segcore/Utils.cpp +++ b/internal/core/src/segcore/Utils.cpp @@ -9,7 +9,8 @@ // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License -#include "Utils.h" +#include "segcore/Utils.h" +#include "index/ScalarIndex.h" namespace milvus::segcore { @@ -256,6 +257,115 @@ MergeDataArray(std::vector>& result_of return data_array; } +// TODO: split scalar IndexBase with knowhere::Index +std::unique_ptr +ReverseDataFromIndex(const knowhere::Index* index, + const int64_t* seg_offsets, + int64_t count, + const FieldMeta& field_meta) { + auto data_type = field_meta.get_data_type(); + auto data_array = std::make_unique(); + data_array->set_field_id(field_meta.get_id().get()); + data_array->set_type(milvus::proto::schema::DataType(field_meta.get_data_type())); + + auto scalar_array = data_array->mutable_scalars(); + switch (data_type) { + case DataType::BOOL: { + using IndexType = scalar::ScalarIndex; + auto ptr = dynamic_cast(index); + std::vector raw_data(count); + for (int64_t i = 0; i < count; ++i) { + raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]); + } + auto obj = scalar_array->mutable_bool_data(); + *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()}; + break; + } + case DataType::INT8: { + using IndexType = scalar::ScalarIndex; + auto ptr = dynamic_cast(index); + std::vector raw_data(count); + for (int64_t i = 0; i < count; ++i) { + raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]); + } + auto obj = scalar_array->mutable_int_data(); + *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()}; + break; + } + case DataType::INT16: { + using IndexType = scalar::ScalarIndex; + auto ptr = dynamic_cast(index); + std::vector raw_data(count); + for (int64_t i = 0; i < count; ++i) { + raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]); + } + auto obj = scalar_array->mutable_int_data(); + *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()}; + break; + } + case DataType::INT32: { + using IndexType = scalar::ScalarIndex; + auto ptr = dynamic_cast(index); + std::vector raw_data(count); + for (int64_t i = 0; i < count; ++i) { + raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]); + } + auto obj = scalar_array->mutable_int_data(); + *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()}; + break; + } + case DataType::INT64: { + using IndexType = scalar::ScalarIndex; + auto ptr = dynamic_cast(index); + std::vector raw_data(count); + for (int64_t i = 0; i < count; ++i) { + raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]); + } + auto obj = scalar_array->mutable_long_data(); + *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()}; + break; + } + case DataType::FLOAT: { + using IndexType = scalar::ScalarIndex; + auto ptr = dynamic_cast(index); + std::vector raw_data(count); + for (int64_t i = 0; i < count; ++i) { + raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]); + } + auto obj = scalar_array->mutable_float_data(); + *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()}; + break; + } + case DataType::DOUBLE: { + using IndexType = scalar::ScalarIndex; + auto ptr = dynamic_cast(index); + std::vector raw_data(count); + for (int64_t i = 0; i < count; ++i) { + raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]); + } + auto obj = scalar_array->mutable_double_data(); + *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()}; + break; + } + case DataType::VARCHAR: { + using IndexType = scalar::ScalarIndex; + auto ptr = dynamic_cast(index); + std::vector raw_data(count); + for (int64_t i = 0; i < count; ++i) { + raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]); + } + auto obj = scalar_array->mutable_string_data(); + *(obj->mutable_data()) = {raw_data.begin(), raw_data.end()}; + break; + } + default: { + PanicInfo("unsupported datatype"); + } + } + + return data_array; +} + // insert_barrier means num row of insert data in a segment // del_barrier means that if the pk of the insert data is in delete record[0 : del_barrier] // then the data corresponding to this pk may be ignored when searching/querying diff --git a/internal/core/src/segcore/Utils.h b/internal/core/src/segcore/Utils.h index 6db370769d..2f628ed5a6 100644 --- a/internal/core/src/segcore/Utils.h +++ b/internal/core/src/segcore/Utils.h @@ -17,9 +17,10 @@ #include #include #include +#include "knowhere/index/Index.h" #include "common/QueryResult.h" -#include "DeletedRecord.h" -#include "InsertRecord.h" +#include "segcore/DeletedRecord.h" +#include "segcore/InsertRecord.h" namespace milvus::segcore { @@ -89,4 +90,10 @@ get_deleted_bitmap(int64_t del_barrier, const Pk2OffsetType& pk2offset, Timestamp query_timestamp); +std::unique_ptr +ReverseDataFromIndex(const knowhere::Index* index, + const int64_t* seg_offsets, + int64_t count, + const FieldMeta& field_meta); + } // namespace milvus::segcore diff --git a/internal/core/unittest/bench/bench_search.cpp b/internal/core/unittest/bench/bench_search.cpp index 0190ada888..00d66afd61 100644 --- a/internal/core/unittest/bench/bench_search.cpp +++ b/internal/core/unittest/bench/bench_search.cpp @@ -95,14 +95,14 @@ Search_Sealed(benchmark::State& state) { auto dataset_ = DataGen(schema, N); return dataset_; }(); - SealedLoader(dataset_, *segment); + SealedLoadFieldData(dataset_, *segment); auto choice = state.range(0); if (choice == 0) { // Brute Force } else if (choice == 1) { // ivf auto vec = dataset_.get_col(milvus::FieldId(100)); - auto indexing = GenIndexing(N, dim, vec.data()); + auto indexing = GenVecIndexing(N, dim, vec.data()); LoadIndexInfo info; info.index = indexing; info.field_id = (*schema)[FieldName("fakevec")].get_id().get(); diff --git a/internal/core/unittest/test_c_api.cpp b/internal/core/unittest/test_c_api.cpp index ebfe216207..c5b7464e6d 100644 --- a/internal/core/unittest/test_c_api.cpp +++ b/internal/core/unittest/test_c_api.cpp @@ -66,22 +66,6 @@ get_default_schema_config() { return conf.c_str(); } -std::vector -translate_text_plan_to_binary_plan(const char* text_plan) { - proto::plan::PlanNode plan_node; - auto ok = google::protobuf::TextFormat::ParseFromString(text_plan, &plan_node); - AssertInfo(ok, "Failed to parse"); - - std::string binary_plan; - plan_node.SerializeToString(&binary_plan); - - std::vector ret; - ret.resize(binary_plan.size()); - std::memcpy(ret.data(), binary_plan.c_str(), binary_plan.size()); - - return ret; -} - auto generate_data(int N) { std::vector raw_data; @@ -1417,7 +1401,11 @@ TEST(CApiTest, Indexing_Without_Predicate) { AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -1538,7 +1526,11 @@ TEST(CApiTest, Indexing_Expr_Without_Predicate) { AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -1677,7 +1669,11 @@ TEST(CApiTest, Indexing_With_float_Predicate_Range) { AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -1830,7 +1826,11 @@ TEST(CApiTest, Indexing_Expr_With_float_Predicate_Range) { AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -1967,7 +1967,11 @@ TEST(CApiTest, Indexing_With_float_Predicate_Term) { AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -2113,7 +2117,11 @@ TEST(CApiTest, Indexing_Expr_With_float_Predicate_Term) { AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -2252,7 +2260,11 @@ TEST(CApiTest, Indexing_With_binary_Predicate_Range) { AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -2404,7 +2416,11 @@ TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Range) { AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -2543,7 +2559,11 @@ TEST(CApiTest, Indexing_With_binary_Predicate_Term) { AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -2704,7 +2724,11 @@ TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Term) { AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -2912,7 +2936,11 @@ TEST(CApiTest, SealedSegment_search_float_Predicate_Range) { status = LoadFieldData(segment, c_ts_field_data); assert(status.error_code == Success); - auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(FieldId(100)); + sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info); + CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -3141,17 +3169,6 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) { auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ); - // gen query dataset - auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); - auto result_on_index = indexing->Query(query_dataset, conf, nullptr); - auto ids = result_on_index->Get(knowhere::meta::IDS); - auto dis = result_on_index->Get(knowhere::meta::DISTANCE); - std::vector vec_ids(ids, ids + TOPK * num_queries); - std::vector vec_dis; - for (int j = 0; j < TOPK * num_queries; ++j) { - vec_dis.push_back(dis[j] * -1); - } - auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); @@ -3169,13 +3186,17 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) { AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); + // load vec index auto load_index_info = (LoadIndexInfo*)c_load_index_info; auto query_dataset2 = knowhere::GenDataset(num_queries, DIM, query_ptr); auto index = std::dynamic_pointer_cast(load_index_info->index); auto result_on_index2 = index->Query(query_dataset2, conf, nullptr); auto ids2 = result_on_index2->Get(knowhere::meta::IDS); auto dis2 = result_on_index2->Get(knowhere::meta::DISTANCE); + status = UpdateSealedSegmentIndex(segment, c_load_index_info); + assert(status.error_code == Success); + // load raw data auto c_counter_field_data = CLoadFieldDataInfo{ 101, counter_data.data(), @@ -3203,24 +3224,16 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) { status = LoadFieldData(segment, c_ts_field_data); assert(status.error_code == Success); - status = UpdateSealedSegmentIndex(segment, c_load_index_info); - assert(status.error_code == Success); - - auto counter_index = GenScalarIndexing(N, counter_col.data()); - auto counter_index_binary_set = counter_index->Serialize(conf); - CLoadIndexInfo counter_index_info = nullptr; - status = NewLoadIndexInfo(&counter_index_info); - assert(status.error_code == Success); - status = AppendFieldInfo(counter_index_info, 101, CDataType::Int64); - assert(status.error_code == Success); - std::string counter_index_type_key = "index_type"; - std::string counter_index_type_value = "sort"; - status = AppendIndexParam(counter_index_info, counter_index_type_key.c_str(), counter_index_type_value.c_str()); - assert(status.error_code == Success); - status = AppendIndex(counter_index_info, (CBinarySet)&counter_index_binary_set); - assert(status.error_code == Success); - status = UpdateSealedSegmentIndex(segment, counter_index_info); - assert(status.error_code == Success); + // gen query dataset + auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); + auto result_on_index = indexing->Query(query_dataset, conf, nullptr); + auto ids = result_on_index->Get(knowhere::meta::IDS); + auto dis = result_on_index->Get(knowhere::meta::DISTANCE); + std::vector vec_ids(ids, ids + TOPK * num_queries); + std::vector vec_dis; + for (int j = 0; j < TOPK * num_queries; ++j) { + vec_dis.push_back(dis[j] * -1); + } CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); @@ -3239,3 +3252,157 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) { DeleteCollection(collection); DeleteSegment(segment); } + +TEST(CApiTest, RetriveScalarFieldFromSealedSegmentWithIndex) { + auto schema = std::make_shared(); + auto i8_fid = schema->AddDebugField("age8", DataType::INT8); + auto i16_fid = schema->AddDebugField("age16", DataType::INT16); + auto i32_fid = schema->AddDebugField("age32", DataType::INT32); + auto i64_fid = schema->AddDebugField("age64", DataType::INT64); + auto float_fid = schema->AddDebugField("age_float", DataType::FLOAT); + auto double_fid = schema->AddDebugField("age_double", DataType::DOUBLE); + schema->set_primary_field_id(i64_fid); + + auto segment = CreateSealedSegment(schema).release(); + + int N = ROW_COUNT; + auto raw_data = DataGen(schema, N); + LoadIndexInfo load_index_info; + + // load timestamp field + FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64); + auto ts_array = CreateScalarDataArrayFrom(raw_data.timestamps_.data(), N, ts_field_meta); + auto ts_data = serialize(ts_array.get()); + auto load_info = CLoadFieldDataInfo{TimestampFieldID.get(), ts_data.data(), ts_data.size(), N}; + auto res = LoadFieldData(segment, load_info); + assert(res.error_code == Success); + auto count = GetRowCount(segment); + assert(count == N); + + // load rowid field + FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64); + auto row_id_array = CreateScalarDataArrayFrom(raw_data.row_ids_.data(), N, row_id_field_meta); + auto row_id_data = serialize(row_id_array.get()); + load_info = CLoadFieldDataInfo{RowFieldID.get(), row_id_data.data(), row_id_data.size(), N}; + res = LoadFieldData(segment, load_info); + assert(res.error_code == Success); + count = GetRowCount(segment); + assert(count == N); + + // load index for int8 field + auto age8_col = raw_data.get_col(i8_fid); + GenScalarIndexing(N, age8_col.data()); + auto age8_index = milvus::scalar::CreateScalarIndexSort(); + age8_index->Build(N, age8_col.data()); + load_index_info.field_id = i8_fid.get(); + load_index_info.field_type = Int8; + load_index_info.index = std::shared_ptr>(age8_index.release()); + segment->LoadIndex(load_index_info); + + // load index for 16 field + auto age16_col = raw_data.get_col(i16_fid); + GenScalarIndexing(N, age16_col.data()); + auto age16_index = milvus::scalar::CreateScalarIndexSort(); + age16_index->Build(N, age16_col.data()); + load_index_info.field_id = i16_fid.get(); + load_index_info.field_type = Int16; + load_index_info.index = std::shared_ptr>(age16_index.release()); + segment->LoadIndex(load_index_info); + + // load index for int32 field + auto age32_col = raw_data.get_col(i32_fid); + GenScalarIndexing(N, age32_col.data()); + auto age32_index = milvus::scalar::CreateScalarIndexSort(); + age32_index->Build(N, age32_col.data()); + load_index_info.field_id = i32_fid.get(); + load_index_info.field_type = Int32; + load_index_info.index = std::shared_ptr>(age32_index.release()); + segment->LoadIndex(load_index_info); + + // load index for int64 field + auto age64_col = raw_data.get_col(i64_fid); + GenScalarIndexing(N, age64_col.data()); + auto age64_index = milvus::scalar::CreateScalarIndexSort(); + age64_index->Build(N, age64_col.data()); + load_index_info.field_id = i64_fid.get(); + load_index_info.field_type = Int64; + load_index_info.index = std::shared_ptr>(age64_index.release()); + segment->LoadIndex(load_index_info); + + // load index for float field + auto age_float_col = raw_data.get_col(float_fid); + GenScalarIndexing(N, age_float_col.data()); + auto age_float_index = milvus::scalar::CreateScalarIndexSort(); + age_float_index->Build(N, age_float_col.data()); + load_index_info.field_id = float_fid.get(); + load_index_info.field_type = Float; + load_index_info.index = std::shared_ptr>(age_float_index.release()); + segment->LoadIndex(load_index_info); + + // load index for double field + auto age_double_col = raw_data.get_col(double_fid); + GenScalarIndexing(N, age_double_col.data()); + auto age_double_index = milvus::scalar::CreateScalarIndexSort(); + age_double_index->Build(N, age_double_col.data()); + load_index_info.field_id = double_fid.get(); + load_index_info.field_type = Float; + load_index_info.index = std::shared_ptr>(age_double_index.release()); + segment->LoadIndex(load_index_info); + + // create retrieve plan + auto plan = std::make_unique(*schema); + plan->plan_node_ = std::make_unique(); + std::vector retrive_row_ids = {age64_col[0]}; + auto term_expr = std::make_unique>(i64_fid, DataType::INT64, retrive_row_ids); + plan->plan_node_->predicate_ = std::move(term_expr); + std::vector target_field_ids; + + // retrieve value + target_field_ids = {i8_fid, i16_fid, i32_fid, i64_fid, float_fid, double_fid}; + plan->field_ids_ = target_field_ids; + + CRetrieveResult retrieve_result; + res = Retrieve(segment, plan.get(), raw_data.timestamps_[N - 1], &retrieve_result); + ASSERT_EQ(res.error_code, Success); + auto query_result = std::make_unique(); + auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); + ASSERT_TRUE(suc); + ASSERT_EQ(query_result->fields_data().size(), 6); + auto fields_data = query_result->fields_data(); + for (auto iter = fields_data.begin(); iter < fields_data.end(); ++iter) { + switch (iter->type()) { + case proto::schema::DataType::Int8: { + ASSERT_EQ(iter->scalars().int_data().data(0), age8_col[0]); + break; + } + case proto::schema::DataType::Int16: { + ASSERT_EQ(iter->scalars().int_data().data(0), age16_col[0]); + break; + } + case proto::schema::DataType::Int32: { + ASSERT_EQ(iter->scalars().int_data().data(0), age32_col[0]); + break; + } + case proto::schema::DataType::Int64: { + ASSERT_EQ(iter->scalars().long_data().data(0), age64_col[0]); + break; + } + case proto::schema::DataType::Float: { + ASSERT_EQ(iter->scalars().float_data().data(0), age_float_col[0]); + break; + } + case proto::schema::DataType::Double: { + ASSERT_EQ(iter->scalars().double_data().data(0), age_double_col[0]); + break; + } + default: { + PanicInfo("not supported type"); + } + } + } + + DeleteRetrievePlan(plan.release()); + DeleteRetrieveResult(&retrieve_result); + + DeleteSegment(segment); +} diff --git a/internal/core/unittest/test_expr.cpp b/internal/core/unittest/test_expr.cpp index 8882f98093..40c37c4828 100644 --- a/internal/core/unittest/test_expr.cpp +++ b/internal/core/unittest/test_expr.cpp @@ -20,6 +20,7 @@ #include "query/generated/ExecExprVisitor.h" #include "segcore/SegmentGrowingImpl.h" #include "test_utils/DataGen.h" +#include "index/IndexFactory.h" using namespace milvus; @@ -583,6 +584,183 @@ TEST(Expr, TestCompare) { } } +TEST(Expr, TestCompareWithScalarIndex) { + using namespace milvus::query; + using namespace milvus::segcore; + std::vector>> testcases = { + {R"(LessThan)", [](int a, int64_t b) { return a < b; }}, + {R"(LessEqual)", [](int a, int64_t b) { return a <= b; }}, + {R"(GreaterThan)", [](int a, int64_t b) { return a > b; }}, + {R"(GreaterEqual)", [](int a, int64_t b) { return a >= b; }}, + {R"(Equal)", [](int a, int64_t b) { return a == b; }}, + {R"(NotEqual)", [](int a, int64_t b) { return a != b; }}, + }; + + std::string serialized_expr_plan = R"(vector_anns: < + field_id: %1% + predicates: < + compare_expr: < + left_column_info: < + field_id: %3% + data_type: %4% + > + right_column_info: < + field_id: %5% + data_type: %6% + > + op: %2% + > + > + query_info: < + topk: 10 + round_decimal: 3 + metric_type: "L2" + search_params: "{\"nprobe\": 10}" + > + placeholder_tag: "$0" + >)"; + + auto schema = std::make_shared(); + auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2); + auto i32_fid = schema->AddDebugField("age32", DataType::INT32); + auto i64_fid = schema->AddDebugField("age64", DataType::INT64); + schema->set_primary_field_id(i64_fid); + + auto seg = CreateSealedSegment(schema); + int N = 1000; + auto raw_data = DataGen(schema, N); + LoadIndexInfo load_index_info; + + // load index for int32 field + auto age32_col = raw_data.get_col(i32_fid); + age32_col[0] = 1000; + GenScalarIndexing(N, age32_col.data()); + auto age32_index = milvus::scalar::CreateScalarIndexSort(); + age32_index->Build(N, age32_col.data()); + load_index_info.field_id = i32_fid.get(); + load_index_info.field_type = Int32; + load_index_info.index = std::shared_ptr>(age32_index.release()); + seg->LoadIndex(load_index_info); + + // load index for int64 field + auto age64_col = raw_data.get_col(i64_fid); + age64_col[0] = 2000; + GenScalarIndexing(N, age64_col.data()); + auto age64_index = milvus::scalar::CreateScalarIndexSort(); + age64_index->Build(N, age64_col.data()); + load_index_info.field_id = i64_fid.get(); + load_index_info.field_type = Int64; + load_index_info.index = std::shared_ptr>(age64_index.release()); + seg->LoadIndex(load_index_info); + + ExecExprVisitor visitor(*seg, seg->get_row_count(), MAX_TIMESTAMP); + for (auto [clause, ref_func] : testcases) { + auto dsl_string = boost::format(serialized_expr_plan) % vec_fid.get() % clause % i32_fid.get() % + proto::schema::DataType_Name(int(DataType::INT32)) % i64_fid.get() % + proto::schema::DataType_Name(int(DataType::INT64)); + auto binary_plan = translate_text_plan_to_binary_plan(dsl_string.str().data()); + auto plan = CreateSearchPlanByExpr(*schema, binary_plan.data(), binary_plan.size()); + // std::cout << ShowPlanNodeVisitor().call_child(*plan->plan_node_) << std::endl; + auto final = visitor.call_child(*plan->plan_node_->predicate_.value()); + EXPECT_EQ(final.size(), N); + + for (int i = 0; i < N; ++i) { + auto ans = final[i]; + auto val1 = age32_col[i]; + auto val2 = age64_col[i]; + auto ref = ref_func(val1, val2); + ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << boost::format("[%1%, %2%]") % val1 % val2; + } + } +} + +TEST(Expr, TestCompareWithScalarIndexMaris) { + using namespace milvus::query; + using namespace milvus::segcore; + std::vector>> testcases = { + {R"(LessThan)", [](std::string a, std::string b) { return a.compare(b) < 0; }}, + {R"(LessEqual)", [](std::string a, std::string b) { return a.compare(b) <= 0; }}, + {R"(GreaterThan)", [](std::string a, std::string b) { return a.compare(b) > 0; }}, + {R"(GreaterEqual)", [](std::string a, std::string b) { return a.compare(b) >= 0; }}, + {R"(Equal)", [](std::string a, std::string b) { return a.compare(b) == 0; }}, + {R"(NotEqual)", [](std::string a, std::string b) { return a.compare(b) != 0; }}, + }; + + const char* serialized_expr_plan = R"(vector_anns: < + field_id: %1% + predicates: < + compare_expr: < + left_column_info: < + field_id: %3% + data_type: VarChar + > + right_column_info: < + field_id: %4% + data_type: VarChar + > + op: %2% + > + > + query_info: < + topk: 10 + round_decimal: 3 + metric_type: "L2" + search_params: "{\"nprobe\": 10}" + > + placeholder_tag: "$0" + >)"; + + auto schema = std::make_shared(); + auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2); + auto str1_fid = schema->AddDebugField("string1", DataType::VARCHAR); + auto str2_fid = schema->AddDebugField("string2", DataType::VARCHAR); + schema->set_primary_field_id(str1_fid); + + auto seg = CreateSealedSegment(schema); + int N = 1000; + auto raw_data = DataGen(schema, N); + LoadIndexInfo load_index_info; + + // load index for int32 field + auto str1_col = raw_data.get_col(str1_fid); + GenScalarIndexing(N, str1_col.data()); + auto str1_index = milvus::scalar::CreateScalarIndexSort(); + str1_index->Build(N, str1_col.data()); + load_index_info.field_id = str1_fid.get(); + load_index_info.field_type = VarChar; + load_index_info.index = std::shared_ptr>(str1_index.release()); + seg->LoadIndex(load_index_info); + + // load index for int64 field + auto str2_col = raw_data.get_col(str2_fid); + GenScalarIndexing(N, str2_col.data()); + auto str2_index = milvus::scalar::CreateScalarIndexSort(); + str2_index->Build(N, str2_col.data()); + load_index_info.field_id = str2_fid.get(); + load_index_info.field_type = VarChar; + load_index_info.index = std::shared_ptr>(str2_index.release()); + seg->LoadIndex(load_index_info); + + ExecExprVisitor visitor(*seg, seg->get_row_count(), MAX_TIMESTAMP); + for (auto [clause, ref_func] : testcases) { + auto dsl_string = + boost::format(serialized_expr_plan) % vec_fid.get() % clause % str1_fid.get() % str2_fid.get(); + auto binary_plan = translate_text_plan_to_binary_plan(dsl_string.str().data()); + auto plan = CreateSearchPlanByExpr(*schema, binary_plan.data(), binary_plan.size()); + // std::cout << ShowPlanNodeVisitor().call_child(*plan->plan_node_) << std::endl; + auto final = visitor.call_child(*plan->plan_node_->predicate_.value()); + EXPECT_EQ(final.size(), N); + + for (int i = 0; i < N; ++i) { + auto ans = final[i]; + auto val1 = str1_col[i]; + auto val2 = str2_col[i]; + auto ref = ref_func(val1, val2); + ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << boost::format("[%1%, %2%]") % val1 % val2; + } + } +} + TEST(Expr, TestBinaryArithOpEvalRange) { using namespace milvus::query; using namespace milvus::segcore; @@ -960,3 +1138,302 @@ TEST(Expr, TestBinaryArithOpEvalRangeExceptions) { } } } + +TEST(Expr, TestBinaryArithOpEvalRangeWithScalarSortIndex) { + using namespace milvus::query; + using namespace milvus::segcore; + std::vector, DataType>> testcases = { + // Add test cases for BinaryArithOpEvalRangeExpr EQ of various data types + {R"(arith_op: Add + right_operand: < + int64_val: 4 + > + op: Equal + value: < + int64_val: 8 + >)", + [](int8_t v) { return (v + 4) == 8; }, DataType::INT8}, + {R"(arith_op: Sub + right_operand: < + int64_val: 500 + > + op: Equal + value: < + int64_val: 1500 + >)", + [](int16_t v) { return (v - 500) == 1500; }, DataType::INT16}, + {R"(arith_op: Mul + right_operand: < + int64_val: 2 + > + op: Equal + value: < + int64_val: 4000 + >)", + [](int32_t v) { return (v * 2) == 4000; }, DataType::INT32}, + {R"(arith_op: Div + right_operand: < + int64_val: 2 + > + op: Equal + value: < + int64_val: 1000 + >)", + [](int64_t v) { return (v / 2) == 1000; }, DataType::INT64}, + {R"(arith_op: Mod + right_operand: < + int64_val: 100 + > + op: Equal + value: < + int64_val: 0 + >)", + [](int32_t v) { return (v % 100) == 0; }, DataType::INT32}, + {R"(arith_op: Add + right_operand: < + float_val: 500 + > + op: Equal + value: < + float_val: 2500 + >)", + [](float v) { return (v + 500) == 2500; }, DataType::FLOAT}, + {R"(arith_op: Add + right_operand: < + float_val: 500 + > + op: Equal + value: < + float_val: 2500 + >)", + [](double v) { return (v + 500) == 2500; }, DataType::DOUBLE}, + {R"(arith_op: Add + right_operand: < + float_val: 500 + > + op: NotEqual + value: < + float_val: 2000 + >)", + [](float v) { return (v + 500) != 2000; }, DataType::FLOAT}, + {R"(arith_op: Sub + right_operand: < + float_val: 500 + > + op: NotEqual + value: < + float_val: 2500 + >)", + [](double v) { return (v - 500) != 2000; }, DataType::DOUBLE}, + {R"(arith_op: Mul + right_operand: < + int64_val: 2 + > + op: NotEqual + value: < + int64_val: 2 + >)", + [](int8_t v) { return (v * 2) != 2; }, DataType::INT8}, + {R"(arith_op: Div + right_operand: < + int64_val: 2 + > + op: NotEqual + value: < + int64_val: 2000 + >)", + [](int16_t v) { return (v / 2) != 2000; }, DataType::INT16}, + {R"(arith_op: Mod + right_operand: < + int64_val: 100 + > + op: NotEqual + value: < + int64_val: 1 + >)", + [](int32_t v) { return (v % 100) != 1; }, DataType::INT32}, + {R"(arith_op: Add + right_operand: < + int64_val: 500 + > + op: NotEqual + value: < + int64_val: 2000 + >)", + [](int64_t v) { return (v + 500) != 2000; }, DataType::INT64}, + }; + + std::string serialized_expr_plan = R"(vector_anns: < + field_id: %1% + predicates: < + binary_arith_op_eval_range_expr: < + @@@@@ + > + > + query_info: < + topk: 10 + round_decimal: 3 + metric_type: "L2" + search_params: "{\"nprobe\": 10}" + > + placeholder_tag: "$0" + >)"; + + std::string arith_expr = R"( + column_info: < + field_id: %2% + data_type: %3% + > + @@@@)"; + + auto schema = std::make_shared(); + auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2); + auto i8_fid = schema->AddDebugField("age8", DataType::INT8); + auto i16_fid = schema->AddDebugField("age16", DataType::INT16); + auto i32_fid = schema->AddDebugField("age32", DataType::INT32); + auto i64_fid = schema->AddDebugField("age64", DataType::INT64); + auto float_fid = schema->AddDebugField("age_float", DataType::FLOAT); + auto double_fid = schema->AddDebugField("age_double", DataType::DOUBLE); + schema->set_primary_field_id(i64_fid); + + auto seg = CreateSealedSegment(schema); + int N = 1000; + auto raw_data = DataGen(schema, N); + LoadIndexInfo load_index_info; + + // load index for int8 field + auto age8_col = raw_data.get_col(i8_fid); + age8_col[0] = 4; + GenScalarIndexing(N, age8_col.data()); + auto age8_index = milvus::scalar::CreateScalarIndexSort(); + age8_index->Build(N, age8_col.data()); + load_index_info.field_id = i8_fid.get(); + load_index_info.field_type = Int8; + load_index_info.index = std::shared_ptr>(age8_index.release()); + seg->LoadIndex(load_index_info); + + // load index for 16 field + auto age16_col = raw_data.get_col(i16_fid); + age16_col[0] = 2000; + GenScalarIndexing(N, age16_col.data()); + auto age16_index = milvus::scalar::CreateScalarIndexSort(); + age16_index->Build(N, age16_col.data()); + load_index_info.field_id = i16_fid.get(); + load_index_info.field_type = Int16; + load_index_info.index = std::shared_ptr>(age16_index.release()); + seg->LoadIndex(load_index_info); + + // load index for int32 field + auto age32_col = raw_data.get_col(i32_fid); + age32_col[0] = 2000; + GenScalarIndexing(N, age32_col.data()); + auto age32_index = milvus::scalar::CreateScalarIndexSort(); + age32_index->Build(N, age32_col.data()); + load_index_info.field_id = i32_fid.get(); + load_index_info.field_type = Int32; + load_index_info.index = std::shared_ptr>(age32_index.release()); + seg->LoadIndex(load_index_info); + + // load index for int64 field + auto age64_col = raw_data.get_col(i64_fid); + age64_col[0] = 2000; + GenScalarIndexing(N, age64_col.data()); + auto age64_index = milvus::scalar::CreateScalarIndexSort(); + age64_index->Build(N, age64_col.data()); + load_index_info.field_id = i64_fid.get(); + load_index_info.field_type = Int64; + load_index_info.index = std::shared_ptr>(age64_index.release()); + seg->LoadIndex(load_index_info); + + // load index for float field + auto age_float_col = raw_data.get_col(float_fid); + age_float_col[0] = 2000; + GenScalarIndexing(N, age_float_col.data()); + auto age_float_index = milvus::scalar::CreateScalarIndexSort(); + age_float_index->Build(N, age_float_col.data()); + load_index_info.field_id = float_fid.get(); + load_index_info.field_type = Float; + load_index_info.index = std::shared_ptr>(age_float_index.release()); + seg->LoadIndex(load_index_info); + + // load index for double field + auto age_double_col = raw_data.get_col(double_fid); + age_double_col[0] = 2000; + GenScalarIndexing(N, age_double_col.data()); + auto age_double_index = milvus::scalar::CreateScalarIndexSort(); + age_double_index->Build(N, age_double_col.data()); + load_index_info.field_id = double_fid.get(); + load_index_info.field_type = Float; + load_index_info.index = std::shared_ptr>(age_double_index.release()); + seg->LoadIndex(load_index_info); + + auto seg_promote = dynamic_cast(seg.get()); + ExecExprVisitor visitor(*seg_promote, seg_promote->get_row_count(), MAX_TIMESTAMP); + int offset = 0; + for (auto [clause, ref_func, dtype] : testcases) { + auto loc = serialized_expr_plan.find("@@@@@"); + auto expr_plan = serialized_expr_plan; + expr_plan.replace(loc, 5, arith_expr); + loc = expr_plan.find("@@@@"); + expr_plan.replace(loc, 4, clause); + boost::format expr; + if (dtype == DataType::INT8) { + expr = boost::format(expr_plan) % vec_fid.get() % i8_fid.get() % + proto::schema::DataType_Name(int(DataType::INT8)); + } else if (dtype == DataType::INT16) { + expr = boost::format(expr_plan) % vec_fid.get() % i16_fid.get() % + proto::schema::DataType_Name(int(DataType::INT16)); + } else if (dtype == DataType::INT32) { + expr = boost::format(expr_plan) % vec_fid.get() % i32_fid.get() % + proto::schema::DataType_Name(int(DataType::INT32)); + } else if (dtype == DataType::INT64) { + expr = boost::format(expr_plan) % vec_fid.get() % i64_fid.get() % + proto::schema::DataType_Name(int(DataType::INT64)); + } else if (dtype == DataType::FLOAT) { + expr = boost::format(expr_plan) % vec_fid.get() % float_fid.get() % + proto::schema::DataType_Name(int(DataType::FLOAT)); + } else if (dtype == DataType::DOUBLE) { + expr = boost::format(expr_plan) % vec_fid.get() % double_fid.get() % + proto::schema::DataType_Name(int(DataType::DOUBLE)); + } else { + ASSERT_TRUE(false) << "No test case defined for this data type"; + } + + auto binary_plan = translate_text_plan_to_binary_plan(expr.str().data()); + auto plan = CreateSearchPlanByExpr(*schema, binary_plan.data(), binary_plan.size()); + + auto final = visitor.call_child(*plan->plan_node_->predicate_.value()); + EXPECT_EQ(final.size(), N); + + for (int i = 0; i < N; ++i) { + auto ans = final[i]; + if (dtype == DataType::INT8) { + auto val = age8_col[i]; + auto ref = ref_func(val); + ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val; + } else if (dtype == DataType::INT16) { + auto val = age16_col[i]; + auto ref = ref_func(val); + ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val; + } else if (dtype == DataType::INT32) { + auto val = age32_col[i]; + auto ref = ref_func(val); + ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val; + } else if (dtype == DataType::INT64) { + auto val = age64_col[i]; + auto ref = ref_func(val); + ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val; + } else if (dtype == DataType::FLOAT) { + auto val = age_float_col[i]; + auto ref = ref_func(val); + ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val; + } else if (dtype == DataType::DOUBLE) { + auto val = age_double_col[i]; + auto ref = ref_func(val); + ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val; + } else { + ASSERT_TRUE(false) << "No test case defined for this data type"; + } + } + } +} diff --git a/internal/core/unittest/test_query.cpp b/internal/core/unittest/test_query.cpp index bce2280e53..1e6dba0e8a 100644 --- a/internal/core/unittest/test_query.cpp +++ b/internal/core/unittest/test_query.cpp @@ -660,8 +660,8 @@ TEST(Query, FillSegment) { }()); segments.emplace_back([&] { auto segment = CreateSealedSegment(schema); - SealedLoader(dataset, *segment); - // auto indexing = GenIndexing(N, dim, std_vfloat_vec.data()); + SealedLoadFieldData(dataset, *segment); + // auto indexing = GenVecIndexing(N, dim, std_vfloat_vec.data()); // LoadIndexInfo info; // auto field_offset = schema->get_offset(FieldName("fakevec")); diff --git a/internal/core/unittest/test_retrieve.cpp b/internal/core/unittest/test_retrieve.cpp index fdb45f3820..a8b367929a 100644 --- a/internal/core/unittest/test_retrieve.cpp +++ b/internal/core/unittest/test_retrieve.cpp @@ -57,7 +57,7 @@ TEST(Retrieve, AutoID) { auto dataset = DataGen(schema, N); auto segment = CreateSealedSegment(schema); - SealedLoader(dataset, *segment); + SealedLoadFieldData(dataset, *segment); auto i64_col = dataset.get_col(fid_64); auto plan = std::make_unique(*schema); @@ -107,7 +107,7 @@ TEST(Retrieve, AutoID2) { auto dataset = DataGen(schema, N); auto segment = CreateSealedSegment(schema); - SealedLoader(dataset, *segment); + SealedLoadFieldData(dataset, *segment); auto i64_col = dataset.get_col(fid_64); auto plan = std::make_unique(*schema); @@ -153,7 +153,7 @@ TEST(Retrieve, NotExist) { auto dataset = DataGen(schema, N); auto segment = CreateSealedSegment(schema); - SealedLoader(dataset, *segment); + SealedLoadFieldData(dataset, *segment); auto i64_col = dataset.get_col(fid_64); auto plan = std::make_unique(*schema); @@ -236,7 +236,7 @@ TEST(Retrieve, LargeTimestamp) { uint64_t ts_offset = 100; auto dataset = DataGen(schema, N, 42, ts_offset + 1); auto segment = CreateSealedSegment(schema); - SealedLoader(dataset, *segment); + SealedLoadFieldData(dataset, *segment); auto i64_col = dataset.get_col(fid_64); auto plan = std::make_unique(*schema); @@ -285,7 +285,7 @@ TEST(Retrieve, Delete) { auto dataset = DataGen(schema, N); auto segment = CreateSealedSegment(schema); - SealedLoader(dataset, *segment); + SealedLoadFieldData(dataset, *segment); auto i64_col = dataset.get_col(fid_64); auto plan = std::make_unique(*schema); diff --git a/internal/core/unittest/test_scalar_index.cpp b/internal/core/unittest/test_scalar_index.cpp index f3436107a4..19303ad569 100644 --- a/internal/core/unittest/test_scalar_index.cpp +++ b/internal/core/unittest/test_scalar_index.cpp @@ -88,6 +88,18 @@ TYPED_TEST_P(TypedScalarIndexTest, NotIn) { } } +TYPED_TEST_P(TypedScalarIndexTest, Reverse) { + using T = TypeParam; + auto dtype = milvus::GetDType(); + auto index_types = GetIndexTypes(); + for (const auto& index_type : index_types) { + auto index = milvus::scalar::IndexFactory::GetInstance().CreateIndex(index_type); + auto arr = GenArr(nb); + index->Build(nb, arr.data()); + assert_reverse(index, arr); + } +} + TYPED_TEST_P(TypedScalarIndexTest, Range) { using T = TypeParam; auto dtype = milvus::GetDType(); @@ -123,6 +135,6 @@ TYPED_TEST_P(TypedScalarIndexTest, Codec) { // TODO: it's easy to overflow for int8_t. Design more reasonable ut. using ScalarT = ::testing::Types; -REGISTER_TYPED_TEST_CASE_P(TypedScalarIndexTest, Dummy, Constructor, Count, In, NotIn, Range, Codec); +REGISTER_TYPED_TEST_CASE_P(TypedScalarIndexTest, Dummy, Constructor, Count, In, NotIn, Range, Codec, Reverse); INSTANTIATE_TYPED_TEST_CASE_P(ArithmeticCheck, TypedScalarIndexTest, ScalarT); diff --git a/internal/core/unittest/test_sealed.cpp b/internal/core/unittest/test_sealed.cpp index 1be652ad2c..788e861628 100644 --- a/internal/core/unittest/test_sealed.cpp +++ b/internal/core/unittest/test_sealed.cpp @@ -110,7 +110,11 @@ TEST(Sealed, without_predicate) { load_info.index = indexing; load_info.index_params["metric_type"] = "L2"; - auto sealed_segment = SealedCreator(schema, dataset, load_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(fake_id); + sealed_segment->LoadIndex(load_info); + sr = sealed_segment->Search(plan.get(), *ph_group, time); auto post_result = SearchResultToJson(*sr); @@ -202,7 +206,11 @@ TEST(Sealed, with_predicate) { load_info.index = indexing; load_info.index_params["metric_type"] = "L2"; - auto sealed_segment = SealedCreator(schema, dataset, load_info); + // load index for vec field, load raw data for scalar filed + auto sealed_segment = SealedCreator(schema, dataset); + sealed_segment->DropFieldData(fake_id); + sealed_segment->LoadIndex(load_info); + sr = sealed_segment->Search(plan.get(), *ph_group, time); for (int i = 0; i < num_queries; ++i) { @@ -229,7 +237,7 @@ TEST(Sealed, LoadFieldData) { auto fakevec = dataset.get_col(fakevec_id); - auto indexing = GenIndexing(N, dim, fakevec.data()); + auto indexing = GenVecIndexing(N, dim, fakevec.data()); auto segment = CreateSealedSegment(schema); std::string dsl = R"({ @@ -268,7 +276,7 @@ TEST(Sealed, LoadFieldData) { ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time)); - SealedLoader(dataset, *segment); + SealedLoadFieldData(dataset, *segment); segment->DropFieldData(nothing_id); segment->Search(plan.get(), *ph_group, time); @@ -282,8 +290,8 @@ TEST(Sealed, LoadFieldData) { segment->LoadIndex(vec_info); ASSERT_EQ(segment->num_chunk(), 1); - ASSERT_EQ(segment->num_chunk_index(double_id), 1); - ASSERT_EQ(segment->num_chunk_index(str_id), 1); + ASSERT_EQ(segment->num_chunk_index(double_id), 0); + ASSERT_EQ(segment->num_chunk_index(str_id), 0); auto chunk_span1 = segment->chunk_data(counter_id, 0); auto chunk_span2 = segment->chunk_data(double_id, 0); auto chunk_span3 = segment->chunk_data(str_id, 0); @@ -349,7 +357,7 @@ TEST(Sealed, LoadScalarIndex) { auto fakevec = dataset.get_col(fakevec_id); - auto indexing = GenIndexing(N, dim, fakevec.data()); + auto indexing = GenVecIndexing(N, dim, fakevec.data()); auto segment = CreateSealedSegment(schema); std::string dsl = R"({ @@ -386,7 +394,21 @@ TEST(Sealed, LoadScalarIndex) { auto ph_group_raw = CreatePlaceholderGroup(num_queries, 16, 1024); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); - SealedLoader(dataset, *segment); + LoadFieldDataInfo row_id_info; + FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64); + auto array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta); + row_id_info.field_data = array.release(); + row_id_info.row_count = dataset.row_ids_.size(); + row_id_info.field_id = RowFieldID.get(); // field id for RowId + segment->LoadFieldData(row_id_info); + + LoadFieldDataInfo ts_info; + FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64); + array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta); + ts_info.field_data = array.release(); + ts_info.row_count = dataset.timestamps_.size(); + ts_info.field_id = TimestampFieldID.get(); + segment->LoadFieldData(ts_info); LoadIndexInfo vec_info; vec_info.field_id = fakevec_id.get(); @@ -477,7 +499,7 @@ TEST(Sealed, Delete) { ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time)); - SealedLoader(dataset, *segment); + SealedLoadFieldData(dataset, *segment); int64_t row_count = 5; std::vector pks{1, 2, 3, 4, 5}; diff --git a/internal/core/unittest/test_string_index.cpp b/internal/core/unittest/test_string_index.cpp index f24483fd88..b68bbf552c 100644 --- a/internal/core/unittest/test_string_index.cpp +++ b/internal/core/unittest/test_string_index.cpp @@ -87,10 +87,52 @@ TEST_F(StringIndexMarisaTest, NotIn) { TEST_F(StringIndexMarisaTest, Range) { auto index = milvus::scalar::CreateStringIndexMarisa(); + std::vector strings(nb); + for (int i = 0; i < nb; ++i) { + strings[i] = std::to_string(std::rand() % 10); + } + *str_arr.mutable_data() = {strings.begin(), strings.end()}; + str_ds = GenDsFromPB(str_arr); index->BuildWithDataset(str_ds); - ASSERT_ANY_THROW(index->Range("not important", milvus::OpType::LessEqual)); - ASSERT_ANY_THROW(index->Range("not important", true, "not important", true)); + { + auto bitset = index->Range("0", milvus::OpType::GreaterEqual); + ASSERT_EQ(bitset->size(), nb); + ASSERT_EQ(bitset->count(), nb); + } + + { + auto bitset = index->Range("90", milvus::OpType::LessThan); + ASSERT_EQ(bitset->size(), nb); + ASSERT_EQ(bitset->count(), nb); + } + + { + auto bitset = index->Range("9", milvus::OpType::LessEqual); + ASSERT_EQ(bitset->size(), nb); + ASSERT_EQ(bitset->count(), nb); + } + + { + auto bitset = index->Range("0", true, "9", true); + ASSERT_EQ(bitset->size(), nb); + ASSERT_EQ(bitset->count(), nb); + } + + { + auto bitset = index->Range("0", true, "90", false); + ASSERT_EQ(bitset->size(), nb); + ASSERT_EQ(bitset->count(), nb); + } +} + +TEST_F(StringIndexMarisaTest, Reverse) { + auto index_types = GetIndexTypes(); + for (const auto& index_type : index_types) { + auto index = milvus::scalar::IndexFactory::GetInstance().CreateIndex(index_type); + index->BuildWithDataset(str_ds); + assert_reverse(index, strs); + } } TEST_F(StringIndexMarisaTest, PrefixMatch) { @@ -126,18 +168,21 @@ TEST_F(StringIndexMarisaTest, Query) { { auto ds = std::make_shared(); ds->Set(milvus::scalar::OPERATOR_TYPE, milvus::OpType::GreaterEqual); - ds->Set(milvus::scalar::RANGE_VALUE, "range"); - ASSERT_ANY_THROW(index->Query(ds)); + ds->Set(milvus::scalar::RANGE_VALUE, "0"); + auto bitset = index->Query(ds); + ASSERT_EQ(bitset->size(), strs.size()); + ASSERT_EQ(bitset->count(), strs.size()); } { auto ds = std::make_shared(); ds->Set(milvus::scalar::OPERATOR_TYPE, milvus::OpType::Range); - ds->Set(milvus::scalar::LOWER_BOUND_VALUE, "range"); + ds->Set(milvus::scalar::LOWER_BOUND_VALUE, "0"); ds->Set(milvus::scalar::UPPER_BOUND_VALUE, "range"); ds->Set(milvus::scalar::LOWER_BOUND_INCLUSIVE, true); ds->Set(milvus::scalar::UPPER_BOUND_INCLUSIVE, true); - ASSERT_ANY_THROW(index->Query(ds)); + auto bitset = index->Query(ds); + ASSERT_TRUE(bitset->any()); } { @@ -154,6 +199,12 @@ TEST_F(StringIndexMarisaTest, Query) { TEST_F(StringIndexMarisaTest, Codec) { auto index = milvus::scalar::CreateStringIndexMarisa(); + std::vector strings(nb); + for (int i = 0; i < nb; ++i) { + strings[i] = std::to_string(std::rand() % 10); + } + *str_arr.mutable_data() = {strings.begin(), strings.end()}; + str_ds = GenDsFromPB(str_arr); index->BuildWithDataset(str_ds); auto copy_index = milvus::scalar::CreateStringIndexMarisa(); @@ -164,27 +215,52 @@ TEST_F(StringIndexMarisaTest, Codec) { } { - auto bitset = copy_index->In(strs.size(), strs.data()); - ASSERT_EQ(bitset->size(), strs.size()); + auto bitset = copy_index->In(nb, strings.data()); + ASSERT_EQ(bitset->size(), nb); ASSERT_TRUE(bitset->any()); } { - auto bitset = copy_index->NotIn(strs.size(), strs.data()); - ASSERT_EQ(bitset->size(), strs.size()); + auto bitset = copy_index->NotIn(nb, strings.data()); + ASSERT_EQ(bitset->size(), nb); ASSERT_TRUE(bitset->none()); } { - ASSERT_ANY_THROW(copy_index->Range("not important", milvus::OpType::LessEqual)); - ASSERT_ANY_THROW(copy_index->Range("not important", true, "not important", true)); + auto bitset = copy_index->Range("0", milvus::OpType::GreaterEqual); + ASSERT_EQ(bitset->size(), nb); + ASSERT_EQ(bitset->count(), nb); } { - for (size_t i = 0; i < strs.size(); i++) { - auto str = strs[i]; + auto bitset = copy_index->Range("90", milvus::OpType::LessThan); + ASSERT_EQ(bitset->size(), nb); + ASSERT_EQ(bitset->count(), nb); + } + + { + auto bitset = copy_index->Range("9", milvus::OpType::LessEqual); + ASSERT_EQ(bitset->size(), nb); + ASSERT_EQ(bitset->count(), nb); + } + + { + auto bitset = copy_index->Range("0", true, "9", true); + ASSERT_EQ(bitset->size(), nb); + ASSERT_EQ(bitset->count(), nb); + } + + { + auto bitset = copy_index->Range("0", true, "90", false); + ASSERT_EQ(bitset->size(), nb); + ASSERT_EQ(bitset->count(), nb); + } + + { + for (size_t i = 0; i < nb; i++) { + auto str = strings[i]; auto bitset = copy_index->PrefixMatch(str); - ASSERT_EQ(bitset->size(), strs.size()); + ASSERT_EQ(bitset->size(), nb); ASSERT_TRUE(bitset->test(i)); } } diff --git a/internal/core/unittest/test_utils/AssertUtils.h b/internal/core/unittest/test_utils/AssertUtils.h index 603595b82a..8b7dd2a8c9 100644 --- a/internal/core/unittest/test_utils/AssertUtils.h +++ b/internal/core/unittest/test_utils/AssertUtils.h @@ -18,6 +18,20 @@ using milvus::scalar::ScalarIndexPtr; namespace { + +bool +compare_float(float x, float y, float epsilon = 0.000001f) { + if (fabs(x - y) < epsilon) + return true; + return false; +} +bool +compare_double(double x, double y, double epsilon = 0.000001f) { + if (fabs(x - y) < epsilon) + return true; + return false; +} + template inline void assert_in(const ScalarIndexPtr& index, const std::vector& arr) { @@ -74,6 +88,38 @@ assert_range(const ScalarIndexPtr& index, const std::vector& arr) { ASSERT_TRUE(bitset5->any()); } +template +inline void +assert_reverse(const ScalarIndexPtr& index, const std::vector& arr) { + for (size_t offset = 0; offset < arr.size(); ++offset) { + ASSERT_EQ(index->Reverse_Lookup(offset), arr[offset]); + } +} + +template <> +inline void +assert_reverse(const ScalarIndexPtr& index, const std::vector& arr) { + for (size_t offset = 0; offset < arr.size(); ++offset) { + ASSERT_TRUE(compare_float(index->Reverse_Lookup(offset), arr[offset])); + } +} + +template <> +inline void +assert_reverse(const ScalarIndexPtr& index, const std::vector& arr) { + for (size_t offset = 0; offset < arr.size(); ++offset) { + ASSERT_TRUE(compare_double(index->Reverse_Lookup(offset), arr[offset])); + } +} + +template <> +inline void +assert_reverse(const ScalarIndexPtr& index, const std::vector& arr) { + for (size_t offset = 0; offset < arr.size(); ++offset) { + ASSERT_TRUE(arr[offset].compare(index->Reverse_Lookup(offset)) == 0); + } +} + template <> inline void assert_in(const ScalarIndexPtr& index, const std::vector& arr) { diff --git a/internal/core/unittest/test_utils/DataGen.h b/internal/core/unittest/test_utils/DataGen.h index e507e157a7..cdde383930 100644 --- a/internal/core/unittest/test_utils/DataGen.h +++ b/internal/core/unittest/test_utils/DataGen.h @@ -15,6 +15,7 @@ #include #include #include +#include "google/protobuf/text_format.h" #include #include #include @@ -97,8 +98,10 @@ struct GeneratedData { break; } case DataType::VARCHAR: { - auto src_data = reinterpret_cast(target_field_data.scalars().string_data().data().data()); - std::copy_n(src_data, raw_->num_rows(), ret.data()); + auto ret_data = reinterpret_cast(ret.data()); + auto src_data = target_field_data.scalars().string_data().data(); + std::copy(src_data.begin(), src_data.end(), ret_data); + break; } default: { @@ -354,8 +357,7 @@ SearchResultToJson(const SearchResult& sr) { }; inline void -SealedLoader(const GeneratedData& dataset, SegmentSealed& seg) { - // TODO +SealedLoadFieldData(const GeneratedData& dataset, SegmentSealed& seg) { auto row_count = dataset.row_ids_.size(); { LoadFieldDataInfo info; @@ -385,15 +387,14 @@ SealedLoader(const GeneratedData& dataset, SegmentSealed& seg) { } inline std::unique_ptr -SealedCreator(SchemaPtr schema, const GeneratedData& dataset, const LoadIndexInfo& index_info) { +SealedCreator(SchemaPtr schema, const GeneratedData& dataset) { auto segment = CreateSealedSegment(schema); - SealedLoader(dataset, *segment); - segment->LoadIndex(index_info); + SealedLoadFieldData(dataset, *segment); return segment; } inline knowhere::VecIndexPtr -GenIndexing(int64_t N, int64_t dim, const float* vec) { +GenVecIndexing(int64_t N, int64_t dim, const float* vec) { // {knowhere::IndexParams::nprobe, 10}, auto conf = knowhere::Config{{knowhere::meta::DIM, dim}, {knowhere::IndexParams::nlist, 1024}, @@ -420,4 +421,20 @@ GenScalarIndexing(int64_t N, const T* data) { } } +inline std::vector +translate_text_plan_to_binary_plan(const char* text_plan) { + proto::plan::PlanNode plan_node; + auto ok = google::protobuf::TextFormat::ParseFromString(text_plan, &plan_node); + AssertInfo(ok, "Failed to parse"); + + std::string binary_plan; + plan_node.SerializeToString(&binary_plan); + + std::vector ret; + ret.resize(binary_plan.size()); + std::memcpy(ret.data(), binary_plan.c_str(), binary_plan.size()); + + return ret; +} + } // namespace milvus::segcore diff --git a/internal/querynode/segment.go b/internal/querynode/segment.go index 718e862f73..1304793005 100644 --- a/internal/querynode/segment.go +++ b/internal/querynode/segment.go @@ -828,7 +828,7 @@ func (s *Segment) segmentLoadIndexData(bytesIndex [][]byte, indexInfo *querypb.F return err } - log.Info("updateSegmentIndex done", zap.Int64("segmentID", s.ID())) + log.Info("updateSegmentIndex done", zap.Int64("segmentID", s.ID()), zap.Int64("fieldID", indexInfo.FieldID)) return nil } diff --git a/internal/querynode/segment_loader.go b/internal/querynode/segment_loader.go index 411cc734ea..ddb60b3a6a 100644 --- a/internal/querynode/segment_loader.go +++ b/internal/querynode/segment_loader.go @@ -253,12 +253,6 @@ func (loader *segmentLoader) loadSegmentInternal(segment *Segment, indexInfo: indexInfo, } indexedFieldInfos[fieldID] = fieldInfo - - if fieldBinlog.FieldID == pkFieldID { - // pk field data should always be loaded. - // segCore need a map (pk data -> offset) - fieldBinlogs = append(fieldBinlogs, fieldBinlog) - } } else { fieldBinlogs = append(fieldBinlogs, fieldBinlog) } @@ -402,7 +396,7 @@ func (loader *segmentLoader) loadIndexedFieldData(segment *Segment, vecFieldInfo if err != nil { return err } - log.Debug("load vector field's index data done", zap.Int64("segmentID", segment.ID()), zap.Int64("fieldID", fieldID)) + log.Debug("load field's index data done", zap.Int64("segmentID", segment.ID()), zap.Int64("fieldID", fieldID)) } segment.setIndexedFieldInfo(fieldID, fieldInfo) }