fix: try to get span raw data for variable length data type(#43544) (#43703)

related: #43544
pr: https://github.com/milvus-io/milvus/pull/43705

Signed-off-by: MrPresent-Han <chun.han@gmail.com>
Co-authored-by: MrPresent-Han <chun.han@gmail.com>
This commit is contained in:
Chun Han 2025-08-04 11:25:39 +08:00 committed by GitHub
parent 1a2871b628
commit f033294dc1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 162 additions and 38 deletions

View File

@ -251,6 +251,20 @@ class ArrayChunk : public Chunk {
offsets_ptr);
}
// Collects one ArrayView and one validity flag for every entry of `offsets`,
// preserving the order of `offsets`.
//
// @param offsets  row offsets forwarded unchanged to View() and isValid()
//                 (presumably local to this chunk; verify against callers)
// @return pair of (views, validity flags); both have offsets.size() entries
std::pair<std::vector<ArrayView>, FixedVector<bool>>
ViewsByOffsets(const FixedVector<int32_t>& offsets) {
    std::vector<ArrayView> views;
    FixedVector<bool> valid_res;
    views.reserve(offsets.size());
    valid_res.reserve(offsets.size());
    // Iterate the offsets directly instead of indexing with an `int` counter
    // compared against a size_t bound (signed/unsigned mismatch).
    for (const auto offset : offsets) {
        views.emplace_back(View(offset));
        valid_res.emplace_back(isValid(offset));
    }
    return {std::move(views), std::move(valid_res)};
}
std::pair<std::vector<ArrayView>, FixedVector<bool>>
Views(std::optional<std::pair<int64_t, int64_t>> offset_len =
std::nullopt) const {

View File

@ -500,7 +500,8 @@ class SegmentExpr : public Expr {
if (segment_->type() == SegmentType::Sealed) {
if (segment_->is_chunked()) {
if constexpr (std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Json>) {
std::is_same_v<T, Json> ||
std::is_same_v<T, ArrayView>) {
for (size_t i = 0; i < input->size(); ++i) {
int64_t offset = (*input)[i];
auto [chunk_id, chunk_offset] =
@ -557,7 +558,8 @@ class SegmentExpr : public Expr {
return input->size();
} else {
if constexpr (std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Json>) {
std::is_same_v<T, Json> ||
std::is_same_v<T, ArrayView>) {
return ProcessDataByOffsetsForSealedSeg<T>(
func, skip_func, input, res, valid_res, values...);
}
@ -619,7 +621,6 @@ class SegmentExpr : public Expr {
TargetBitmapView valid_res,
ValTypes... values) {
int64_t processed_size = 0;
if constexpr (std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Json>) {
if (segment_->type() == SegmentType::Sealed) {
@ -742,7 +743,8 @@ class SegmentExpr : public Expr {
} else {
const bool* valid_data;
if constexpr (std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Json>) {
std::is_same_v<T, Json> ||
std::is_same_v<T, ArrayView>) {
if (segment_->type() == SegmentType::Sealed) {
auto batch_views = segment_->get_batch_views<T>(
field_id_, i, data_pos, size);

View File

@ -25,7 +25,6 @@ void
PhyJsonContainsFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
auto input = context.get_offset_input();
SetHasOffsetInput((input != nullptr));
if (expr_->vals_.empty()) {
auto real_batch_size = has_offset_input_
? context.get_offset_input()->size()

View File

@ -168,12 +168,19 @@ class ChunkedColumnBase : public ColumnBase {
}
virtual std::pair<std::vector<std::string_view>, FixedVector<bool>>
ViewsByOffsets(int64_t chunk_id,
const FixedVector<int32_t>& offsets) const {
StringViewsByOffsets(int64_t chunk_id,
const FixedVector<int32_t>& offsets) const {
PanicInfo(ErrorCode::Unsupported,
"viewsbyoffsets only supported for VariableColumn");
}
// Base-class fallback for offset-based array view lookup: it unconditionally
// raises ErrorCode::Unsupported. Column types that actually store arrays are
// expected to override it.
virtual std::pair<std::vector<ArrayView>, FixedVector<bool>>
ArrayViewsByOffsets(int64_t chunk_id,
const FixedVector<int32_t>& offsets) const {
PanicInfo(ErrorCode::Unsupported,
"viewsbyoffsets only supported for ArrayColumn");
}
std::pair<size_t, size_t>
GetChunkIDByOffset(int64_t offset) const {
AssertInfo(offset < num_rows_,
@ -355,8 +362,8 @@ class ChunkedVariableColumn : public ChunkedColumnBase {
}
std::pair<std::vector<std::string_view>, FixedVector<bool>>
ViewsByOffsets(int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override {
StringViewsByOffsets(int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override {
return std::static_pointer_cast<StringChunk>(chunks_[chunk_id])
->ViewsByOffsets(offsets);
}
@ -444,5 +451,12 @@ class ChunkedArrayColumn : public ChunkedColumnBase {
return std::dynamic_pointer_cast<ArrayChunk>(chunks_[chunk_id])
->Views(offset_len);
}
std::pair<std::vector<ArrayView>, FixedVector<bool>>
ArrayViewsByOffsets(int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override {
return std::dynamic_pointer_cast<ArrayChunk>(chunks_[chunk_id])
->ViewsByOffsets(offsets);
}
};
} // namespace milvus

View File

@ -331,11 +331,17 @@ class SingleChunkColumnBase : public ColumnBase {
}
virtual std::pair<std::vector<std::string_view>, FixedVector<bool>>
ViewsByOffsets(const FixedVector<int32_t>& offsets) const {
StringViewsByOffsets(const FixedVector<int32_t>& offsets) const {
PanicInfo(ErrorCode::Unsupported,
"viewsbyoffsets only supported for VariableColumn");
}
// Base-class fallback for offset-based array view lookup: it unconditionally
// raises ErrorCode::Unsupported. Column types that actually store arrays are
// expected to override it.
virtual std::pair<std::vector<ArrayView>, FixedVector<bool>>
ArrayViewsByOffsets(const FixedVector<int32_t>& offsets) const {
PanicInfo(ErrorCode::Unsupported,
"viewsbyoffsets only supported for ArrayColumn");
}
virtual std::string_view
RawAt(const size_t i) const {
PanicInfo(ErrorCode::Unsupported,
@ -719,7 +725,7 @@ class SingleChunkVariableColumn : public SingleChunkColumnBase {
}
std::pair<std::vector<std::string_view>, FixedVector<bool>>
ViewsByOffsets(const FixedVector<int32_t>& offsets) const {
StringViewsByOffsets(const FixedVector<int32_t>& offsets) const {
std::vector<std::string_view> res;
FixedVector<bool> valid;
res.reserve(offsets.size());
@ -728,7 +734,7 @@ class SingleChunkVariableColumn : public SingleChunkColumnBase {
res.emplace_back(RawAt(offset));
valid.emplace_back(IsValid(offset));
}
return {res, valid};
return {std::move(res), std::move(valid)};
}
[[nodiscard]] std::vector<ViewType>
@ -973,6 +979,19 @@ class SingleChunkArrayColumn : public SingleChunkColumnBase {
return {Views(), valid_data_};
}
std::pair<std::vector<ArrayView>, FixedVector<bool>>
ArrayViewsByOffsets(const FixedVector<int32_t>& offsets) const override {
std::vector<ArrayView> views;
FixedVector<bool> valid;
views.reserve(offsets.size());
valid.reserve(offsets.size());
for (auto offset : offsets) {
views.emplace_back(this->operator[](offset));
valid.emplace_back(IsValid(offset));
}
return {std::move(views), std::move(valid)};
}
protected:
void
ConstructViews() {

View File

@ -774,7 +774,7 @@ ChunkedSegmentSealedImpl::chunk_string_view_impl(
}
std::pair<std::vector<std::string_view>, FixedVector<bool>>
ChunkedSegmentSealedImpl::chunk_view_by_offsets(
ChunkedSegmentSealedImpl::chunk_string_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const {
@ -783,10 +783,28 @@ ChunkedSegmentSealedImpl::chunk_view_by_offsets(
"Can't get bitset element at " + std::to_string(field_id.get()));
if (auto it = fields_.find(field_id); it != fields_.end()) {
auto& field_data = it->second;
return field_data->ViewsByOffsets(chunk_id, offsets);
return field_data->StringViewsByOffsets(chunk_id, offsets);
}
PanicInfo(ErrorCode::UnexpectedError,
"chunk_view_by_offsets only used for variable column field ");
PanicInfo(
ErrorCode::UnexpectedError,
"chunk_string_views_by_offsets only used for variable column field ");
}
// Returns the array views and validity flags for `offsets` within chunk
// `chunk_id` of field `field_id`.
//
// Takes a shared lock on mutex_ for the duration of the call, asserts that
// the field's data is marked ready, then delegates to the column's
// ArrayViewsByOffsets. Panics with ErrorCode::UnexpectedError when the field
// is not present in fields_.
std::pair<std::vector<ArrayView>, FixedVector<bool>>
ChunkedSegmentSealedImpl::chunk_array_views_by_offsets(
    FieldId field_id,
    int64_t chunk_id,
    const FixedVector<int32_t>& offsets) const {
    std::shared_lock lck(mutex_);
    AssertInfo(get_bit(field_data_ready_bitset_, field_id),
               "Can't get bitset element at " + std::to_string(field_id.get()));
    if (auto it = fields_.find(field_id); it != fields_.end()) {
        auto& field_data = it->second;
        return field_data->ArrayViewsByOffsets(chunk_id, offsets);
    }
    // The message previously said "variable column field", copied from the
    // string variant; this accessor serves array columns.
    PanicInfo(
        ErrorCode::UnexpectedError,
        "chunk_array_views_by_offsets only used for array column field ");
}
const index::IndexBase*

View File

@ -246,9 +246,16 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
std::optional<std::pair<int64_t, int64_t>> offset_len) const override;
std::pair<std::vector<std::string_view>, FixedVector<bool>>
chunk_view_by_offsets(FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override;
chunk_string_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override;
std::pair<std::vector<ArrayView>, FixedVector<bool>>
chunk_array_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override;
std::pair<BufferView, FixedVector<bool>>
get_chunk_buffer(FieldId field_id,

View File

@ -419,12 +419,23 @@ SegmentGrowingImpl::chunk_array_view_impl(
}
std::pair<std::vector<std::string_view>, FixedVector<bool>>
SegmentGrowingImpl::chunk_view_by_offsets(
SegmentGrowingImpl::chunk_string_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const {
PanicInfo(ErrorCode::NotImplemented,
"chunk view by offsets not implemented for growing segment");
PanicInfo(
ErrorCode::NotImplemented,
"chunk string views by offsets not implemented for growing segment");
}
// Offset-based array view lookup is not implemented for growing segments:
// this override ignores its arguments and always raises
// ErrorCode::NotImplemented.
std::pair<std::vector<ArrayView>, FixedVector<bool>>
SegmentGrowingImpl::chunk_array_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const {
PanicInfo(
ErrorCode::NotImplemented,
"chunk array views by offsets not implemented for growing segment");
}
int64_t

View File

@ -376,9 +376,16 @@ class SegmentGrowingImpl : public SegmentGrowing {
std::optional<std::pair<int64_t, int64_t>> offset_len) const override;
std::pair<std::vector<std::string_view>, FixedVector<bool>>
chunk_view_by_offsets(FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override;
chunk_string_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override;
std::pair<std::vector<ArrayView>, FixedVector<bool>>
chunk_array_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override;
std::pair<BufferView, FixedVector<bool>>
get_chunk_buffer(FieldId field_id,

View File

@ -212,16 +212,19 @@ class SegmentInternalInterface : public SegmentInterface {
PanicInfo(ErrorCode::Unsupported,
"get chunk views not supported for growing segment");
}
auto chunk_view = chunk_view_by_offsets(field_id, chunk_id, offsets);
if constexpr (std::is_same_v<ViewType, std::string_view>) {
return chunk_view;
} else {
return chunk_string_views_by_offsets(field_id, chunk_id, offsets);
} else if constexpr (std::is_same_v<ViewType, Json>) {
auto chunk_view =
chunk_string_views_by_offsets(field_id, chunk_id, offsets);
std::vector<ViewType> res;
res.reserve(chunk_view.first.size());
for (const auto& view : chunk_view.first) {
res.emplace_back(view);
}
return {res, chunk_view.second};
} else if constexpr (std::is_same_v<ViewType, ArrayView>) {
return chunk_array_views_by_offsets(field_id, chunk_id, offsets);
}
}
@ -476,9 +479,15 @@ class SegmentInternalInterface : public SegmentInterface {
int64_t length) const = 0;
virtual std::pair<std::vector<std::string_view>, FixedVector<bool>>
chunk_view_by_offsets(FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const = 0;
chunk_string_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const = 0;
// internal API: return the array views and their validity flags for the
// given offsets of field `field_id` in chunk `chunk_id`; implemented per
// segment type
virtual std::pair<std::vector<ArrayView>, FixedVector<bool>>
chunk_array_views_by_offsets(FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const = 0;
// internal API: return chunk_index in span, support scalar index only
virtual const index::IndexBase*

View File

@ -778,7 +778,7 @@ SegmentSealedImpl::chunk_array_view_impl(
}
std::pair<std::vector<std::string_view>, FixedVector<bool>>
SegmentSealedImpl::chunk_view_by_offsets(
SegmentSealedImpl::chunk_string_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const {
@ -787,10 +787,27 @@ SegmentSealedImpl::chunk_view_by_offsets(
"Can't get bitset element at " + std::to_string(field_id.get()));
if (auto it = fields_.find(field_id); it != fields_.end()) {
auto& field_data = it->second;
return field_data->ViewsByOffsets(offsets);
return field_data->StringViewsByOffsets(offsets);
}
PanicInfo(
ErrorCode::UnexpectedError,
"chunk_string_views_by_offsets only used for variable column field ");
}
std::pair<std::vector<ArrayView>, FixedVector<bool>>
SegmentSealedImpl::chunk_array_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const {
std::shared_lock lck(mutex_);
AssertInfo(get_bit(field_data_ready_bitset_, field_id),
"Can't get bitset element at " + std::to_string(field_id.get()));
if (auto it = fields_.find(field_id); it != fields_.end()) {
auto& field_data = it->second;
return field_data->ArrayViewsByOffsets(offsets);
}
PanicInfo(ErrorCode::UnexpectedError,
"chunk_view_by_offsets only used for variable column field ");
"chunk_array_views_by_offsets only used for array column field ");
}
const index::IndexBase*

View File

@ -246,9 +246,16 @@ class SegmentSealedImpl : public SegmentSealed {
std::optional<std::pair<int64_t, int64_t>> offset_len) const override;
std::pair<std::vector<std::string_view>, FixedVector<bool>>
chunk_view_by_offsets(FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override;
chunk_string_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override;
std::pair<std::vector<ArrayView>, FixedVector<bool>>
chunk_array_views_by_offsets(
FieldId field_id,
int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override;
std::pair<BufferView, FixedVector<bool>>
get_chunk_buffer(FieldId field_id,

View File

@ -354,7 +354,7 @@ func NewSegment(ctx context.Context,
logger.Warn("create segment failed", zap.Error(err))
return nil, err
}
log.Info("create segment done")
logger.Info("create segment done")
segment := &LocalSegment{
baseSegment: base,