milvus/internal/core/src/index/StringIndexMarisa.h
congqixia 3123093dd7
enhance: Use MARISA_LABEL_ORDER when building trie index (#36034)
Related to #35941
Previous PR: #35943

This PR make `Trie` index using `MARISA_LABEL_ORDER`, which make
predictive search iterating in lexicographic order.

When trie index is build in label order, lexicographc could be utilized
accelerating `Range` operations.

However according to the official document, using `MARISA_LABEL_ORDER`
will make "exact match lookup, common prefix search, and predictive
search" slower.

---------

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
2024-09-09 14:29:05 +08:00

140 lines
3.6 KiB
C++

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <marisa.h>
#include "index/StringIndex.h"
#include <string>
#include <vector>
#include <map>
#include <memory>
#include "storage/MemFileManagerImpl.h"
namespace milvus::index {
class StringIndexMarisa : public StringIndex {
public:
explicit StringIndexMarisa(
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext());
int64_t
Size() override;
BinarySet
Serialize(const Config& config) override;
void
Load(const BinarySet& set, const Config& config = {}) override;
void
Load(milvus::tracer::TraceContext ctx, const Config& config = {}) override;
int64_t
Count() override {
return str_ids_.size();
}
ScalarIndexType
GetIndexType() const override {
return ScalarIndexType::MARISA;
}
void
Build(size_t n, const std::string* values) override;
void
Build(const Config& config = {}) override;
void
BuildWithFieldData(const std::vector<FieldDataPtr>& field_datas) override;
const TargetBitmap
In(size_t n, const std::string* values) override;
const TargetBitmap
NotIn(size_t n, const std::string* values) override;
const TargetBitmap
IsNull() override;
const TargetBitmap
IsNotNull() override;
const TargetBitmap
Range(std::string value, OpType op) override;
const TargetBitmap
Range(std::string lower_bound_value,
bool lb_inclusive,
std::string upper_bound_value,
bool ub_inclusive) override;
const TargetBitmap
PrefixMatch(const std::string_view prefix) override;
std::string
Reverse_Lookup(size_t offset) const override;
BinarySet
Upload(const Config& config = {}) override;
const bool
HasRawData() const override {
return true;
}
private:
void
fill_str_ids(size_t n, const std::string* values);
void
fill_offsets();
// get str_id by str, if str not found, -1 was returned.
size_t
lookup(const std::string_view str);
std::vector<size_t>
prefix_match(const std::string_view prefix);
bool
in_lexicographic_order();
void
LoadWithoutAssemble(const BinarySet& binary_set,
const Config& config) override;
private:
Config config_;
marisa::Trie trie_;
std::vector<size_t> str_ids_; // used to retrieve.
std::map<size_t, std::vector<size_t>> str_ids_to_offsets_;
bool built_ = false;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
};
using StringIndexMarisaPtr = std::unique_ptr<StringIndexMarisa>;
inline StringIndexPtr
CreateStringIndexMarisa(
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext()) {
return std::make_unique<StringIndexMarisa>(file_manager_context);
}
} // namespace milvus::index