mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
enhance: allow many segments for inverted index (#35616)
fix: https://github.com/milvus-io/milvus/issues/35615 --------- Signed-off-by: longjiquan <jiquan.long@zilliz.com>
This commit is contained in:
parent
d8aa01bc1a
commit
a52ba3d09d
@ -420,10 +420,10 @@ InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
|
|||||||
// only used in ut.
|
// only used in ut.
|
||||||
auto arr = static_cast<const boost::container::vector<T>*>(values);
|
auto arr = static_cast<const boost::container::vector<T>*>(values);
|
||||||
for (size_t i = 0; i < n; i++) {
|
for (size_t i = 0; i < n; i++) {
|
||||||
wrapper_->template add_multi_data(arr[i].data(), arr[i].size());
|
wrapper_->template add_multi_data(arr[i].data(), arr[i].size(), i);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
wrapper_->add_data<T>(static_cast<const T*>(values), n);
|
wrapper_->add_data<T>(static_cast<const T*>(values), n, 0);
|
||||||
}
|
}
|
||||||
finish();
|
finish();
|
||||||
}
|
}
|
||||||
@ -449,20 +449,27 @@ InvertedIndexTantivy<T>::BuildWithFieldData(
|
|||||||
case proto::schema::DataType::Double:
|
case proto::schema::DataType::Double:
|
||||||
case proto::schema::DataType::String:
|
case proto::schema::DataType::String:
|
||||||
case proto::schema::DataType::VarChar: {
|
case proto::schema::DataType::VarChar: {
|
||||||
for (const auto& data : field_datas) {
|
int64_t offset = 0;
|
||||||
auto n = data->get_num_rows();
|
if (schema_.nullable()) {
|
||||||
if (schema_.nullable()) {
|
for (const auto& data : field_datas) {
|
||||||
|
auto n = data->get_num_rows();
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
if (!data->is_valid(i)) {
|
if (!data->is_valid(i)) {
|
||||||
null_offset.push_back(i);
|
null_offset.push_back(i);
|
||||||
}
|
}
|
||||||
wrapper_->add_multi_data<T>(
|
wrapper_->add_multi_data<T>(
|
||||||
static_cast<const T*>(data->RawValue(i)),
|
static_cast<const T*>(data->RawValue(i)),
|
||||||
data->is_valid(i));
|
data->is_valid(i),
|
||||||
|
offset++);
|
||||||
}
|
}
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
wrapper_->add_data<T>(static_cast<const T*>(data->Data()), n);
|
} else {
|
||||||
|
for (const auto& data : field_datas) {
|
||||||
|
auto n = data->get_num_rows();
|
||||||
|
wrapper_->add_data<T>(
|
||||||
|
static_cast<const T*>(data->Data()), n, offset);
|
||||||
|
offset += n;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -483,6 +490,7 @@ template <typename T>
|
|||||||
void
|
void
|
||||||
InvertedIndexTantivy<T>::build_index_for_array(
|
InvertedIndexTantivy<T>::build_index_for_array(
|
||||||
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas) {
|
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas) {
|
||||||
|
int64_t offset = 0;
|
||||||
for (const auto& data : field_datas) {
|
for (const auto& data : field_datas) {
|
||||||
auto n = data->get_num_rows();
|
auto n = data->get_num_rows();
|
||||||
auto array_column = static_cast<const Array*>(data->Data());
|
auto array_column = static_cast<const Array*>(data->Data());
|
||||||
@ -494,7 +502,9 @@ InvertedIndexTantivy<T>::build_index_for_array(
|
|||||||
}
|
}
|
||||||
auto length = data->is_valid(i) ? array_column[i].length() : 0;
|
auto length = data->is_valid(i) ? array_column[i].length() : 0;
|
||||||
wrapper_->template add_multi_data(
|
wrapper_->template add_multi_data(
|
||||||
reinterpret_cast<const T*>(array_column[i].data()), length);
|
reinterpret_cast<const T*>(array_column[i].data()),
|
||||||
|
length,
|
||||||
|
offset++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -503,6 +513,7 @@ template <>
|
|||||||
void
|
void
|
||||||
InvertedIndexTantivy<std::string>::build_index_for_array(
|
InvertedIndexTantivy<std::string>::build_index_for_array(
|
||||||
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas) {
|
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas) {
|
||||||
|
int64_t offset = 0;
|
||||||
for (const auto& data : field_datas) {
|
for (const auto& data : field_datas) {
|
||||||
auto n = data->get_num_rows();
|
auto n = data->get_num_rows();
|
||||||
auto array_column = static_cast<const Array*>(data->Data());
|
auto array_column = static_cast<const Array*>(data->Data());
|
||||||
@ -519,7 +530,7 @@ InvertedIndexTantivy<std::string>::build_index_for_array(
|
|||||||
array_column[i].template get_data<std::string>(j));
|
array_column[i].template get_data<std::string>(j));
|
||||||
}
|
}
|
||||||
auto length = data->is_valid(i) ? output.size() : 0;
|
auto length = data->is_valid(i) ? output.size() : 0;
|
||||||
wrapper_->template add_multi_data(output.data(), length);
|
wrapper_->template add_multi_data(output.data(), length, offset++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -34,7 +34,7 @@ add_custom_target(tantivy_binding_target DEPENDS compile_tantivy ls_cargo_target
|
|||||||
|
|
||||||
set(INSTALL_COMMAND
|
set(INSTALL_COMMAND
|
||||||
cp ${LIB_HEADER_FOLDER}/tantivy-binding.h ${TANTIVY_INCLUDE_DIR}/ &&
|
cp ${LIB_HEADER_FOLDER}/tantivy-binding.h ${TANTIVY_INCLUDE_DIR}/ &&
|
||||||
cp ${CMAKE_CURRENT_SOURCE_DIR}/tantivy-wrapper.h ${TANTIVY_INCLUDE_DIR}/ &&
|
cp ${CMAKE_CURRENT_SOURCE_DIR}/*.h ${TANTIVY_INCLUDE_DIR}/ &&
|
||||||
cp ${LIB_FILE} ${TANTIVY_LIB_DIR}/)
|
cp ${LIB_FILE} ${TANTIVY_LIB_DIR}/)
|
||||||
add_custom_command(OUTPUT install_tantivy
|
add_custom_command(OUTPUT install_tantivy
|
||||||
COMMENT "Install tantivy target ${LIB_FILE} to ${TANTIVY_LIB_DIR}"
|
COMMENT "Install tantivy target ${LIB_FILE} to ${TANTIVY_LIB_DIR}"
|
||||||
|
|||||||
2
internal/core/thirdparty/tantivy/bench.cpp
vendored
2
internal/core/thirdparty/tantivy/bench.cpp
vendored
@ -29,7 +29,7 @@ build_index(size_t n = 1000000) {
|
|||||||
arr.push_back(std::to_string(x));
|
arr.push_back(std::to_string(x));
|
||||||
}
|
}
|
||||||
|
|
||||||
w.add_data<std::string>(arr.data(), arr.size());
|
w.add_data<std::string>(arr.data(), arr.size(), 0);
|
||||||
|
|
||||||
w.finish();
|
w.finish();
|
||||||
assert(w.count() == n);
|
assert(w.count() == n);
|
||||||
|
|||||||
65
internal/core/thirdparty/tantivy/rust-array.h
vendored
Normal file
65
internal/core/thirdparty/tantivy/rust-array.h
vendored
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
#include "tantivy-binding.h"
|
||||||
|
#include "rust-binding.h"
|
||||||
|
|
||||||
|
namespace milvus::tantivy {
|
||||||
|
|
||||||
|
struct RustArrayWrapper {
|
||||||
|
NO_COPY_OR_ASSIGN(RustArrayWrapper);
|
||||||
|
|
||||||
|
explicit RustArrayWrapper(RustArray array) : array_(array) {
|
||||||
|
}
|
||||||
|
|
||||||
|
RustArrayWrapper(RustArrayWrapper&& other) noexcept {
|
||||||
|
array_.array = other.array_.array;
|
||||||
|
array_.len = other.array_.len;
|
||||||
|
array_.cap = other.array_.cap;
|
||||||
|
other.array_.array = nullptr;
|
||||||
|
other.array_.len = 0;
|
||||||
|
other.array_.cap = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
RustArrayWrapper&
|
||||||
|
operator=(RustArrayWrapper&& other) noexcept {
|
||||||
|
if (this != &other) {
|
||||||
|
free();
|
||||||
|
array_.array = other.array_.array;
|
||||||
|
array_.len = other.array_.len;
|
||||||
|
array_.cap = other.array_.cap;
|
||||||
|
other.array_.array = nullptr;
|
||||||
|
other.array_.len = 0;
|
||||||
|
other.array_.cap = 0;
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
~RustArrayWrapper() {
|
||||||
|
free();
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
debug() {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "[ ";
|
||||||
|
for (int i = 0; i < array_.len; i++) {
|
||||||
|
ss << array_.array[i] << " ";
|
||||||
|
}
|
||||||
|
ss << "]";
|
||||||
|
std::cout << ss.str() << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
RustArray array_;
|
||||||
|
|
||||||
|
private:
|
||||||
|
void
|
||||||
|
free() {
|
||||||
|
if (array_.array != nullptr) {
|
||||||
|
free_rust_array(array_);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} // namespace milvus::tantivy
|
||||||
7
internal/core/thirdparty/tantivy/rust-binding.h
vendored
Normal file
7
internal/core/thirdparty/tantivy/rust-binding.h
vendored
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
namespace milvus::tantivy {
|
||||||
|
#define NO_COPY_OR_ASSIGN(ClassName) \
|
||||||
|
ClassName(const ClassName&) = delete; \
|
||||||
|
ClassName& operator=(const ClassName&) = delete;
|
||||||
|
} // namespace milvus::tantivy
|
||||||
@ -23,6 +23,8 @@ extern "C" {
|
|||||||
|
|
||||||
void free_rust_array(RustArray array);
|
void free_rust_array(RustArray array);
|
||||||
|
|
||||||
|
void print_vector_of_strings(const char *const *ptr, uintptr_t len);
|
||||||
|
|
||||||
void *tantivy_load_index(const char *path);
|
void *tantivy_load_index(const char *path);
|
||||||
|
|
||||||
void tantivy_free_index_reader(void *ptr);
|
void tantivy_free_index_reader(void *ptr);
|
||||||
@ -75,46 +77,51 @@ RustArray tantivy_prefix_query_keyword(void *ptr, const char *prefix);
|
|||||||
|
|
||||||
RustArray tantivy_regex_query(void *ptr, const char *pattern);
|
RustArray tantivy_regex_query(void *ptr, const char *pattern);
|
||||||
|
|
||||||
void *tantivy_create_index(const char *field_name, TantivyDataType data_type, const char *path);
|
void *tantivy_create_index(const char *field_name,
|
||||||
|
TantivyDataType data_type,
|
||||||
|
const char *path,
|
||||||
|
uintptr_t num_threads,
|
||||||
|
uintptr_t overall_memory_budget_in_bytes);
|
||||||
|
|
||||||
void tantivy_free_index_writer(void *ptr);
|
void tantivy_free_index_writer(void *ptr);
|
||||||
|
|
||||||
void tantivy_finish_index(void *ptr);
|
void tantivy_finish_index(void *ptr);
|
||||||
|
|
||||||
void tantivy_index_add_int8s(void *ptr, const int8_t *array, uintptr_t len);
|
void tantivy_index_add_int8s(void *ptr, const int8_t *array, uintptr_t len, int64_t offset_begin);
|
||||||
|
|
||||||
void tantivy_index_add_int16s(void *ptr, const int16_t *array, uintptr_t len);
|
void tantivy_index_add_int16s(void *ptr, const int16_t *array, uintptr_t len, int64_t offset_begin);
|
||||||
|
|
||||||
void tantivy_index_add_int32s(void *ptr, const int32_t *array, uintptr_t len);
|
void tantivy_index_add_int32s(void *ptr, const int32_t *array, uintptr_t len, int64_t offset_begin);
|
||||||
|
|
||||||
void tantivy_index_add_int64s(void *ptr, const int64_t *array, uintptr_t len);
|
void tantivy_index_add_int64s(void *ptr, const int64_t *array, uintptr_t len, int64_t offset_begin);
|
||||||
|
|
||||||
void tantivy_index_add_f32s(void *ptr, const float *array, uintptr_t len);
|
void tantivy_index_add_f32s(void *ptr, const float *array, uintptr_t len, int64_t offset_begin);
|
||||||
|
|
||||||
void tantivy_index_add_f64s(void *ptr, const double *array, uintptr_t len);
|
void tantivy_index_add_f64s(void *ptr, const double *array, uintptr_t len, int64_t offset_begin);
|
||||||
|
|
||||||
void tantivy_index_add_bools(void *ptr, const bool *array, uintptr_t len);
|
void tantivy_index_add_bools(void *ptr, const bool *array, uintptr_t len, int64_t offset_begin);
|
||||||
|
|
||||||
void tantivy_index_add_keyword(void *ptr, const char *s);
|
void tantivy_index_add_string(void *ptr, const char *s, int64_t offset);
|
||||||
|
|
||||||
void tantivy_index_add_multi_int8s(void *ptr, const int8_t *array, uintptr_t len);
|
void tantivy_index_add_multi_int8s(void *ptr, const int8_t *array, uintptr_t len, int64_t offset);
|
||||||
|
|
||||||
void tantivy_index_add_multi_int16s(void *ptr, const int16_t *array, uintptr_t len);
|
void tantivy_index_add_multi_int16s(void *ptr, const int16_t *array, uintptr_t len, int64_t offset);
|
||||||
|
|
||||||
void tantivy_index_add_multi_int32s(void *ptr, const int32_t *array, uintptr_t len);
|
void tantivy_index_add_multi_int32s(void *ptr, const int32_t *array, uintptr_t len, int64_t offset);
|
||||||
|
|
||||||
void tantivy_index_add_multi_int64s(void *ptr, const int64_t *array, uintptr_t len);
|
void tantivy_index_add_multi_int64s(void *ptr, const int64_t *array, uintptr_t len, int64_t offset);
|
||||||
|
|
||||||
void tantivy_index_add_multi_f32s(void *ptr, const float *array, uintptr_t len);
|
void tantivy_index_add_multi_f32s(void *ptr, const float *array, uintptr_t len, int64_t offset);
|
||||||
|
|
||||||
void tantivy_index_add_multi_f64s(void *ptr, const double *array, uintptr_t len);
|
void tantivy_index_add_multi_f64s(void *ptr, const double *array, uintptr_t len, int64_t offset);
|
||||||
|
|
||||||
void tantivy_index_add_multi_bools(void *ptr, const bool *array, uintptr_t len);
|
void tantivy_index_add_multi_bools(void *ptr, const bool *array, uintptr_t len, int64_t offset);
|
||||||
|
|
||||||
void tantivy_index_add_multi_keywords(void *ptr, const char *const *array, uintptr_t len);
|
void tantivy_index_add_multi_keywords(void *ptr,
|
||||||
|
const char *const *array,
|
||||||
|
uintptr_t len,
|
||||||
|
int64_t offset);
|
||||||
|
|
||||||
bool tantivy_index_exist(const char *path);
|
bool tantivy_index_exist(const char *path);
|
||||||
|
|
||||||
void print_vector_of_strings(const char *const *ptr, uintptr_t len);
|
|
||||||
|
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
|
|||||||
@ -1,14 +1,13 @@
|
|||||||
use std::{ffi::{c_char, CStr}, slice};
|
use std::{
|
||||||
|
ffi::{c_char, CStr},
|
||||||
|
slice,
|
||||||
|
};
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn print_vector_of_strings(ptr: *const *const c_char, len: usize) {
|
pub extern "C" fn print_vector_of_strings(ptr: *const *const c_char, len: usize) {
|
||||||
let arr : &[*const c_char] = unsafe {
|
let arr: &[*const c_char] = unsafe { slice::from_raw_parts(ptr, len) };
|
||||||
slice::from_raw_parts(ptr, len)
|
for element in arr {
|
||||||
};
|
let c_str = unsafe { CStr::from_ptr(*element) };
|
||||||
for element in arr {
|
println!("{}", c_str.to_str().unwrap());
|
||||||
let c_str = unsafe {
|
}
|
||||||
CStr::from_ptr(*element)
|
}
|
||||||
};
|
|
||||||
println!("{}", c_str.to_str().unwrap());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
60
internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs
vendored
Normal file
60
internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs
vendored
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
use tantivy::{
|
||||||
|
collector::{Collector, SegmentCollector},
|
||||||
|
fastfield::Column,
|
||||||
|
DocId, Score, SegmentOrdinal, SegmentReader,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub(crate) struct DocIdCollector;
|
||||||
|
|
||||||
|
impl Collector for DocIdCollector {
|
||||||
|
type Fruit = Vec<u32>;
|
||||||
|
type Child = DocIdChildCollector;
|
||||||
|
|
||||||
|
fn for_segment(
|
||||||
|
&self,
|
||||||
|
_segment_local_id: SegmentOrdinal,
|
||||||
|
segment: &SegmentReader,
|
||||||
|
) -> tantivy::Result<Self::Child> {
|
||||||
|
Ok(DocIdChildCollector {
|
||||||
|
docs: Vec::new(),
|
||||||
|
column: segment.fast_fields().i64("doc_id").unwrap(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge_fruits(
|
||||||
|
&self,
|
||||||
|
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||||
|
) -> tantivy::Result<Self::Fruit> {
|
||||||
|
let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum();
|
||||||
|
let mut result = Vec::with_capacity(len);
|
||||||
|
for docs in segment_fruits {
|
||||||
|
for doc in docs {
|
||||||
|
result.push(doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) struct DocIdChildCollector {
|
||||||
|
docs: Vec<u32>,
|
||||||
|
column: Column<i64>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentCollector for DocIdChildCollector {
|
||||||
|
type Fruit = Vec<u32>;
|
||||||
|
|
||||||
|
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||||
|
self.column.values_for_doc(doc).for_each(|doc_id| {
|
||||||
|
self.docs.push(doc_id as u32);
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn harvest(self) -> Self::Fruit {
|
||||||
|
self.docs
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -1,62 +1,82 @@
|
|||||||
use std::ops::Bound;
|
use std::ops::Bound;
|
||||||
use std::str::FromStr;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use tantivy::directory::MmapDirectory;
|
|
||||||
use tantivy::query::{Query, RangeQuery, RegexQuery, TermQuery};
|
use tantivy::query::{Query, RangeQuery, RegexQuery, TermQuery};
|
||||||
use tantivy::schema::{Field, IndexRecordOption};
|
use tantivy::schema::{Field, IndexRecordOption};
|
||||||
use tantivy::{Index, IndexReader, ReloadPolicy, Term};
|
use tantivy::{Index, IndexReader, ReloadPolicy, Term};
|
||||||
|
|
||||||
|
use crate::docid_collector::DocIdCollector;
|
||||||
use crate::log::init_log;
|
use crate::log::init_log;
|
||||||
use crate::util::make_bounds;
|
use crate::util::make_bounds;
|
||||||
use crate::vec_collector::VecCollector;
|
use crate::vec_collector::VecCollector;
|
||||||
|
|
||||||
pub struct IndexReaderWrapper {
|
pub(crate) struct IndexReaderWrapper {
|
||||||
pub field_name: String,
|
pub(crate) field_name: String,
|
||||||
pub field: Field,
|
pub(crate) field: Field,
|
||||||
pub reader: IndexReader,
|
pub(crate) reader: IndexReader,
|
||||||
pub cnt: u32,
|
pub(crate) index: Arc<Index>,
|
||||||
|
pub(crate) id_field: Option<Field>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IndexReaderWrapper {
|
impl IndexReaderWrapper {
|
||||||
pub fn new(index: &Index, field_name: &String, field: Field) -> IndexReaderWrapper {
|
pub fn load(path: &str) -> IndexReaderWrapper {
|
||||||
init_log();
|
init_log();
|
||||||
|
|
||||||
|
let index = Index::open_in_dir(path).unwrap();
|
||||||
|
|
||||||
|
IndexReaderWrapper::from_index(Arc::new(index))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn from_index(index: Arc<Index>) -> IndexReaderWrapper {
|
||||||
|
let field = index.schema().fields().next().unwrap().0;
|
||||||
|
let schema = index.schema();
|
||||||
|
let field_name = String::from(schema.get_field_name(field));
|
||||||
|
let id_field: Option<Field> = match schema.get_field("doc_id") {
|
||||||
|
Ok(field) => Some(field),
|
||||||
|
Err(_) => None,
|
||||||
|
};
|
||||||
|
|
||||||
let reader = index
|
let reader = index
|
||||||
.reader_builder()
|
.reader_builder()
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
.reload_policy(ReloadPolicy::OnCommit) // OnCommit serve for growing segment.
|
||||||
.try_into()
|
.try_into()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let metas = index.searchable_segment_metas().unwrap();
|
reader.reload().unwrap();
|
||||||
|
|
||||||
|
IndexReaderWrapper {
|
||||||
|
field_name,
|
||||||
|
field,
|
||||||
|
reader,
|
||||||
|
index,
|
||||||
|
id_field,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn reload(&self) {
|
||||||
|
self.reader.reload().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn count(&self) -> u32 {
|
||||||
|
let metas = self.index.searchable_segment_metas().unwrap();
|
||||||
let mut sum: u32 = 0;
|
let mut sum: u32 = 0;
|
||||||
for meta in metas {
|
for meta in metas {
|
||||||
sum += meta.max_doc();
|
sum += meta.max_doc();
|
||||||
}
|
}
|
||||||
reader.reload().unwrap();
|
sum
|
||||||
IndexReaderWrapper {
|
|
||||||
field_name: field_name.to_string(),
|
|
||||||
field,
|
|
||||||
reader,
|
|
||||||
cnt: sum,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn load(path: &str) -> IndexReaderWrapper {
|
pub(crate) fn search(&self, q: &dyn Query) -> Vec<u32> {
|
||||||
let dir = MmapDirectory::open(path).unwrap();
|
|
||||||
let index = Index::open(dir).unwrap();
|
|
||||||
let field = index.schema().fields().next().unwrap().0;
|
|
||||||
let schema = index.schema();
|
|
||||||
let field_name = schema.get_field_name(field);
|
|
||||||
IndexReaderWrapper::new(&index, &String::from_str(field_name).unwrap(), field)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn count(&self) -> u32 {
|
|
||||||
self.cnt
|
|
||||||
}
|
|
||||||
|
|
||||||
fn search(&self, q: &dyn Query) -> Vec<u32> {
|
|
||||||
let searcher = self.reader.searcher();
|
let searcher = self.reader.searcher();
|
||||||
let hits = searcher.search(q, &VecCollector).unwrap();
|
match self.id_field {
|
||||||
hits
|
Some(_) => {
|
||||||
|
// newer version with doc_id.
|
||||||
|
searcher.search(q, &DocIdCollector {}).unwrap()
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// older version without doc_id, only one segment.
|
||||||
|
searcher.search(q, &VecCollector {}).unwrap()
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn term_query_i64(&self, term: i64) -> Vec<u32> {
|
pub fn term_query_i64(&self, term: i64) -> Vec<u32> {
|
||||||
|
|||||||
@ -1,23 +1,33 @@
|
|||||||
use std::ffi::CStr;
|
use std::ffi::CStr;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use futures::executor::block_on;
|
||||||
use libc::c_char;
|
use libc::c_char;
|
||||||
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED};
|
use tantivy::schema::{
|
||||||
use tantivy::{doc, tokenizer, Index, SingleSegmentIndexWriter, Document};
|
Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST, INDEXED,
|
||||||
|
};
|
||||||
|
use tantivy::{doc, tokenizer, Document, Index, IndexWriter};
|
||||||
|
|
||||||
use crate::data_type::TantivyDataType;
|
use crate::data_type::TantivyDataType;
|
||||||
|
|
||||||
|
use crate::index_reader::IndexReaderWrapper;
|
||||||
use crate::log::init_log;
|
use crate::log::init_log;
|
||||||
|
|
||||||
pub struct IndexWriterWrapper {
|
pub(crate) struct IndexWriterWrapper {
|
||||||
pub field_name: String,
|
pub(crate) field: Field,
|
||||||
pub field: Field,
|
pub(crate) index_writer: IndexWriter,
|
||||||
pub data_type: TantivyDataType,
|
pub(crate) id_field: Field,
|
||||||
pub path: String,
|
pub(crate) index: Arc<Index>,
|
||||||
pub index_writer: SingleSegmentIndexWriter,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IndexWriterWrapper {
|
impl IndexWriterWrapper {
|
||||||
pub fn new(field_name: String, data_type: TantivyDataType, path: String) -> IndexWriterWrapper {
|
pub fn new(
|
||||||
|
field_name: String,
|
||||||
|
data_type: TantivyDataType,
|
||||||
|
path: String,
|
||||||
|
num_threads: usize,
|
||||||
|
overall_memory_budget_in_bytes: usize,
|
||||||
|
) -> IndexWriterWrapper {
|
||||||
init_log();
|
init_log();
|
||||||
|
|
||||||
let field: Field;
|
let field: Field;
|
||||||
@ -42,6 +52,7 @@ impl IndexWriterWrapper {
|
|||||||
use_raw_tokenizer = true;
|
use_raw_tokenizer = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
let id_field = schema_builder.add_i64_field("doc_id", FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_dir(path.clone(), schema).unwrap();
|
let index = Index::create_in_dir(path.clone(), schema).unwrap();
|
||||||
if use_raw_tokenizer {
|
if use_raw_tokenizer {
|
||||||
@ -49,126 +60,166 @@ impl IndexWriterWrapper {
|
|||||||
.tokenizers()
|
.tokenizers()
|
||||||
.register("raw_tokenizer", tokenizer::RawTokenizer::default());
|
.register("raw_tokenizer", tokenizer::RawTokenizer::default());
|
||||||
}
|
}
|
||||||
let index_writer = SingleSegmentIndexWriter::new(index, 15 * 1024 * 1024).unwrap();
|
let index_writer = index
|
||||||
|
.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)
|
||||||
|
.unwrap();
|
||||||
IndexWriterWrapper {
|
IndexWriterWrapper {
|
||||||
field_name,
|
|
||||||
field,
|
field,
|
||||||
data_type,
|
|
||||||
path,
|
|
||||||
index_writer,
|
index_writer,
|
||||||
|
id_field,
|
||||||
|
index: Arc::new(index),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_i8(&mut self, data: i8) {
|
pub fn add_i8(&mut self, data: i8, offset: i64) {
|
||||||
self.add_i64(data.into())
|
self.add_i64(data.into(), offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_i16(&mut self, data: i16) {
|
pub fn add_i16(&mut self, data: i16, offset: i64) {
|
||||||
self.add_i64(data.into())
|
self.add_i64(data.into(), offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_i32(&mut self, data: i32) {
|
pub fn add_i32(&mut self, data: i32, offset: i64) {
|
||||||
self.add_i64(data.into())
|
self.add_i64(data.into(), offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_i64(&mut self, data: i64) {
|
pub fn add_i64(&mut self, data: i64, offset: i64) {
|
||||||
self.index_writer
|
self.index_writer
|
||||||
.add_document(doc!(self.field => data))
|
.add_document(doc!(
|
||||||
|
self.field => data,
|
||||||
|
self.id_field => offset,
|
||||||
|
))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_f32(&mut self, data: f32) {
|
pub fn add_f32(&mut self, data: f32, offset: i64) {
|
||||||
self.add_f64(data.into())
|
self.add_f64(data.into(), offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_f64(&mut self, data: f64) {
|
pub fn add_f64(&mut self, data: f64, offset: i64) {
|
||||||
self.index_writer
|
self.index_writer
|
||||||
.add_document(doc!(self.field => data))
|
.add_document(doc!(
|
||||||
|
self.field => data,
|
||||||
|
self.id_field => offset,
|
||||||
|
))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_bool(&mut self, data: bool) {
|
pub fn add_bool(&mut self, data: bool, offset: i64) {
|
||||||
self.index_writer
|
self.index_writer
|
||||||
.add_document(doc!(self.field => data))
|
.add_document(doc!(
|
||||||
|
self.field => data,
|
||||||
|
self.id_field => offset,
|
||||||
|
))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_keyword(&mut self, data: &str) {
|
pub fn add_string(&mut self, data: &str, offset: i64) {
|
||||||
self.index_writer
|
self.index_writer
|
||||||
.add_document(doc!(self.field => data))
|
.add_document(doc!(
|
||||||
|
self.field => data,
|
||||||
|
self.id_field => offset,
|
||||||
|
))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_i8s(&mut self, datas: &[i8]) {
|
pub fn add_multi_i8s(&mut self, datas: &[i8], offset: i64) {
|
||||||
let mut document = Document::default();
|
let mut document = Document::default();
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data as i64);
|
document.add_field_value(self.field, *data as i64);
|
||||||
}
|
}
|
||||||
|
document.add_i64(self.id_field, offset);
|
||||||
self.index_writer.add_document(document).unwrap();
|
self.index_writer.add_document(document).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_i16s(&mut self, datas: &[i16]) {
|
pub fn add_multi_i16s(&mut self, datas: &[i16], offset: i64) {
|
||||||
let mut document = Document::default();
|
let mut document = Document::default();
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data as i64);
|
document.add_field_value(self.field, *data as i64);
|
||||||
}
|
}
|
||||||
|
document.add_i64(self.id_field, offset);
|
||||||
self.index_writer.add_document(document).unwrap();
|
self.index_writer.add_document(document).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_i32s(&mut self, datas: &[i32]) {
|
pub fn add_multi_i32s(&mut self, datas: &[i32], offset: i64) {
|
||||||
let mut document = Document::default();
|
let mut document = Document::default();
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data as i64);
|
document.add_field_value(self.field, *data as i64);
|
||||||
}
|
}
|
||||||
|
document.add_i64(self.id_field, offset);
|
||||||
self.index_writer.add_document(document).unwrap();
|
self.index_writer.add_document(document).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_i64s(&mut self, datas: &[i64]) {
|
pub fn add_multi_i64s(&mut self, datas: &[i64], offset: i64) {
|
||||||
let mut document = Document::default();
|
let mut document = Document::default();
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data);
|
document.add_field_value(self.field, *data);
|
||||||
}
|
}
|
||||||
|
document.add_i64(self.id_field, offset);
|
||||||
self.index_writer.add_document(document).unwrap();
|
self.index_writer.add_document(document).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_f32s(&mut self, datas: &[f32]) {
|
pub fn add_multi_f32s(&mut self, datas: &[f32], offset: i64) {
|
||||||
let mut document = Document::default();
|
let mut document = Document::default();
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data as f64);
|
document.add_field_value(self.field, *data as f64);
|
||||||
}
|
}
|
||||||
|
document.add_i64(self.id_field, offset);
|
||||||
self.index_writer.add_document(document).unwrap();
|
self.index_writer.add_document(document).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_f64s(&mut self, datas: &[f64]) {
|
pub fn add_multi_f64s(&mut self, datas: &[f64], offset: i64) {
|
||||||
let mut document = Document::default();
|
let mut document = Document::default();
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data);
|
document.add_field_value(self.field, *data);
|
||||||
}
|
}
|
||||||
|
document.add_i64(self.id_field, offset);
|
||||||
self.index_writer.add_document(document).unwrap();
|
self.index_writer.add_document(document).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_bools(&mut self, datas: &[bool]) {
|
pub fn add_multi_bools(&mut self, datas: &[bool], offset: i64) {
|
||||||
let mut document = Document::default();
|
let mut document = Document::default();
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data);
|
document.add_field_value(self.field, *data);
|
||||||
}
|
}
|
||||||
|
document.add_i64(self.id_field, offset);
|
||||||
self.index_writer.add_document(document).unwrap();
|
self.index_writer.add_document(document).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_keywords(&mut self, datas: &[*const c_char]) {
|
pub fn add_multi_keywords(&mut self, datas: &[*const c_char], offset: i64) {
|
||||||
let mut document = Document::default();
|
let mut document = Document::default();
|
||||||
for element in datas {
|
for element in datas {
|
||||||
let data = unsafe {
|
let data = unsafe { CStr::from_ptr(*element) };
|
||||||
CStr::from_ptr(*element)
|
|
||||||
};
|
|
||||||
document.add_field_value(self.field, data.to_str().unwrap());
|
document.add_field_value(self.field, data.to_str().unwrap());
|
||||||
}
|
}
|
||||||
|
document.add_i64(self.id_field, offset);
|
||||||
self.index_writer.add_document(document).unwrap();
|
self.index_writer.add_document(document).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn finish(self) {
|
fn manual_merge(&mut self) {
|
||||||
self.index_writer
|
let metas = self
|
||||||
.finalize()
|
.index_writer
|
||||||
.expect("failed to build inverted index");
|
.index()
|
||||||
|
.searchable_segment_metas()
|
||||||
|
.unwrap();
|
||||||
|
let policy = self.index_writer.get_merge_policy();
|
||||||
|
let candidates = policy.compute_merge_candidates(metas.as_slice());
|
||||||
|
for candidate in candidates {
|
||||||
|
self.index_writer
|
||||||
|
.merge(candidate.0.as_slice())
|
||||||
|
.wait()
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn finish(mut self) {
|
||||||
|
self.index_writer.commit().unwrap();
|
||||||
|
// self.manual_merge();
|
||||||
|
block_on(self.index_writer.garbage_collect_files()).unwrap();
|
||||||
|
self.index_writer.wait_merging_threads().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn commit(&mut self) {
|
||||||
|
self.index_writer.commit().unwrap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -12,6 +12,8 @@ pub extern "C" fn tantivy_create_index(
|
|||||||
field_name: *const c_char,
|
field_name: *const c_char,
|
||||||
data_type: TantivyDataType,
|
data_type: TantivyDataType,
|
||||||
path: *const c_char,
|
path: *const c_char,
|
||||||
|
num_threads: usize,
|
||||||
|
overall_memory_budget_in_bytes: usize,
|
||||||
) -> *mut c_void {
|
) -> *mut c_void {
|
||||||
let field_name_str = unsafe { CStr::from_ptr(field_name) };
|
let field_name_str = unsafe { CStr::from_ptr(field_name) };
|
||||||
let path_str = unsafe { CStr::from_ptr(path) };
|
let path_str = unsafe { CStr::from_ptr(path) };
|
||||||
@ -19,6 +21,8 @@ pub extern "C" fn tantivy_create_index(
|
|||||||
String::from(field_name_str.to_str().unwrap()),
|
String::from(field_name_str.to_str().unwrap()),
|
||||||
data_type,
|
data_type,
|
||||||
String::from(path_str.to_str().unwrap()),
|
String::from(path_str.to_str().unwrap()),
|
||||||
|
num_threads,
|
||||||
|
overall_memory_budget_in_bytes,
|
||||||
);
|
);
|
||||||
create_binding(wrapper)
|
create_binding(wrapper)
|
||||||
}
|
}
|
||||||
@ -38,78 +42,113 @@ pub extern "C" fn tantivy_finish_index(ptr: *mut c_void) {
|
|||||||
|
|
||||||
// -------------------------build--------------------
|
// -------------------------build--------------------
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_int8s(ptr: *mut c_void, array: *const i8, len: usize) {
|
pub extern "C" fn tantivy_index_add_int8s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i8,
|
||||||
|
len: usize,
|
||||||
|
offset_begin: i64,
|
||||||
|
) {
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
unsafe {
|
unsafe {
|
||||||
for data in arr {
|
for (index, data) in arr.iter().enumerate() {
|
||||||
(*real).add_i8(*data);
|
(*real).add_i8(*data, offset_begin + (index as i64));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_int16s(ptr: *mut c_void, array: *const i16, len: usize) {
|
pub extern "C" fn tantivy_index_add_int16s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i16,
|
||||||
|
len: usize,
|
||||||
|
offset_begin: i64,
|
||||||
|
) {
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
unsafe {
|
unsafe {
|
||||||
for data in arr {
|
for (index, data) in arr.iter().enumerate() {
|
||||||
(*real).add_i16(*data);
|
(*real).add_i16(*data, offset_begin + (index as i64));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_int32s(ptr: *mut c_void, array: *const i32, len: usize) {
|
pub extern "C" fn tantivy_index_add_int32s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i32,
|
||||||
|
len: usize,
|
||||||
|
offset_begin: i64,
|
||||||
|
) {
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
unsafe {
|
unsafe {
|
||||||
for data in arr {
|
for (index, data) in arr.iter().enumerate() {
|
||||||
(*real).add_i32(*data);
|
(*real).add_i32(*data, offset_begin + (index as i64));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_int64s(ptr: *mut c_void, array: *const i64, len: usize) {
|
pub extern "C" fn tantivy_index_add_int64s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i64,
|
||||||
|
len: usize,
|
||||||
|
offset_begin: i64,
|
||||||
|
) {
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
unsafe {
|
unsafe {
|
||||||
for data in arr {
|
for (index, data) in arr.iter().enumerate() {
|
||||||
(*real).add_i64(*data);
|
(*real).add_i64(*data, offset_begin + (index as i64));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_f32s(ptr: *mut c_void, array: *const f32, len: usize) {
|
pub extern "C" fn tantivy_index_add_f32s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const f32,
|
||||||
|
len: usize,
|
||||||
|
offset_begin: i64,
|
||||||
|
) {
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
unsafe {
|
unsafe {
|
||||||
for data in arr {
|
for (index, data) in arr.iter().enumerate() {
|
||||||
(*real).add_f32(*data);
|
(*real).add_f32(*data, offset_begin + (index as i64));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_f64s(ptr: *mut c_void, array: *const f64, len: usize) {
|
pub extern "C" fn tantivy_index_add_f64s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const f64,
|
||||||
|
len: usize,
|
||||||
|
offset_begin: i64,
|
||||||
|
) {
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
unsafe {
|
unsafe {
|
||||||
for data in arr {
|
for (index, data) in arr.iter().enumerate() {
|
||||||
(*real).add_f64(*data);
|
(*real).add_f64(*data, offset_begin + (index as i64));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_bools(ptr: *mut c_void, array: *const bool, len: usize) {
|
pub extern "C" fn tantivy_index_add_bools(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const bool,
|
||||||
|
len: usize,
|
||||||
|
offset_begin: i64,
|
||||||
|
) {
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
unsafe {
|
unsafe {
|
||||||
for data in arr {
|
for (index, data) in arr.iter().enumerate() {
|
||||||
(*real).add_bool(*data);
|
(*real).add_bool(*data, offset_begin + (index as i64));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -117,82 +156,122 @@ pub extern "C" fn tantivy_index_add_bools(ptr: *mut c_void, array: *const bool,
|
|||||||
// TODO: this is not a very efficient way, since we must call this function many times, which
|
// TODO: this is not a very efficient way, since we must call this function many times, which
|
||||||
// will bring a lot of overhead caused by the rust binding.
|
// will bring a lot of overhead caused by the rust binding.
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_keyword(ptr: *mut c_void, s: *const c_char) {
|
pub extern "C" fn tantivy_index_add_string(ptr: *mut c_void, s: *const c_char, offset: i64) {
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
let c_str = unsafe { CStr::from_ptr(s) };
|
let c_str = unsafe { CStr::from_ptr(s) };
|
||||||
unsafe { (*real).add_keyword(c_str.to_str().unwrap()) }
|
unsafe { (*real).add_string(c_str.to_str().unwrap(), offset) }
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------- array ------------------------------------------
|
// --------------------------------------------- array ------------------------------------------
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_multi_int8s(ptr: *mut c_void, array: *const i8, len: usize) {
|
pub extern "C" fn tantivy_index_add_multi_int8s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i8,
|
||||||
|
len: usize,
|
||||||
|
offset: i64,
|
||||||
|
) {
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
unsafe {
|
unsafe {
|
||||||
let arr = slice::from_raw_parts(array, len);
|
let arr = slice::from_raw_parts(array, len);
|
||||||
(*real).add_multi_i8s(arr)
|
(*real).add_multi_i8s(arr, offset)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_multi_int16s(ptr: *mut c_void, array: *const i16, len: usize) {
|
pub extern "C" fn tantivy_index_add_multi_int16s(
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
ptr: *mut c_void,
|
||||||
unsafe {
|
array: *const i16,
|
||||||
let arr = slice::from_raw_parts(array, len) ;
|
len: usize,
|
||||||
(*real).add_multi_i16s(arr);
|
offset: i64,
|
||||||
}
|
) {
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn tantivy_index_add_multi_int32s(ptr: *mut c_void, array: *const i32, len: usize) {
|
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
|
||||||
unsafe {
|
|
||||||
let arr = slice::from_raw_parts(array, len) ;
|
|
||||||
(*real).add_multi_i32s(arr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn tantivy_index_add_multi_int64s(ptr: *mut c_void, array: *const i64, len: usize) {
|
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
|
||||||
unsafe {
|
|
||||||
let arr = slice::from_raw_parts(array, len) ;
|
|
||||||
(*real).add_multi_i64s(arr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn tantivy_index_add_multi_f32s(ptr: *mut c_void, array: *const f32, len: usize) {
|
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
|
||||||
unsafe {
|
|
||||||
let arr = slice::from_raw_parts(array, len) ;
|
|
||||||
(*real).add_multi_f32s(arr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn tantivy_index_add_multi_f64s(ptr: *mut c_void, array: *const f64, len: usize) {
|
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
|
||||||
unsafe {
|
|
||||||
let arr = slice::from_raw_parts(array, len) ;
|
|
||||||
(*real).add_multi_f64s(arr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn tantivy_index_add_multi_bools(ptr: *mut c_void, array: *const bool, len: usize) {
|
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
|
||||||
unsafe {
|
|
||||||
let arr = slice::from_raw_parts(array, len) ;
|
|
||||||
(*real).add_multi_bools(arr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn tantivy_index_add_multi_keywords(ptr: *mut c_void, array: *const *const c_char, len: usize) {
|
|
||||||
let real = ptr as *mut IndexWriterWrapper;
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
unsafe {
|
unsafe {
|
||||||
let arr = slice::from_raw_parts(array, len);
|
let arr = slice::from_raw_parts(array, len);
|
||||||
(*real).add_multi_keywords(arr)
|
(*real).add_multi_i16s(arr, offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_int32s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i32,
|
||||||
|
len: usize,
|
||||||
|
offset: i64,
|
||||||
|
) {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = slice::from_raw_parts(array, len);
|
||||||
|
(*real).add_multi_i32s(arr, offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_int64s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i64,
|
||||||
|
len: usize,
|
||||||
|
offset: i64,
|
||||||
|
) {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = slice::from_raw_parts(array, len);
|
||||||
|
(*real).add_multi_i64s(arr, offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_f32s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const f32,
|
||||||
|
len: usize,
|
||||||
|
offset: i64,
|
||||||
|
) {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = slice::from_raw_parts(array, len);
|
||||||
|
(*real).add_multi_f32s(arr, offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_f64s(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const f64,
|
||||||
|
len: usize,
|
||||||
|
offset: i64,
|
||||||
|
) {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = slice::from_raw_parts(array, len);
|
||||||
|
(*real).add_multi_f64s(arr, offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_bools(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const bool,
|
||||||
|
len: usize,
|
||||||
|
offset: i64,
|
||||||
|
) {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = slice::from_raw_parts(array, len);
|
||||||
|
(*real).add_multi_bools(arr, offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_keywords(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const *const c_char,
|
||||||
|
len: usize,
|
||||||
|
offset: i64,
|
||||||
|
) {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = slice::from_raw_parts(array, len);
|
||||||
|
(*real).add_multi_keywords(arr, offset)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
mod array;
|
mod array;
|
||||||
mod data_type;
|
mod data_type;
|
||||||
|
mod demo_c;
|
||||||
|
mod docid_collector;
|
||||||
mod hashset_collector;
|
mod hashset_collector;
|
||||||
mod index_reader;
|
mod index_reader;
|
||||||
mod index_reader_c;
|
mod index_reader_c;
|
||||||
@ -10,7 +12,6 @@ mod log;
|
|||||||
mod util;
|
mod util;
|
||||||
mod util_c;
|
mod util_c;
|
||||||
mod vec_collector;
|
mod vec_collector;
|
||||||
mod demo_c;
|
|
||||||
|
|
||||||
pub fn add(left: usize, right: usize) -> usize {
|
pub fn add(left: usize, right: usize) -> usize {
|
||||||
left + right
|
left + right
|
||||||
|
|||||||
136
internal/core/thirdparty/tantivy/tantivy-wrapper.h
vendored
136
internal/core/thirdparty/tantivy/tantivy-wrapper.h
vendored
@ -1,66 +1,18 @@
|
|||||||
|
#include <assert.h>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <fmt/format.h>
|
#include <fmt/format.h>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
#include "tantivy-binding.h"
|
#include "tantivy-binding.h"
|
||||||
|
#include "rust-binding.h"
|
||||||
|
#include "rust-array.h"
|
||||||
|
|
||||||
namespace milvus::tantivy {
|
namespace milvus::tantivy {
|
||||||
struct RustArrayWrapper {
|
static constexpr uintptr_t DEFAULT_NUM_THREADS = 4;
|
||||||
explicit RustArrayWrapper(RustArray array) : array_(array) {
|
static constexpr uintptr_t DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES =
|
||||||
}
|
DEFAULT_NUM_THREADS * 15 * 1024 * 1024;
|
||||||
|
|
||||||
RustArrayWrapper(RustArrayWrapper&) = delete;
|
|
||||||
RustArrayWrapper&
|
|
||||||
operator=(RustArrayWrapper&) = delete;
|
|
||||||
|
|
||||||
RustArrayWrapper(RustArrayWrapper&& other) noexcept {
|
|
||||||
array_.array = other.array_.array;
|
|
||||||
array_.len = other.array_.len;
|
|
||||||
array_.cap = other.array_.cap;
|
|
||||||
other.array_.array = nullptr;
|
|
||||||
other.array_.len = 0;
|
|
||||||
other.array_.cap = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
RustArrayWrapper&
|
|
||||||
operator=(RustArrayWrapper&& other) noexcept {
|
|
||||||
if (this != &other) {
|
|
||||||
free();
|
|
||||||
array_.array = other.array_.array;
|
|
||||||
array_.len = other.array_.len;
|
|
||||||
array_.cap = other.array_.cap;
|
|
||||||
other.array_.array = nullptr;
|
|
||||||
other.array_.len = 0;
|
|
||||||
other.array_.cap = 0;
|
|
||||||
}
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
~RustArrayWrapper() {
|
|
||||||
free();
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
debug() {
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << "[ ";
|
|
||||||
for (int i = 0; i < array_.len; i++) {
|
|
||||||
ss << array_.array[i] << " ";
|
|
||||||
}
|
|
||||||
ss << "]";
|
|
||||||
std::cout << ss.str() << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
RustArray array_;
|
|
||||||
|
|
||||||
private:
|
|
||||||
void
|
|
||||||
free() {
|
|
||||||
if (array_.array != nullptr) {
|
|
||||||
free_rust_array(array_);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline TantivyDataType
|
inline TantivyDataType
|
||||||
@ -81,15 +33,14 @@ guess_data_type() {
|
|||||||
typeid(T).name());
|
typeid(T).name());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: should split this into IndexWriter & IndexReader.
|
||||||
struct TantivyIndexWrapper {
|
struct TantivyIndexWrapper {
|
||||||
using IndexWriter = void*;
|
using IndexWriter = void*;
|
||||||
using IndexReader = void*;
|
using IndexReader = void*;
|
||||||
|
|
||||||
TantivyIndexWrapper() = default;
|
NO_COPY_OR_ASSIGN(TantivyIndexWrapper);
|
||||||
|
|
||||||
TantivyIndexWrapper(TantivyIndexWrapper&) = delete;
|
TantivyIndexWrapper() = default;
|
||||||
TantivyIndexWrapper&
|
|
||||||
operator=(TantivyIndexWrapper&) = delete;
|
|
||||||
|
|
||||||
TantivyIndexWrapper(TantivyIndexWrapper&& other) noexcept {
|
TantivyIndexWrapper(TantivyIndexWrapper&& other) noexcept {
|
||||||
writer_ = other.writer_;
|
writer_ = other.writer_;
|
||||||
@ -120,11 +71,19 @@ struct TantivyIndexWrapper {
|
|||||||
|
|
||||||
TantivyIndexWrapper(const char* field_name,
|
TantivyIndexWrapper(const char* field_name,
|
||||||
TantivyDataType data_type,
|
TantivyDataType data_type,
|
||||||
const char* path) {
|
const char* path,
|
||||||
writer_ = tantivy_create_index(field_name, data_type, path);
|
uintptr_t num_threads = DEFAULT_NUM_THREADS,
|
||||||
|
uintptr_t overall_memory_budget_in_bytes =
|
||||||
|
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
|
||||||
|
writer_ = tantivy_create_index(field_name,
|
||||||
|
data_type,
|
||||||
|
path,
|
||||||
|
num_threads,
|
||||||
|
overall_memory_budget_in_bytes);
|
||||||
path_ = std::string(path);
|
path_ = std::string(path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// load index. create index reader.
|
||||||
explicit TantivyIndexWrapper(const char* path) {
|
explicit TantivyIndexWrapper(const char* path) {
|
||||||
assert(tantivy_index_exist(path));
|
assert(tantivy_index_exist(path));
|
||||||
reader_ = tantivy_load_index(path);
|
reader_ = tantivy_load_index(path);
|
||||||
@ -137,49 +96,51 @@ struct TantivyIndexWrapper {
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void
|
void
|
||||||
add_data(const T* array, uintptr_t len) {
|
add_data(const T* array, uintptr_t len, int64_t offset_begin) {
|
||||||
assert(!finished_);
|
assert(!finished_);
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, bool>) {
|
if constexpr (std::is_same_v<T, bool>) {
|
||||||
tantivy_index_add_bools(writer_, array, len);
|
tantivy_index_add_bools(writer_, array, len, offset_begin);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, int8_t>) {
|
if constexpr (std::is_same_v<T, int8_t>) {
|
||||||
tantivy_index_add_int8s(writer_, array, len);
|
tantivy_index_add_int8s(writer_, array, len, offset_begin);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, int16_t>) {
|
if constexpr (std::is_same_v<T, int16_t>) {
|
||||||
tantivy_index_add_int16s(writer_, array, len);
|
tantivy_index_add_int16s(writer_, array, len, offset_begin);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, int32_t>) {
|
if constexpr (std::is_same_v<T, int32_t>) {
|
||||||
tantivy_index_add_int32s(writer_, array, len);
|
tantivy_index_add_int32s(writer_, array, len, offset_begin);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, int64_t>) {
|
if constexpr (std::is_same_v<T, int64_t>) {
|
||||||
tantivy_index_add_int64s(writer_, array, len);
|
tantivy_index_add_int64s(writer_, array, len, offset_begin);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, float>) {
|
if constexpr (std::is_same_v<T, float>) {
|
||||||
tantivy_index_add_f32s(writer_, array, len);
|
tantivy_index_add_f32s(writer_, array, len, offset_begin);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, double>) {
|
if constexpr (std::is_same_v<T, double>) {
|
||||||
tantivy_index_add_f64s(writer_, array, len);
|
tantivy_index_add_f64s(writer_, array, len, offset_begin);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, std::string>) {
|
if constexpr (std::is_same_v<T, std::string>) {
|
||||||
// TODO: not very efficient, a lot of overhead due to rust-ffi call.
|
// TODO: not very efficient, a lot of overhead due to rust-ffi call.
|
||||||
for (uintptr_t i = 0; i < len; i++) {
|
for (uintptr_t i = 0; i < len; i++) {
|
||||||
tantivy_index_add_keyword(
|
tantivy_index_add_string(
|
||||||
writer_, static_cast<const std::string*>(array)[i].c_str());
|
writer_,
|
||||||
|
static_cast<const std::string*>(array)[i].c_str(),
|
||||||
|
offset_begin + i);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -190,41 +151,41 @@ struct TantivyIndexWrapper {
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void
|
void
|
||||||
add_multi_data(const T* array, uintptr_t len) {
|
add_multi_data(const T* array, uintptr_t len, int64_t offset) {
|
||||||
assert(!finished_);
|
assert(!finished_);
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, bool>) {
|
if constexpr (std::is_same_v<T, bool>) {
|
||||||
tantivy_index_add_multi_bools(writer_, array, len);
|
tantivy_index_add_multi_bools(writer_, array, len, offset);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, int8_t>) {
|
if constexpr (std::is_same_v<T, int8_t>) {
|
||||||
tantivy_index_add_multi_int8s(writer_, array, len);
|
tantivy_index_add_multi_int8s(writer_, array, len, offset);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, int16_t>) {
|
if constexpr (std::is_same_v<T, int16_t>) {
|
||||||
tantivy_index_add_multi_int16s(writer_, array, len);
|
tantivy_index_add_multi_int16s(writer_, array, len, offset);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, int32_t>) {
|
if constexpr (std::is_same_v<T, int32_t>) {
|
||||||
tantivy_index_add_multi_int32s(writer_, array, len);
|
tantivy_index_add_multi_int32s(writer_, array, len, offset);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, int64_t>) {
|
if constexpr (std::is_same_v<T, int64_t>) {
|
||||||
tantivy_index_add_multi_int64s(writer_, array, len);
|
tantivy_index_add_multi_int64s(writer_, array, len, offset);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, float>) {
|
if constexpr (std::is_same_v<T, float>) {
|
||||||
tantivy_index_add_multi_f32s(writer_, array, len);
|
tantivy_index_add_multi_f32s(writer_, array, len, offset);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (std::is_same_v<T, double>) {
|
if constexpr (std::is_same_v<T, double>) {
|
||||||
tantivy_index_add_multi_f64s(writer_, array, len);
|
tantivy_index_add_multi_f64s(writer_, array, len, offset);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -233,7 +194,8 @@ struct TantivyIndexWrapper {
|
|||||||
for (uintptr_t i = 0; i < len; i++) {
|
for (uintptr_t i = 0; i < len; i++) {
|
||||||
views.push_back(array[i].c_str());
|
views.push_back(array[i].c_str());
|
||||||
}
|
}
|
||||||
tantivy_index_add_multi_keywords(writer_, views.data(), len);
|
tantivy_index_add_multi_keywords(
|
||||||
|
writer_, views.data(), len, offset);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -244,12 +206,14 @@ struct TantivyIndexWrapper {
|
|||||||
|
|
||||||
inline void
|
inline void
|
||||||
finish() {
|
finish() {
|
||||||
if (!finished_) {
|
if (finished_) {
|
||||||
tantivy_finish_index(writer_);
|
return;
|
||||||
writer_ = nullptr;
|
|
||||||
reader_ = tantivy_load_index(path_.c_str());
|
|
||||||
finished_ = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tantivy_finish_index(writer_);
|
||||||
|
writer_ = nullptr;
|
||||||
|
reader_ = tantivy_load_index(path_.c_str());
|
||||||
|
finished_ = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline uint32_t
|
inline uint32_t
|
||||||
|
|||||||
14
internal/core/thirdparty/tantivy/test.cpp
vendored
14
internal/core/thirdparty/tantivy/test.cpp
vendored
@ -33,7 +33,7 @@ run() {
|
|||||||
T arr[] = {1, 2, 3, 4, 5, 6};
|
T arr[] = {1, 2, 3, 4, 5, 6};
|
||||||
auto l = sizeof(arr) / sizeof(T);
|
auto l = sizeof(arr) / sizeof(T);
|
||||||
|
|
||||||
w.add_data(arr, l);
|
w.add_data(arr, l, 0);
|
||||||
|
|
||||||
w.finish();
|
w.finish();
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ run<bool>() {
|
|||||||
bool arr[] = {true, false, false, true, false, true};
|
bool arr[] = {true, false, false, true, false, true};
|
||||||
auto l = sizeof(arr) / sizeof(bool);
|
auto l = sizeof(arr) / sizeof(bool);
|
||||||
|
|
||||||
w.add_data(arr, l);
|
w.add_data(arr, l, 0);
|
||||||
|
|
||||||
w.finish();
|
w.finish();
|
||||||
|
|
||||||
@ -118,7 +118,7 @@ run<std::string>() {
|
|||||||
std::vector<std::string> arr = {"a", "b", "aaa", "abbb"};
|
std::vector<std::string> arr = {"a", "b", "aaa", "abbb"};
|
||||||
auto l = arr.size();
|
auto l = arr.size();
|
||||||
|
|
||||||
w.add_data<std::string>(arr.data(), l);
|
w.add_data<std::string>(arr.data(), l, 0);
|
||||||
|
|
||||||
w.finish();
|
w.finish();
|
||||||
|
|
||||||
@ -188,7 +188,7 @@ test_32717() {
|
|||||||
inverted[n].insert(i);
|
inverted[n].insert(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
w.add_data(arr.data(), l);
|
w.add_data(arr.data(), l, 0);
|
||||||
w.finish();
|
w.finish();
|
||||||
assert(w.count() == l);
|
assert(w.count() == l);
|
||||||
|
|
||||||
@ -233,8 +233,9 @@ test_array_int() {
|
|||||||
{10, 50, 60},
|
{10, 50, 60},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
int64_t offset = 0;
|
||||||
for (const auto& arr : vec_of_array) {
|
for (const auto& arr : vec_of_array) {
|
||||||
w.add_multi_data(arr.data(), arr.size());
|
w.add_multi_data(arr.data(), arr.size(), offset++);
|
||||||
}
|
}
|
||||||
w.finish();
|
w.finish();
|
||||||
|
|
||||||
@ -263,8 +264,9 @@ test_array_string() {
|
|||||||
{"10", "50", "60"},
|
{"10", "50", "60"},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
int64_t offset = 0;
|
||||||
for (const auto& arr : vec_of_array) {
|
for (const auto& arr : vec_of_array) {
|
||||||
w.add_multi_data(arr.data(), arr.size());
|
w.add_multi_data(arr.data(), arr.size(), offset++);
|
||||||
}
|
}
|
||||||
w.finish();
|
w.finish();
|
||||||
|
|
||||||
|
|||||||
@ -230,14 +230,16 @@ class ArrayBitmapIndexTest : public testing::Test {
|
|||||||
config["insert_files"] = std::vector<std::string>{log_path};
|
config["insert_files"] = std::vector<std::string>{log_path};
|
||||||
config["bitmap_cardinality_limit"] = "100";
|
config["bitmap_cardinality_limit"] = "100";
|
||||||
|
|
||||||
auto build_index =
|
{
|
||||||
indexbuilder::IndexFactory::GetInstance().CreateIndex(
|
auto build_index =
|
||||||
DataType::ARRAY, config, ctx);
|
indexbuilder::IndexFactory::GetInstance().CreateIndex(
|
||||||
build_index->Build();
|
DataType::ARRAY, config, ctx);
|
||||||
|
build_index->Build();
|
||||||
|
|
||||||
auto binary_set = build_index->Upload();
|
auto binary_set = build_index->Upload();
|
||||||
for (const auto& [key, _] : binary_set.binary_map_) {
|
for (const auto& [key, _] : binary_set.binary_map_) {
|
||||||
index_files.push_back(key);
|
index_files.push_back(key);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
index::CreateIndexInfo index_info{};
|
index::CreateIndexInfo index_info{};
|
||||||
|
|||||||
@ -143,14 +143,16 @@ class HybridIndexTestV1 : public testing::Test {
|
|||||||
config["insert_files"] = std::vector<std::string>{log_path};
|
config["insert_files"] = std::vector<std::string>{log_path};
|
||||||
config["bitmap_cardinality_limit"] = "1000";
|
config["bitmap_cardinality_limit"] = "1000";
|
||||||
|
|
||||||
auto build_index =
|
{
|
||||||
indexbuilder::IndexFactory::GetInstance().CreateIndex(
|
auto build_index =
|
||||||
type_, config, ctx);
|
indexbuilder::IndexFactory::GetInstance().CreateIndex(
|
||||||
build_index->Build();
|
type_, config, ctx);
|
||||||
|
build_index->Build();
|
||||||
|
|
||||||
auto binary_set = build_index->Upload();
|
auto binary_set = build_index->Upload();
|
||||||
for (const auto& [key, _] : binary_set.binary_map_) {
|
for (const auto& [key, _] : binary_set.binary_map_) {
|
||||||
index_files.push_back(key);
|
index_files.push_back(key);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
index::CreateIndexInfo index_info{};
|
index::CreateIndexInfo index_info{};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user