mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-08 01:58:34 +08:00
fix: make sure inverted index has only one segment (#32858)
issue: #32717 --------- Signed-off-by: longjiquan <jiquan.long@zilliz.com>
This commit is contained in:
parent
5037497929
commit
035a508722
98
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
generated
vendored
98
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
generated
vendored
@ -29,6 +29,55 @@ version = "0.2.16"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
|
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstream"
|
||||||
|
version = "0.6.14"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
|
||||||
|
dependencies = [
|
||||||
|
"anstyle",
|
||||||
|
"anstyle-parse",
|
||||||
|
"anstyle-query",
|
||||||
|
"anstyle-wincon",
|
||||||
|
"colorchoice",
|
||||||
|
"is_terminal_polyfill",
|
||||||
|
"utf8parse",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle"
|
||||||
|
version = "1.0.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-parse"
|
||||||
|
version = "0.2.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
|
||||||
|
dependencies = [
|
||||||
|
"utf8parse",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-query"
|
||||||
|
version = "1.0.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5"
|
||||||
|
dependencies = [
|
||||||
|
"windows-sys 0.52.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-wincon"
|
||||||
|
version = "3.0.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
|
||||||
|
dependencies = [
|
||||||
|
"anstyle",
|
||||||
|
"windows-sys 0.52.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "arc-swap"
|
name = "arc-swap"
|
||||||
version = "1.7.1"
|
version = "1.7.1"
|
||||||
@ -167,6 +216,12 @@ dependencies = [
|
|||||||
"os_str_bytes",
|
"os_str_bytes",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "colorchoice"
|
||||||
|
version = "1.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crc32fast"
|
name = "crc32fast"
|
||||||
version = "1.4.0"
|
version = "1.4.0"
|
||||||
@ -238,6 +293,29 @@ version = "1.10.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
|
checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "env_filter"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea"
|
||||||
|
dependencies = [
|
||||||
|
"log",
|
||||||
|
"regex",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "env_logger"
|
||||||
|
version = "0.11.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9"
|
||||||
|
dependencies = [
|
||||||
|
"anstream",
|
||||||
|
"anstyle",
|
||||||
|
"env_filter",
|
||||||
|
"humantime",
|
||||||
|
"log",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "errno"
|
name = "errno"
|
||||||
version = "0.3.8"
|
version = "0.3.8"
|
||||||
@ -432,6 +510,12 @@ version = "0.3.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163"
|
checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "humantime"
|
||||||
|
version = "2.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "indexmap"
|
name = "indexmap"
|
||||||
version = "1.9.3"
|
version = "1.9.3"
|
||||||
@ -454,6 +538,12 @@ dependencies = [
|
|||||||
"web-sys",
|
"web-sys",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "is_terminal_polyfill"
|
||||||
|
version = "1.70.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itertools"
|
name = "itertools"
|
||||||
version = "0.11.0"
|
version = "0.11.0"
|
||||||
@ -978,8 +1068,10 @@ name = "tantivy-binding"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cbindgen",
|
"cbindgen",
|
||||||
|
"env_logger",
|
||||||
"futures",
|
"futures",
|
||||||
"libc",
|
"libc",
|
||||||
|
"log",
|
||||||
"scopeguard",
|
"scopeguard",
|
||||||
"tantivy",
|
"tantivy",
|
||||||
"zstd-sys",
|
"zstd-sys",
|
||||||
@ -1243,6 +1335,12 @@ version = "1.0.5"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba"
|
checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8parse"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "uuid"
|
name = "uuid"
|
||||||
version = "1.8.0"
|
version = "1.8.0"
|
||||||
|
|||||||
@ -11,6 +11,8 @@ futures = "0.3.21"
|
|||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
scopeguard = "1.2"
|
scopeguard = "1.2"
|
||||||
zstd-sys = "=2.0.9"
|
zstd-sys = "=2.0.9"
|
||||||
|
env_logger = "0.11.3"
|
||||||
|
log = "0.4.21"
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
cbindgen = "0.26.0"
|
cbindgen = "0.26.0"
|
||||||
|
|||||||
@ -6,6 +6,7 @@ use tantivy::query::{Query, RangeQuery, RegexQuery, TermQuery};
|
|||||||
use tantivy::schema::{Field, IndexRecordOption};
|
use tantivy::schema::{Field, IndexRecordOption};
|
||||||
use tantivy::{Index, IndexReader, ReloadPolicy, Term};
|
use tantivy::{Index, IndexReader, ReloadPolicy, Term};
|
||||||
|
|
||||||
|
use crate::log::init_log;
|
||||||
use crate::util::make_bounds;
|
use crate::util::make_bounds;
|
||||||
use crate::vec_collector::VecCollector;
|
use crate::vec_collector::VecCollector;
|
||||||
|
|
||||||
@ -18,6 +19,8 @@ pub struct IndexReaderWrapper {
|
|||||||
|
|
||||||
impl IndexReaderWrapper {
|
impl IndexReaderWrapper {
|
||||||
pub fn new(index: &Index, field_name: &String, field: Field) -> IndexReaderWrapper {
|
pub fn new(index: &Index, field_name: &String, field: Field) -> IndexReaderWrapper {
|
||||||
|
init_log();
|
||||||
|
|
||||||
let reader = index
|
let reader = index
|
||||||
.reader_builder()
|
.reader_builder()
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
.reload_policy(ReloadPolicy::Manual)
|
||||||
|
|||||||
@ -212,10 +212,7 @@ pub extern "C" fn tantivy_prefix_query_keyword(
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_regex_query(
|
pub extern "C" fn tantivy_regex_query(ptr: *mut c_void, pattern: *const c_char) -> RustArray {
|
||||||
ptr: *mut c_void,
|
|
||||||
pattern: *const c_char,
|
|
||||||
) -> RustArray {
|
|
||||||
let real = ptr as *mut IndexReaderWrapper;
|
let real = ptr as *mut IndexReaderWrapper;
|
||||||
unsafe {
|
unsafe {
|
||||||
let c_str = CStr::from_ptr(pattern);
|
let c_str = CStr::from_ptr(pattern);
|
||||||
|
|||||||
@ -1,21 +1,24 @@
|
|||||||
use futures::executor::block_on;
|
use futures::executor::block_on;
|
||||||
|
|
||||||
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED};
|
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED};
|
||||||
use tantivy::{doc, tokenizer, Index, IndexWriter};
|
use tantivy::{doc, tokenizer, Index, IndexWriter, SingleSegmentIndexWriter};
|
||||||
|
|
||||||
use crate::data_type::TantivyDataType;
|
use crate::data_type::TantivyDataType;
|
||||||
|
use crate::index_writer;
|
||||||
|
use crate::log::init_log;
|
||||||
|
|
||||||
pub struct IndexWriterWrapper {
|
pub struct IndexWriterWrapper {
|
||||||
pub field_name: String,
|
pub field_name: String,
|
||||||
pub field: Field,
|
pub field: Field,
|
||||||
pub data_type: TantivyDataType,
|
pub data_type: TantivyDataType,
|
||||||
pub path: String,
|
pub path: String,
|
||||||
pub index: Index,
|
pub index_writer: SingleSegmentIndexWriter,
|
||||||
pub index_writer: IndexWriter,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IndexWriterWrapper {
|
impl IndexWriterWrapper {
|
||||||
pub fn new(field_name: String, data_type: TantivyDataType, path: String) -> IndexWriterWrapper {
|
pub fn new(field_name: String, data_type: TantivyDataType, path: String) -> IndexWriterWrapper {
|
||||||
|
init_log();
|
||||||
|
|
||||||
let field: Field;
|
let field: Field;
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let mut use_raw_tokenizer = false;
|
let mut use_raw_tokenizer = false;
|
||||||
@ -45,60 +48,59 @@ impl IndexWriterWrapper {
|
|||||||
.tokenizers()
|
.tokenizers()
|
||||||
.register("raw_tokenizer", tokenizer::RawTokenizer::default());
|
.register("raw_tokenizer", tokenizer::RawTokenizer::default());
|
||||||
}
|
}
|
||||||
let index_writer = index.writer_with_num_threads(1, 15_000_000).unwrap();
|
let index_writer = SingleSegmentIndexWriter::new(index, 15 * 1024 * 1024).unwrap();
|
||||||
IndexWriterWrapper {
|
IndexWriterWrapper {
|
||||||
field_name,
|
field_name,
|
||||||
field,
|
field,
|
||||||
data_type,
|
data_type,
|
||||||
path,
|
path,
|
||||||
index,
|
|
||||||
index_writer,
|
index_writer,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_i8(&self, data: i8) {
|
pub fn add_i8(&mut self, data: i8) {
|
||||||
self.add_i64(data.into())
|
self.add_i64(data.into())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_i16(&self, data: i16) {
|
pub fn add_i16(&mut self, data: i16) {
|
||||||
self.add_i64(data.into())
|
self.add_i64(data.into())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_i32(&self, data: i32) {
|
pub fn add_i32(&mut self, data: i32) {
|
||||||
self.add_i64(data.into())
|
self.add_i64(data.into())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_i64(&self, data: i64) {
|
pub fn add_i64(&mut self, data: i64) {
|
||||||
self.index_writer
|
self.index_writer
|
||||||
.add_document(doc!(self.field => data))
|
.add_document(doc!(self.field => data))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_f32(&self, data: f32) {
|
pub fn add_f32(&mut self, data: f32) {
|
||||||
self.add_f64(data.into())
|
self.add_f64(data.into())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_f64(&self, data: f64) {
|
pub fn add_f64(&mut self, data: f64) {
|
||||||
self.index_writer
|
self.index_writer
|
||||||
.add_document(doc!(self.field => data))
|
.add_document(doc!(self.field => data))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_bool(&self, data: bool) {
|
pub fn add_bool(&mut self, data: bool) {
|
||||||
self.index_writer
|
self.index_writer
|
||||||
.add_document(doc!(self.field => data))
|
.add_document(doc!(self.field => data))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_keyword(&self, data: &str) {
|
pub fn add_keyword(&mut self, data: &str) {
|
||||||
self.index_writer
|
self.index_writer
|
||||||
.add_document(doc!(self.field => data))
|
.add_document(doc!(self.field => data))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn finish(mut self) {
|
pub fn finish(mut self) {
|
||||||
self.index_writer.commit().unwrap();
|
self.index_writer
|
||||||
block_on(self.index_writer.garbage_collect_files()).unwrap();
|
.finalize()
|
||||||
self.index_writer.wait_merging_threads().unwrap();
|
.expect("failed to build inverted index");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -6,6 +6,7 @@ mod index_reader_c;
|
|||||||
mod index_writer;
|
mod index_writer;
|
||||||
mod index_writer_c;
|
mod index_writer_c;
|
||||||
mod linkedlist_collector;
|
mod linkedlist_collector;
|
||||||
|
mod log;
|
||||||
mod util;
|
mod util;
|
||||||
mod util_c;
|
mod util_c;
|
||||||
mod vec_collector;
|
mod vec_collector;
|
||||||
|
|||||||
10
internal/core/thirdparty/tantivy/tantivy-binding/src/log.rs
vendored
Normal file
10
internal/core/thirdparty/tantivy/tantivy-binding/src/log.rs
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
use env_logger::Env;
|
||||||
|
use std::sync::Once;
|
||||||
|
|
||||||
|
pub(crate) fn init_log() {
|
||||||
|
static _INITIALIZED: Once = Once::new();
|
||||||
|
_INITIALIZED.call_once(|| {
|
||||||
|
let _env = Env::default().filter_or("MY_LOG_LEVEL", "info");
|
||||||
|
env_logger::init_from_env(_env);
|
||||||
|
});
|
||||||
|
}
|
||||||
@ -1,3 +1,4 @@
|
|||||||
|
use log::warn;
|
||||||
use tantivy::{
|
use tantivy::{
|
||||||
collector::{Collector, SegmentCollector},
|
collector::{Collector, SegmentCollector},
|
||||||
DocId,
|
DocId,
|
||||||
@ -26,6 +27,10 @@ impl Collector for VecCollector {
|
|||||||
if segment_fruits.len() == 1 {
|
if segment_fruits.len() == 1 {
|
||||||
Ok(segment_fruits.into_iter().next().unwrap())
|
Ok(segment_fruits.into_iter().next().unwrap())
|
||||||
} else {
|
} else {
|
||||||
|
warn!(
|
||||||
|
"inverted index should have only one segment, but got {} segments",
|
||||||
|
segment_fruits.len()
|
||||||
|
);
|
||||||
let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum();
|
let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum();
|
||||||
let mut result = Vec::with_capacity(len);
|
let mut result = Vec::with_capacity(len);
|
||||||
for docs in segment_fruits {
|
for docs in segment_fruits {
|
||||||
|
|||||||
50
internal/core/thirdparty/tantivy/test.cpp
vendored
50
internal/core/thirdparty/tantivy/test.cpp
vendored
@ -2,6 +2,9 @@
|
|||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <random>
|
||||||
|
#include <set>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
#include "tantivy-binding.h"
|
#include "tantivy-binding.h"
|
||||||
#include "tantivy-wrapper.h"
|
#include "tantivy-wrapper.h"
|
||||||
@ -152,8 +155,55 @@ run<std::string>() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
test_32717() {
|
||||||
|
using T = int16_t;
|
||||||
|
|
||||||
|
auto path = "/tmp/inverted-index/test-binding/";
|
||||||
|
boost::filesystem::remove_all(path);
|
||||||
|
boost::filesystem::create_directories(path);
|
||||||
|
|
||||||
|
if (tantivy_index_exist(path)) {
|
||||||
|
auto w = TantivyIndexWrapper(path);
|
||||||
|
auto cnt = w.count();
|
||||||
|
std::cout << "index already exist, open it, count: " << cnt
|
||||||
|
<< std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto w = TantivyIndexWrapper("test_field_name", guess_data_type<T>(), path);
|
||||||
|
|
||||||
|
std::random_device rd;
|
||||||
|
std::mt19937 gen(rd());
|
||||||
|
std::uniform_int_distribution<int> dis(1, 1000);
|
||||||
|
std::vector<int16_t> arr;
|
||||||
|
std::map<int16_t, std::set<int>> inverted;
|
||||||
|
size_t l = 1000000;
|
||||||
|
for (size_t i = 0; i < l; i++) {
|
||||||
|
auto n = static_cast<int16_t>(dis(gen));
|
||||||
|
arr.push_back(n);
|
||||||
|
if (inverted.find(n) == inverted.end()) {
|
||||||
|
inverted[n] = std::set<int>();
|
||||||
|
}
|
||||||
|
inverted[n].insert(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
w.add_data(arr.data(), l);
|
||||||
|
w.finish();
|
||||||
|
assert(w.count() == l);
|
||||||
|
|
||||||
|
for (int16_t term = 1; term < 1000; term += 10) {
|
||||||
|
auto hits = w.term_query(term);
|
||||||
|
for (size_t i = 0; i < hits.array_.len; i++) {
|
||||||
|
assert(arr[hits.array_.array[i]] == term);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char* argv[]) {
|
main(int argc, char* argv[]) {
|
||||||
|
test_32717();
|
||||||
|
|
||||||
run<int8_t>();
|
run<int8_t>();
|
||||||
run<int16_t>();
|
run<int16_t>();
|
||||||
run<int32_t>();
|
run<int32_t>();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user