From 035a508722d6543b6d2d59196d7bd1b4b58297de Mon Sep 17 00:00:00 2001 From: Jiquan Long Date: Wed, 8 May 2024 21:25:30 +0800 Subject: [PATCH] fix: make sure inverted index has only one segment (#32858) issue: #32717 --------- Signed-off-by: longjiquan --- .../tantivy/tantivy-binding/Cargo.lock | 98 +++++++++++++++++++ .../tantivy/tantivy-binding/Cargo.toml | 2 + .../tantivy-binding/src/index_reader.rs | 3 + .../tantivy-binding/src/index_reader_c.rs | 5 +- .../tantivy-binding/src/index_writer.rs | 34 ++++--- .../tantivy/tantivy-binding/src/lib.rs | 1 + .../tantivy/tantivy-binding/src/log.rs | 10 ++ .../tantivy-binding/src/vec_collector.rs | 5 + internal/core/thirdparty/tantivy/test.cpp | 50 ++++++++++ 9 files changed, 188 insertions(+), 20 deletions(-) create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/log.rs diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock index b4614f4138..4ed3a35e4b 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock @@ -29,6 +29,55 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" +[[package]] +name = "anstream" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" + +[[package]] +name = "anstyle-parse" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + [[package]] name = "arc-swap" version = "1.7.1" @@ -167,6 +216,12 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "colorchoice" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" + [[package]] name = "crc32fast" version = "1.4.0" @@ -238,6 +293,29 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +[[package]] +name = "env_filter" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + [[package]] name = "errno" version = "0.3.8" @@ -432,6 +510,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "indexmap" version = "1.9.3" @@ -454,6 +538,12 @@ dependencies = [ "web-sys", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + [[package]] name = "itertools" version = "0.11.0" @@ -978,8 +1068,10 @@ name = "tantivy-binding" version = "0.1.0" dependencies = [ "cbindgen", + "env_logger", "futures", "libc", + "log", "scopeguard", "tantivy", "zstd-sys", @@ -1243,6 +1335,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + [[package]] name = "uuid" version = "1.8.0" diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml index 246895c28c..12de291c5b 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml @@ -11,6 +11,8 @@ futures = "0.3.21" libc = "0.2" scopeguard = "1.2" zstd-sys = "=2.0.9" +env_logger = "0.11.3" +log = "0.4.21" [build-dependencies] cbindgen = "0.26.0" diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs index 27f6c61100..8a5600bf17 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs @@ -6,6 +6,7 @@ use tantivy::query::{Query, RangeQuery, RegexQuery, TermQuery}; use tantivy::schema::{Field, IndexRecordOption}; use tantivy::{Index, IndexReader, ReloadPolicy, Term}; +use crate::log::init_log; use crate::util::make_bounds; use crate::vec_collector::VecCollector; @@ -18,6 +19,8 @@ pub struct IndexReaderWrapper { impl IndexReaderWrapper { pub fn new(index: &Index, field_name: &String, field: Field) -> IndexReaderWrapper { + init_log(); + let reader = index .reader_builder() .reload_policy(ReloadPolicy::Manual) diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_c.rs index 9b427c2908..b7165cf26f 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_c.rs @@ -212,10 +212,7 @@ pub extern "C" fn tantivy_prefix_query_keyword( } #[no_mangle] -pub extern "C" fn tantivy_regex_query( - ptr: *mut c_void, - pattern: *const c_char, -) -> RustArray { +pub extern "C" fn tantivy_regex_query(ptr: *mut c_void, pattern: *const c_char) -> RustArray { let real = ptr as *mut IndexReaderWrapper; unsafe { let c_str = CStr::from_ptr(pattern); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs index 96a466ad51..ce96a5b4d5 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs @@ -1,21 +1,24 @@ use futures::executor::block_on; use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED}; -use tantivy::{doc, tokenizer, Index, IndexWriter}; +use tantivy::{doc, tokenizer, Index, IndexWriter, SingleSegmentIndexWriter}; use crate::data_type::TantivyDataType; +use crate::index_writer; +use crate::log::init_log; pub struct IndexWriterWrapper { pub field_name: String, pub field: Field, pub data_type: TantivyDataType, pub path: String, - pub index: Index, - pub index_writer: IndexWriter, + pub index_writer: SingleSegmentIndexWriter, } impl IndexWriterWrapper { pub fn new(field_name: String, data_type: TantivyDataType, path: String) -> IndexWriterWrapper { + init_log(); + let field: Field; let mut schema_builder = Schema::builder(); let mut use_raw_tokenizer = false; @@ -45,60 +48,59 @@ impl IndexWriterWrapper { .tokenizers() .register("raw_tokenizer", tokenizer::RawTokenizer::default()); } - let index_writer = index.writer_with_num_threads(1, 15_000_000).unwrap(); + let index_writer = SingleSegmentIndexWriter::new(index, 15 * 1024 * 1024).unwrap(); IndexWriterWrapper { field_name, field, data_type, path, - index, index_writer, } } - pub fn add_i8(&self, data: i8) { + pub fn add_i8(&mut self, data: i8) { self.add_i64(data.into()) } - pub fn add_i16(&self, data: i16) { + pub fn add_i16(&mut self, data: i16) { self.add_i64(data.into()) } - pub fn add_i32(&self, data: i32) { + pub fn add_i32(&mut self, data: i32) { self.add_i64(data.into()) } - pub fn add_i64(&self, data: i64) { + pub fn add_i64(&mut self, data: i64) { self.index_writer .add_document(doc!(self.field => data)) .unwrap(); } - pub fn add_f32(&self, data: f32) { + pub fn add_f32(&mut self, data: f32) { self.add_f64(data.into()) } - pub fn add_f64(&self, data: f64) { + pub fn add_f64(&mut self, data: f64) { self.index_writer .add_document(doc!(self.field => data)) .unwrap(); } - pub fn add_bool(&self, data: bool) { + pub fn add_bool(&mut self, data: bool) { self.index_writer .add_document(doc!(self.field => data)) .unwrap(); } - pub fn add_keyword(&self, data: &str) { + pub fn add_keyword(&mut self, data: &str) { self.index_writer .add_document(doc!(self.field => data)) .unwrap(); } pub fn finish(mut self) { - self.index_writer.commit().unwrap(); - block_on(self.index_writer.garbage_collect_files()).unwrap(); - self.index_writer.wait_merging_threads().unwrap(); + self.index_writer + .finalize() + .expect("failed to build inverted index"); } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs index c9ae7235e6..aa069cb3b3 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs @@ -6,6 +6,7 @@ mod index_reader_c; mod index_writer; mod index_writer_c; mod linkedlist_collector; +mod log; mod util; mod util_c; mod vec_collector; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/log.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/log.rs new file mode 100644 index 0000000000..112fa86217 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/log.rs @@ -0,0 +1,10 @@ +use env_logger::Env; +use std::sync::Once; + +pub(crate) fn init_log() { + static _INITIALIZED: Once = Once::new(); + _INITIALIZED.call_once(|| { + let _env = Env::default().filter_or("MY_LOG_LEVEL", "info"); + env_logger::init_from_env(_env); + }); +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs index 56261d77ab..73299f2477 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs @@ -1,3 +1,4 @@ +use log::warn; use tantivy::{ collector::{Collector, SegmentCollector}, DocId, @@ -26,6 +27,10 @@ impl Collector for VecCollector { if segment_fruits.len() == 1 { Ok(segment_fruits.into_iter().next().unwrap()) } else { + warn!( + "inverted index should have only one segment, but got {} segments", + segment_fruits.len() + ); let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum(); let mut result = Vec::with_capacity(len); for docs in segment_fruits { diff --git a/internal/core/thirdparty/tantivy/test.cpp b/internal/core/thirdparty/tantivy/test.cpp index 0b94142184..1c67a69673 100644 --- a/internal/core/thirdparty/tantivy/test.cpp +++ b/internal/core/thirdparty/tantivy/test.cpp @@ -2,6 +2,9 @@ #include #include #include +#include +#include +#include #include "tantivy-binding.h" #include "tantivy-wrapper.h" @@ -152,8 +155,55 @@ run() { } } +void +test_32717() { + using T = int16_t; + + auto path = "/tmp/inverted-index/test-binding/"; + boost::filesystem::remove_all(path); + boost::filesystem::create_directories(path); + + if (tantivy_index_exist(path)) { + auto w = TantivyIndexWrapper(path); + auto cnt = w.count(); + std::cout << "index already exist, open it, count: " << cnt + << std::endl; + return; + } + + auto w = TantivyIndexWrapper("test_field_name", guess_data_type(), path); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(1, 1000); + std::vector arr; + std::map> inverted; + size_t l = 1000000; + for (size_t i = 0; i < l; i++) { + auto n = static_cast(dis(gen)); + arr.push_back(n); + if (inverted.find(n) == inverted.end()) { + inverted[n] = std::set(); + } + inverted[n].insert(i); + } + + w.add_data(arr.data(), l); + w.finish(); + assert(w.count() == l); + + for (int16_t term = 1; term < 1000; term += 10) { + auto hits = w.term_query(term); + for (size_t i = 0; i < hits.array_.len; i++) { + assert(arr[hits.array_.array[i]] == term); + } + } +} + int main(int argc, char* argv[]) { + test_32717(); + run(); run(); run();