diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
index 6dda3ad03f..2ef53acdfc 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
@@ -4043,6 +4043,7 @@ dependencies = [
  "tonic",
  "tonic-build",
  "unicode-general-category",
+ "unicode_categories",
  "url",
  "whatlang",
  "zstd-sys",
@@ -4621,6 +4622,12 @@ version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
 
+[[package]]
+name = "unicode_categories"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
+
 [[package]]
 name = "unsafe-libyaml"
 version = "0.2.11"
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml
index 09343f1524..5b602a842b 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml
@@ -48,6 +48,7 @@ url = "2.5.4"
 prost = "0.13.5"
 once_cell = "1.20.3"
 unicode-general-category = "1.0.0"
+unicode_categories = "0.1.1"
 
 # lindera dependencies for fetch and prepare dictionary online.
 tokio = { version = "1.45.0", features = [
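Why the new dependency: the file added below calls `ch.is_punctuation()` and `ch.is_symbol()` through the `UnicodeCategories` extension trait, which classifies by Unicode general category rather than by ASCII ranges, so CJK punctuation, Arabic punctuation, currency signs, and emoji are all covered. A minimal sketch of the two trait methods in isolation (assumes `unicode_categories = "0.1.1"` is available):

```rust
use unicode_categories::UnicodeCategories;

fn main() {
    // Category-based checks, as used by should_keep_char() in the new tokenizer.
    assert!('。'.is_punctuation()); // CJK full stop (Po)
    assert!('_'.is_punctuation()); // connector punctuation (Pc)
    assert!('€'.is_symbol()); // currency symbol (Sc)
    assert!('🚀'.is_symbol()); // other symbol (So)
    // ASCII digits are neither punctuation nor symbols.
    assert!(!'7'.is_punctuation() && !'7'.is_symbol());
}
```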
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/mod.rs
index ec03b2628d..c82b8b4b46 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/mod.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/mod.rs
@@ -4,6 +4,7 @@ mod icu_tokneizer;
 mod jieba_tokenizer;
 mod lang_ident_tokenizer;
 mod lindera_tokenizer;
+mod ngram_tokenizer_with_chars;
 mod tokenizer;
 
 pub use self::char_group_tokenizer::CharGroupTokenizer;
@@ -12,5 +13,6 @@ pub use self::icu_tokneizer::IcuTokenizer;
 pub use self::jieba_tokenizer::JiebaTokenizer;
 pub use self::lang_ident_tokenizer::LangIdentTokenizer;
 pub use self::lindera_tokenizer::LinderaTokenizer;
+pub use self::ngram_tokenizer_with_chars::NgramTokenizerWithChars;
 pub(crate) use self::tokenizer::*;
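The file added below implements the core rule of an Elasticsearch-style `ngram` tokenizer with `token_chars`: characters outside the configured classes act as segment boundaries, and n-grams are generated within each segment only. A std-only sketch of that rule, independent of the new type (the helper names here are illustrative, not part of the patch):

```rust
fn main() {
    let text = "hello-world123";
    let keep = |c: char| c.is_alphabetic() || c.is_numeric();

    // '-' is not a kept character, so it splits the input into two segments.
    let segments: Vec<&str> = text
        .split(|c: char| !keep(c))
        .filter(|s| !s.is_empty())
        .collect();
    assert_eq!(segments, ["hello", "world123"]);

    // 2- and 3-grams of the first segment, taken over char positions,
    // never crossing the '-' boundary.
    let chars: Vec<char> = segments[0].chars().collect();
    let mut grams = Vec::new();
    for start in 0..chars.len() {
        for len in 2..=3 {
            if start + len <= chars.len() {
                grams.push(chars[start..start + len].iter().collect::<String>());
            }
        }
    }
    assert_eq!(grams, ["he", "hel", "el", "ell", "ll", "llo", "lo"]);
}
```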
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/ngram_tokenizer_with_chars.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/ngram_tokenizer_with_chars.rs
new file mode 100644
index 0000000000..fb4d3f5771
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/ngram_tokenizer_with_chars.rs
@@ -0,0 +1,767 @@
+use std::collections::HashSet;
+
+use unicode_categories::UnicodeCategories;
+
+use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
+use tantivy::TantivyError;
+
+/// Character categories that can be included in tokens
+#[derive(Clone, Debug, PartialEq, Eq, Hash)]
+pub enum TokenCharType {
+    /// Unicode letter characters
+    Letter,
+    /// Unicode digit characters
+    Digit,
+    /// Whitespace characters
+    Whitespace,
+    /// Punctuation characters
+    Punctuation,
+    /// Symbol characters
+    Symbol,
+    /// Custom characters defined by user
+    Custom,
+}
+
+/// Configuration for which characters to include in tokens
+#[derive(Clone, Debug)]
+pub struct TokenCharsConfig {
+    /// Character types to include
+    pub token_chars: HashSet<TokenCharType>,
+    /// Custom characters to include when TokenCharType::Custom is set
+    pub custom_token_chars: String,
+}
+
+impl Default for TokenCharsConfig {
+    fn default() -> Self {
+        TokenCharsConfig {
+            token_chars: HashSet::new(),
+            custom_token_chars: String::new(),
+        }
+    }
+}
+
+/// Check if a character is punctuation using Unicode standard categories
+fn is_punctuation(ch: char) -> bool {
+    ch.is_punctuation()
+}
+
+/// Check if a character is a symbol using Unicode standard categories
+fn is_symbol(ch: char) -> bool {
+    ch.is_symbol()
+}
+
+impl TokenCharsConfig {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn with_letters(mut self) -> Self {
+        self.token_chars.insert(TokenCharType::Letter);
+        self
+    }
+
+    pub fn with_digits(mut self) -> Self {
+        self.token_chars.insert(TokenCharType::Digit);
+        self
+    }
+
+    pub fn with_whitespace(mut self) -> Self {
+        self.token_chars.insert(TokenCharType::Whitespace);
+        self
+    }
+
+    pub fn with_punctuation(mut self) -> Self {
+        self.token_chars.insert(TokenCharType::Punctuation);
+        self
+    }
+
+    pub fn with_symbol(mut self) -> Self {
+        self.token_chars.insert(TokenCharType::Symbol);
+        self
+    }
+
+    pub fn with_custom_chars(mut self, custom_token_chars: String) -> Self {
+        self.token_chars.insert(TokenCharType::Custom);
+        self.custom_token_chars = custom_token_chars;
+        self
+    }
+
+    pub fn with_all() -> Self {
+        let mut token_chars = HashSet::new();
+        token_chars.insert(TokenCharType::Letter);
+        token_chars.insert(TokenCharType::Digit);
+        token_chars.insert(TokenCharType::Whitespace);
+        token_chars.insert(TokenCharType::Punctuation);
+        token_chars.insert(TokenCharType::Symbol);
+        Self {
+            token_chars,
+            custom_token_chars: String::new(),
+        }
+    }
+
+    /// Check if a character should be kept based on configuration
+    pub fn should_keep_char(&self, ch: char) -> bool {
+        // Check custom characters first
+        if self.token_chars.contains(&TokenCharType::Custom)
+            && self.custom_token_chars.contains(ch)
+        {
+            return true;
+        }
+
+        // Check standard character types
+        if self.token_chars.contains(&TokenCharType::Letter) && ch.is_alphabetic() {
+            return true;
+        }
+
+        if self.token_chars.contains(&TokenCharType::Digit) && ch.is_numeric() {
+            return true;
+        }
+
+        if self.token_chars.contains(&TokenCharType::Whitespace) && ch.is_whitespace() {
+            return true;
+        }
+
+        // Use the more comprehensive punctuation and symbol checkers
+        if self.token_chars.contains(&TokenCharType::Punctuation) && is_punctuation(ch) {
+            return true;
+        }
+
+        if self.token_chars.contains(&TokenCharType::Symbol) && is_symbol(ch) {
+            return true;
+        }
+
+        false
+    }
+}
+
+/// Tokenize text by splitting into n-grams, with character filtering support
+#[derive(Clone, Debug)]
+pub struct NgramTokenizerWithChars {
+    min_gram: usize,
+    max_gram: usize,
+    prefix_only: bool,
+    token_chars_config: TokenCharsConfig,
+    token: Token,
+}
+
+impl NgramTokenizerWithChars {
+    /// Create a new NgramTokenizer with character filtering
+    pub fn new(
+        min_gram: usize,
+        max_gram: usize,
+        prefix_only: bool,
+        token_chars_config: TokenCharsConfig,
+    ) -> Result<NgramTokenizerWithChars, TantivyError> {
+        if min_gram == 0 {
+            return Err(TantivyError::InvalidArgument(
+                "min_gram must be greater than 0".to_string(),
+            ));
+        }
+        if min_gram > max_gram {
+            return Err(TantivyError::InvalidArgument(
+                "min_gram must not be greater than max_gram".to_string(),
+            ));
+        }
+
+        Ok(NgramTokenizerWithChars {
+            min_gram,
+            max_gram,
+            prefix_only,
+            token_chars_config,
+            token: Token::default(),
+        })
+    }
+}
+
+/// Represents a text segment with its byte offsets in the original text
+#[derive(Debug, Clone)]
+struct TextSegment {
+    /// The text content of the segment
+    text: String,
+    /// Starting byte offset in original text
+    start_offset: usize,
+}
+
+/// TokenStream for NgramTokenizerWithChars
+pub struct NgramTokenStreamWithChars<'a> {
+    segments: Vec<TextSegment>,
+    current_segment_idx: usize,
+    current_segment_ngrams: Vec<(String, usize, usize)>,
+    current_ngram_idx: usize,
+    min_gram: usize,
+    max_gram: usize,
+    prefix_only: bool,
+    token: &'a mut Token,
+}
+
+impl<'a> NgramTokenStreamWithChars<'a> {
+    fn generate_ngrams_for_segment(&mut self) {
+        self.current_segment_ngrams.clear();
+
+        if self.current_segment_idx >= self.segments.len() {
+            return;
+        }
+
+        let segment = &self.segments[self.current_segment_idx];
+        let text = &segment.text;
+
+        // Collect character boundaries
+        let char_boundaries: Vec<usize> = text
+            .char_indices()
+            .map(|(i, _)| i)
+            .chain(std::iter::once(text.len()))
+            .collect();
+
+        let num_chars = char_boundaries.len() - 1;
+
+        // Skip if segment is shorter than min_gram
+        if num_chars < self.min_gram {
+            return;
+        }
+
+        for start_char in 0..num_chars {
+            // For prefix_only mode, only generate ngrams starting at position 0
+            if self.prefix_only && start_char > 0 {
+                break;
+            }
+
+            let min_end = start_char + self.min_gram;
+            let max_end = (start_char + self.max_gram).min(num_chars);
+
+            // Skip if we can't generate a min_gram-length ngram from this position
+            if min_end > num_chars {
+                continue;
+            }
+
+            for end_char in min_end..=max_end {
+                let start_byte = char_boundaries[start_char];
+                let end_byte = char_boundaries[end_char];
+
+                let ngram_text = text[start_byte..end_byte].to_string();
+                let global_start = segment.start_offset + start_byte;
+                let global_end = segment.start_offset + end_byte;
+
+                self.current_segment_ngrams
+                    .push((ngram_text, global_start, global_end));
+            }
+        }
+
+        self.current_ngram_idx = 0;
+    }
+}
+
+impl Tokenizer for NgramTokenizerWithChars {
+    type TokenStream<'a> = NgramTokenStreamWithChars<'a>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        self.token.reset();
+
+        // Split text into segments based on token_chars configuration
+        let mut segments = Vec::new();
+        let mut current_segment = String::new();
+        let mut current_segment_start = None;
+        let mut byte_offset = 0;
+
+        for ch in text.chars() {
+            let char_len = ch.len_utf8();
+
+            if self.token_chars_config.should_keep_char(ch) {
+                if current_segment_start.is_none() {
+                    current_segment_start = Some(byte_offset);
+                }
+                current_segment.push(ch);
+            } else if !current_segment.is_empty() {
+                segments.push(TextSegment {
+                    text: current_segment.clone(),
+                    start_offset: current_segment_start.unwrap(),
+                });
+                current_segment.clear();
+                current_segment_start = None;
+            }
+
+            byte_offset += char_len;
+        }
+
+        if !current_segment.is_empty() {
+            segments.push(TextSegment {
+                text: current_segment,
+                start_offset: current_segment_start.unwrap(),
+            });
+        }
+
+        let mut stream = NgramTokenStreamWithChars {
+            segments,
+            current_segment_idx: 0,
+            current_segment_ngrams: Vec::new(),
+            current_ngram_idx: 0,
+            min_gram: self.min_gram,
+            max_gram: self.max_gram,
+            prefix_only: self.prefix_only,
+            token: &mut self.token,
+        };
+
+        // Generate ngrams for the first segment
+        stream.generate_ngrams_for_segment();
+
+        stream
+    }
+}
+
+impl TokenStream for NgramTokenStreamWithChars<'_> {
+    fn advance(&mut self) -> bool {
+        loop {
+            // Check if we have ngrams left in the current segment
+            if self.current_ngram_idx < self.current_segment_ngrams.len() {
+                let (text, start, end) = &self.current_segment_ngrams[self.current_ngram_idx];
+
+                self.token.position = self.current_segment_idx;
+                self.token.offset_from = *start;
+                self.token.offset_to = *end;
+                self.token.text.clear();
+                self.token.text.push_str(text);
+
+                self.current_ngram_idx += 1;
+                return true;
+            }
+
+            // Move to the next segment
+            self.current_segment_idx += 1;
+            if self.current_segment_idx >= self.segments.len() {
+                return false;
+            }
+
+            // Generate ngrams for the new segment
+            self.generate_ngrams_for_segment();
+        }
+    }
+
+    fn token(&self) -> &Token {
+        self.token
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.token
+    }
+}
collect_tokens(tokenizer.token_stream("hello-world_123")); + + // Should keep hyphens and underscores + assert!(tokens.iter().any(|t| t.text == "o-w")); + assert!(tokens.iter().any(|t| t.text == "d_1")); + } + + #[test] + fn test_offset_preservation() { + let mut tokenizer = create_alphanumeric_tokenizer(2, 2, false); + let tokens = collect_tokens(tokenizer.token_stream("ab cd")); + + // Check offsets are preserved correctly + let ab_token = tokens.iter().find(|t| t.text == "ab").unwrap(); + assert_eq!(ab_token.offset_from, 0); + assert_eq!(ab_token.offset_to, 2); + + let cd_token = tokens.iter().find(|t| t.text == "cd").unwrap(); + assert_eq!(cd_token.offset_from, 3); + assert_eq!(cd_token.offset_to, 5); + } + + #[test] + fn test_unicode_handling() { + let mut tokenizer = create_alphanumeric_tokenizer(2, 3, false); + let tokens = collect_tokens(tokenizer.token_stream("café-2024")); + + // Should handle unicode correctly + assert!(tokens.iter().any(|t| t.text == "ca")); + assert!(tokens.iter().any(|t| t.text == "caf")); + assert!(tokens.iter().any(|t| t.text == "afé")); + assert!(tokens.iter().any(|t| t.text == "20")); + assert!(tokens.iter().any(|t| t.text == "202")); + } + + #[test] + fn test_unicode_punctuation_completeness() { + let mut token_chars = HashSet::new(); + token_chars.insert(TokenCharType::Punctuation); + let config = TokenCharsConfig { + token_chars, + custom_token_chars: String::new(), + }; + + // Test various Unicode punctuation from different scripts + assert!(config.should_keep_char('.')); // ASCII period + assert!(config.should_keep_char('。')); // CJK period + assert!(config.should_keep_char('、')); // CJK comma + assert!(config.should_keep_char('،')); // Arabic comma + assert!(config.should_keep_char('؛')); // Arabic semicolon + assert!(config.should_keep_char('‽')); // Interrobang + assert!(config.should_keep_char('⁇')); // Double question mark + assert!(config.should_keep_char('⁉')); // Exclamation question mark + assert!(config.should_keep_char('„')); // German opening quote + assert!(config.should_keep_char('"')); // English closing quote + assert!(config.should_keep_char('«')); // French opening quote + assert!(config.should_keep_char('»')); // French closing quote + assert!(config.should_keep_char('_')); // Underscore (connector punctuation) + assert!(config.should_keep_char('–')); // En dash + assert!(config.should_keep_char('—')); // Em dash + assert!(config.should_keep_char('…')); // Ellipsis + assert!(config.should_keep_char('‹')); // Single left angle quote + assert!(config.should_keep_char('›')); // Single right angle quote + + // Should not match letters or symbols + assert!(!config.should_keep_char('a')); + assert!(!config.should_keep_char('中')); + assert!(!config.should_keep_char('$')); + assert!(!config.should_keep_char('€')); + } + + #[test] + fn test_unicode_symbol_completeness() { + let mut token_chars = HashSet::new(); + token_chars.insert(TokenCharType::Symbol); + let config = TokenCharsConfig { + token_chars, + custom_token_chars: String::new(), + }; + + // Test various Unicode symbols + assert!(config.should_keep_char('$')); // Dollar + assert!(config.should_keep_char('€')); // Euro + assert!(config.should_keep_char('¥')); // Yen + assert!(config.should_keep_char('£')); // Pound + assert!(config.should_keep_char('₹')); // Rupee + // Note: Bitcoin symbol (₿ U+20BF) was added in Unicode 10.0 (2017) + // Some Unicode libraries may not include it yet + // assert!(config.should_keep_char('₿')); // Bitcoin + assert!(config.should_keep_char('+')); // 
+
+    #[test]
+    fn test_unicode_punctuation_completeness() {
+        let mut token_chars = HashSet::new();
+        token_chars.insert(TokenCharType::Punctuation);
+        let config = TokenCharsConfig {
+            token_chars,
+            custom_token_chars: String::new(),
+        };
+
+        // Test various Unicode punctuation from different scripts
+        assert!(config.should_keep_char('.')); // ASCII period
+        assert!(config.should_keep_char('。')); // CJK period
+        assert!(config.should_keep_char('、')); // CJK comma
+        assert!(config.should_keep_char('،')); // Arabic comma
+        assert!(config.should_keep_char('؛')); // Arabic semicolon
+        assert!(config.should_keep_char('‽')); // Interrobang
+        assert!(config.should_keep_char('⁇')); // Double question mark
+        assert!(config.should_keep_char('⁉')); // Exclamation question mark
+        assert!(config.should_keep_char('„')); // German opening quote
+        assert!(config.should_keep_char('"')); // English closing quote
+        assert!(config.should_keep_char('«')); // French opening quote
+        assert!(config.should_keep_char('»')); // French closing quote
+        assert!(config.should_keep_char('_')); // Underscore (connector punctuation)
+        assert!(config.should_keep_char('–')); // En dash
+        assert!(config.should_keep_char('—')); // Em dash
+        assert!(config.should_keep_char('…')); // Ellipsis
+        assert!(config.should_keep_char('‹')); // Single left angle quote
+        assert!(config.should_keep_char('›')); // Single right angle quote
+
+        // Should not match letters or symbols
+        assert!(!config.should_keep_char('a'));
+        assert!(!config.should_keep_char('中'));
+        assert!(!config.should_keep_char('$'));
+        assert!(!config.should_keep_char('€'));
+    }
+
+    #[test]
+    fn test_unicode_symbol_completeness() {
+        let mut token_chars = HashSet::new();
+        token_chars.insert(TokenCharType::Symbol);
+        let config = TokenCharsConfig {
+            token_chars,
+            custom_token_chars: String::new(),
+        };
+
+        // Test various Unicode symbols
+        assert!(config.should_keep_char('$')); // Dollar
+        assert!(config.should_keep_char('€')); // Euro
+        assert!(config.should_keep_char('¥')); // Yen
+        assert!(config.should_keep_char('£')); // Pound
+        assert!(config.should_keep_char('₹')); // Rupee
+        // Note: Bitcoin symbol (₿ U+20BF) was added in Unicode 10.0 (2017)
+        // Some Unicode libraries may not include it yet
+        // assert!(config.should_keep_char('₿')); // Bitcoin
+        assert!(config.should_keep_char('+')); // Plus
+        assert!(config.should_keep_char('=')); // Equals
+        assert!(config.should_keep_char('<')); // Less than
+        assert!(config.should_keep_char('>')); // Greater than
+        assert!(config.should_keep_char('∑')); // Sum
+        assert!(config.should_keep_char('∏')); // Product
+        assert!(config.should_keep_char('∫')); // Integral
+        assert!(config.should_keep_char('√')); // Square root
+        assert!(config.should_keep_char('∞')); // Infinity
+        assert!(config.should_keep_char('©')); // Copyright
+        assert!(config.should_keep_char('®')); // Registered
+        assert!(config.should_keep_char('™')); // Trademark
+        assert!(config.should_keep_char('♠')); // Spade suit
+        assert!(config.should_keep_char('♣')); // Club suit
+        assert!(config.should_keep_char('♥')); // Heart suit
+        assert!(config.should_keep_char('♦')); // Diamond suit
+        assert!(config.should_keep_char('♪')); // Musical note
+        assert!(config.should_keep_char('☺')); // Smiley face
+        assert!(config.should_keep_char('☯')); // Yin yang
+        assert!(config.should_keep_char('⚡')); // Lightning
+        assert!(config.should_keep_char('🚀')); // Rocket emoji
+
+        // Should not match letters or punctuation
+        assert!(!config.should_keep_char('a'));
+        assert!(!config.should_keep_char('中'));
+        assert!(!config.should_keep_char('.'));
+        assert!(!config.should_keep_char(','));
+    }
+
+    #[test]
+    fn test_mixed_scripts() {
+        // Test tokenization with mixed scripts
+        let mut token_chars = HashSet::new();
+        token_chars.insert(TokenCharType::Letter);
+        token_chars.insert(TokenCharType::Digit);
+        let config = TokenCharsConfig {
+            token_chars,
+            custom_token_chars: String::new(),
+        };
+
+        let mut tokenizer = NgramTokenizerWithChars::new(2, 3, false, config).unwrap();
+        let tokens = collect_tokens(tokenizer.token_stream("Hello世界2024"));
+
+        // Every character is a letter or digit, so the whole input is one
+        // segment; ngrams appear within and across the scripts
+        assert!(tokens.iter().any(|t| t.text == "He"));
+        assert!(tokens.iter().any(|t| t.text == "世界"));
+        assert!(tokens.iter().any(|t| t.text == "20"));
+    }
+
+    #[test]
+    fn test_prefix_only_mode() {
+        // Test prefix_only mode for autocomplete use case
+        let config = TokenCharsConfig::with_all();
+        let mut tokenizer = NgramTokenizerWithChars::new(2, 5, true, config).unwrap();
+        let tokens = collect_tokens(tokenizer.token_stream("search"));
+
+        // Should only generate ngrams starting from position 0
+        assert_eq!(tokens.len(), 4);
+        assert!(tokens.iter().any(|t| t.text == "se"));
+        assert!(tokens.iter().any(|t| t.text == "sea"));
+        assert!(tokens.iter().any(|t| t.text == "sear"));
+        assert!(tokens.iter().any(|t| t.text == "searc"));
+
+        // Should NOT have ngrams starting from other positions
+        assert!(!tokens.iter().any(|t| t.text == "ea"));
+        assert!(!tokens.iter().any(|t| t.text == "ar"));
+    }
+
+    #[test]
+    fn test_empty_input() {
+        // Test empty string handling
+        let mut tokenizer = create_alphanumeric_tokenizer(2, 3, false);
+        let tokens = collect_tokens(tokenizer.token_stream(""));
+        assert!(tokens.is_empty());
+    }
+
+    #[test]
+    fn test_single_char_below_min_gram() {
+        // Test input shorter than min_gram
+        let mut tokenizer = create_alphanumeric_tokenizer(2, 3, false);
+
+        // Single character - should produce no tokens
+        let tokens = collect_tokens(tokenizer.token_stream("a"));
+        assert!(tokens.is_empty());
+
+        // Also test with a non-ASCII single character
+        let tokens = collect_tokens(tokenizer.token_stream("中"));
+        assert!(tokens.is_empty());
+    }
+
+    #[test]
+    fn test_only_excluded_chars() {
+        // Test input with only excluded characters
+        let mut token_chars = HashSet::new();
+        token_chars.insert(TokenCharType::Letter);
+        let config = TokenCharsConfig {
+            token_chars,
+            custom_token_chars: String::new(),
+        };
+
+        let mut tokenizer = NgramTokenizerWithChars::new(2, 3, false, config).unwrap();
+        let tokens = collect_tokens(tokenizer.token_stream("123!@#"));
+        assert!(tokens.is_empty());
+    }
+
+    #[test]
+    fn test_whitespace_handling() {
+        // Test whitespace as token chars
+        let mut token_chars = HashSet::new();
+        token_chars.insert(TokenCharType::Whitespace);
+        let config = TokenCharsConfig {
+            token_chars,
+            custom_token_chars: String::new(),
+        };
+
+        let mut tokenizer = NgramTokenizerWithChars::new(2, 3, false, config).unwrap();
+        let tokens = collect_tokens(tokenizer.token_stream("  \t\n"));
+
+        // Should generate ngrams from whitespace
+        assert!(!tokens.is_empty());
+        assert!(tokens.iter().any(|t| t.text == "  "));
+    }
+
+    #[test]
+    fn test_position_field() {
+        // Test that the position field is set correctly for segments
+        let mut tokenizer = create_alphanumeric_tokenizer(2, 2, false);
+        let tokens = collect_tokens(tokenizer.token_stream("ab-cd-ef"));
+
+        // Each segment should have an incrementing position
+        let ab_tokens: Vec<_> = tokens.iter().filter(|t| t.text.contains("ab")).collect();
+        let cd_tokens: Vec<_> = tokens.iter().filter(|t| t.text.contains("cd")).collect();
+        let ef_tokens: Vec<_> = tokens.iter().filter(|t| t.text.contains("ef")).collect();
+
+        assert!(ab_tokens.iter().all(|t| t.position == 0));
+        assert!(cd_tokens.iter().all(|t| t.position == 1));
+        assert!(ef_tokens.iter().all(|t| t.position == 2));
+    }
+
+    #[test]
+    fn test_min_max_gram_boundaries() {
+        // Test min_gram = max_gram with only letters
+        let mut token_chars = HashSet::new();
+        token_chars.insert(TokenCharType::Letter);
+        let config = TokenCharsConfig {
+            token_chars,
+            custom_token_chars: String::new(),
+        };
+
+        let mut tokenizer = NgramTokenizerWithChars::new(3, 3, false, config).unwrap();
+        let tokens = collect_tokens(tokenizer.token_stream("hello"));
+
+        // All tokens should be exactly 3 characters
+        // Debug: print tokens if the test fails
+        for token in &tokens {
+            if token.text.chars().count() != 3 {
+                eprintln!(
+                    "Unexpected token: '{}' (len={})",
+                    token.text,
+                    token.text.chars().count()
+                );
+            }
+        }
+        assert_eq!(tokens.len(), 3); // hel, ell, llo
+        assert!(tokens.iter().all(|t| t.text.chars().count() == 3));
+        assert!(tokens.iter().any(|t| t.text == "hel"));
+        assert!(tokens.iter().any(|t| t.text == "ell"));
+        assert!(tokens.iter().any(|t| t.text == "llo"));
+    }
+
+    #[test]
+    fn test_large_text() {
+        // Test performance with large text
+        let large_text = "a".repeat(1000);
+        let mut tokenizer = create_alphanumeric_tokenizer(2, 3, false);
+        let tokens = collect_tokens(tokenizer.token_stream(&large_text));
+
+        // Should generate the correct number of ngrams
+        // For text of length n (1000) with min=2, max=3:
+        // Each position i (0 <= i <= 997) generates 2 tokens: a 2-gram and a 3-gram
+        // Position 998 generates 1 token: the 2-gram only
+        // Position 999 generates 0 tokens (can't form a 2-gram)
+        // Total: 998 * 2 + 1 = 1997
+        assert_eq!(tokens.len(), 1997);
+    }
+
+    #[test]
+    fn test_emoji_as_symbols() {
+        // Test that emojis are correctly classified as symbols
+        let mut token_chars = HashSet::new();
+        token_chars.insert(TokenCharType::Symbol);
+        let config = TokenCharsConfig {
+            token_chars,
+            custom_token_chars: String::new(),
+        };
+
+        let mut tokenizer = NgramTokenizerWithChars::new(1, 2, false, config).unwrap();
+        let tokens = collect_tokens(tokenizer.token_stream("😀🚀"));
+
+        // Should tokenize emojis as symbols
+        assert!(!tokens.is_empty());
+        assert!(tokens.iter().any(|t| t.text == "😀"));
+        assert!(tokens.iter().any(|t| t.text == "🚀"));
+        assert!(tokens.iter().any(|t| t.text == "😀🚀"));
+    }
+
+    #[test]
+    fn test_error_handling() {
+        // Test invalid configurations
+        assert!(NgramTokenizerWithChars::new(0, 3, false, TokenCharsConfig::default()).is_err());
+        assert!(NgramTokenizerWithChars::new(5, 3, false, TokenCharsConfig::default()).is_err());
+    }
+
+    #[test]
+    fn test_multiple_segments_with_custom_chars() {
+        // Test a complex scenario with multiple character types
+        let mut token_chars = HashSet::new();
+        token_chars.insert(TokenCharType::Letter);
+        token_chars.insert(TokenCharType::Digit);
+        token_chars.insert(TokenCharType::Custom);
+
+        let config = TokenCharsConfig {
+            token_chars,
+            custom_token_chars: "@.".to_string(),
+        };
+
+        let mut tokenizer = NgramTokenizerWithChars::new(2, 3, false, config).unwrap();
+        let tokens = collect_tokens(tokenizer.token_stream("user@example.com"));
+
+        // Should keep @ and . as part of tokens
+        assert!(tokens.iter().any(|t| t.text == "r@e"));
+        assert!(tokens.iter().any(|t| t.text == "e.c"));
+        assert!(tokens.iter().any(|t| t.text.contains('@')));
+        assert!(tokens.iter().any(|t| t.text.contains('.')));
+    }
+}
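For orientation, a usage sketch of the new tokenizer. Scope is assumed: `mod.rs` above re-exports `NgramTokenizerWithChars`, while `TokenCharsConfig` is only reachable through the module itself, so treat both names as in scope (e.g. code living next to the module):

```rust
use tantivy::tokenizer::{TokenStream, Tokenizer};

// Sketch only; assumes NgramTokenizerWithChars and TokenCharsConfig are in scope.
fn demo() -> Result<(), tantivy::TantivyError> {
    // Keep letters and digits; '-' becomes a segment boundary.
    let config = TokenCharsConfig::default().with_letters().with_digits();
    let mut tokenizer = NgramTokenizerWithChars::new(2, 3, false, config)?;

    let mut stream = tokenizer.token_stream("ab-cd");
    while stream.advance() {
        let t = stream.token();
        // Prints "ab" at bytes 0..2 (segment 0), then "cd" at bytes 3..5 (segment 1);
        // offsets are byte offsets into the original text, position is the segment index.
        println!("{:?} bytes {}..{} segment {}", t.text, t.offset_from, t.offset_to, t.position);
    }
    Ok(())
}
```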
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs
index b508763b27..681ef66e86 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs
@@ -1,4 +1,5 @@
 use crate::error::Result;
+use core::slice;
 use std::collections::HashSet;
 use std::ffi::CStr;
 use std::ffi::{c_char, c_void};
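One closing note on the tokenizer above: `prefix_only = true` emits only n-grams anchored at the start of each segment (edge n-grams), the usual autocomplete setup, as `test_prefix_only_mode` exercises. A sketch under the same scope assumptions as before:

```rust
use tantivy::tokenizer::{TokenStream, Tokenizer};

// Sketch only; assumes NgramTokenizerWithChars and TokenCharsConfig are in scope.
fn demo_prefix_only() -> Result<(), tantivy::TantivyError> {
    let config = TokenCharsConfig::with_all();
    let mut tokenizer = NgramTokenizerWithChars::new(2, 5, true, config)?;

    let mut texts = Vec::new();
    let mut stream = tokenizer.token_stream("search");
    while stream.advance() {
        texts.push(stream.token().text.clone());
    }
    // Only prefixes of lengths 2..=5 are produced.
    assert_eq!(texts, ["se", "sea", "sear", "searc"]);
    Ok(())
}
```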