/*
 *
 * IMPORTANT NOTICE:
 * This file has been copied from Quickwit, an open source project, and is subject to the terms
 * and conditions of the GNU Affero General Public License (AGPL) version 3.0.
 * Please review the full licensing details at .
 * By using this file, you agree to comply with the AGPL v3.0 terms.
 *
 */

use lindera::dictionary::DictionaryKind;
use lindera::mode::Mode;
use lindera::token::Token as LinderaToken;
use lindera::tokenizer::Tokenizer as LinderaTokenizer;
use once_cell::sync::Lazy;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

/// Chinese tokenizer backed by Lindera's CC-CEDICT dictionary, loaded lazily on first use.
static CMN_TOKENIZER: Lazy<LinderaTokenizer> = Lazy::new(|| {
    let dictionary = lindera::dictionary::load_dictionary_from_kind(DictionaryKind::CcCedict)
        .expect("Lindera `CcCedict` dictionary must be present");
    LinderaTokenizer::new(lindera::segmenter::Segmenter::new(
        Mode::Normal,
        dictionary,
        None,
    ))
});

/// Japanese tokenizer backed by Lindera's IPADIC dictionary, loaded lazily on first use.
static JPN_TOKENIZER: Lazy<LinderaTokenizer> = Lazy::new(|| {
    let dictionary = lindera::dictionary::load_dictionary_from_kind(DictionaryKind::IPADIC)
        .expect("Lindera `IPADIC` dictionary must be present");
    LinderaTokenizer::new(lindera::segmenter::Segmenter::new(
        Mode::Normal,
        dictionary,
        None,
    ))
});

/// Korean tokenizer backed by Lindera's KoDic dictionary, loaded lazily on first use.
static KOR_TOKENIZER: Lazy<LinderaTokenizer> = Lazy::new(|| {
    let dictionary = lindera::dictionary::load_dictionary_from_kind(DictionaryKind::KoDic)
        .expect("Lindera `KoDic` dictionary must be present");
    LinderaTokenizer::new(lindera::segmenter::Segmenter::new(
        Mode::Normal,
        dictionary,
        None,
    ))
});

#[derive(Clone, Default)]
pub struct LinderaChineseTokenizer {
    token: Token,
}

#[derive(Clone, Default)]
pub struct LinderaJapaneseTokenizer {
    token: Token,
}

#[derive(Clone, Default)]
pub struct LinderaKoreanTokenizer {
    token: Token,
}

impl Tokenizer for LinderaChineseTokenizer {
    type TokenStream<'a> = MultiLanguageTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        if text.trim().is_empty() {
            return MultiLanguageTokenStream::Empty;
        }
        let lindera_token_stream = LinderaTokenStream {
            tokens: CMN_TOKENIZER
                .tokenize(text)
                .expect("Lindera Chinese tokenizer failed"),
            token: &mut self.token,
        };
        MultiLanguageTokenStream::Lindera(lindera_token_stream)
    }
}

impl Tokenizer for LinderaJapaneseTokenizer {
    type TokenStream<'a> = MultiLanguageTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        if text.trim().is_empty() {
            return MultiLanguageTokenStream::Empty;
        }
        let lindera_token_stream = LinderaTokenStream {
            tokens: JPN_TOKENIZER
                .tokenize(text)
                .expect("Lindera Japanese tokenizer failed"),
            token: &mut self.token,
        };
        MultiLanguageTokenStream::Lindera(lindera_token_stream)
    }
}

impl Tokenizer for LinderaKoreanTokenizer {
    type TokenStream<'a> = MultiLanguageTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        if text.trim().is_empty() {
            return MultiLanguageTokenStream::Empty;
        }
        let lindera_token_stream = LinderaTokenStream {
            tokens: KOR_TOKENIZER
                .tokenize(text)
                .expect("Lindera Korean tokenizer failed"),
            token: &mut self.token,
        };
        MultiLanguageTokenStream::Lindera(lindera_token_stream)
    }
}

/// Token stream that is either empty (blank input) or backed by Lindera tokens.
pub enum MultiLanguageTokenStream<'a> {
    Empty,
    Lindera(LinderaTokenStream<'a>),
}

pub struct LinderaTokenStream<'a> {
    pub tokens: Vec<LinderaToken<'a>>,
    pub token: &'a mut Token,
}

impl TokenStream for MultiLanguageTokenStream<'_> {
    fn advance(&mut self) -> bool {
        match self {
            MultiLanguageTokenStream::Empty => false,
            MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.advance(),
        }
    }

    fn token(&self) -> &Token {
        match self {
            MultiLanguageTokenStream::Empty => {
                panic!("Cannot call token() on an empty token stream.")
            }
            MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token(),
        }
    }

    fn token_mut(&mut self) -> &mut Token {
        match self {
            MultiLanguageTokenStream::Empty => {
                panic!("Cannot call token_mut() on an empty token stream.")
            }
            MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token_mut(),
        }
    }
}

impl TokenStream for LinderaTokenStream<'_> {
    fn advance(&mut self) -> bool {
        if self.tokens.is_empty() {
            return false;
        }
        // Pop the next Lindera token and copy its text, byte offsets and position
        // into the reusable Tantivy token.
        let token = self.tokens.remove(0);
        self.token.text = token.text.to_string();
        self.token.offset_from = token.byte_start;
        self.token.offset_to = token.byte_end;
        self.token.position = token.position;
        self.token.position_length = token.position_length;

        true
    }

    fn token(&self) -> &Token {
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        self.token
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use rstest::*;
    use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

    fn test_helper<T: Tokenizer>(tokenizer: &mut T, text: &str) -> Vec<Token> {
        let mut token_stream = tokenizer.token_stream(text);
        let mut tokens: Vec<Token> = vec![];
        while token_stream.advance() {
            tokens.push(token_stream.token().clone());
        }
        tokens
    }

    #[rstest]
    fn test_lindera_chinese_tokenizer() {
        let mut tokenizer = LinderaChineseTokenizer::default();
        let tokens = test_helper(
            &mut tokenizer,
            "地址1,包含無效的字元 (包括符號與不標準的asci阿爾發字元",
        );
        assert_eq!(tokens.len(), 19);
        {
            let token = &tokens[0];
            assert_eq!(token.text, "地址");
            assert_eq!(token.offset_from, 0);
            assert_eq!(token.offset_to, 6);
            assert_eq!(token.position, 0);
            assert_eq!(token.position_length, 1);
        }
    }

    #[rstest]
    fn test_japanese_tokenizer() {
        let mut tokenizer = LinderaJapaneseTokenizer::default();
        {
            let tokens = test_helper(&mut tokenizer, "すもももももももものうち");
            assert_eq!(tokens.len(), 7);
            {
                let token = &tokens[0];
                assert_eq!(token.text, "すもも");
                assert_eq!(token.offset_from, 0);
                assert_eq!(token.offset_to, 9);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
            }
        }
    }

    #[rstest]
    fn test_korean_tokenizer() {
        let mut tokenizer = LinderaKoreanTokenizer::default();
        {
            let tokens = test_helper(&mut tokenizer, "일본입니다. 매우 멋진 단어입니다.");
            assert_eq!(tokens.len(), 11);
            {
                let token = &tokens[0];
                assert_eq!(token.text, "일본");
                assert_eq!(token.offset_from, 0);
                assert_eq!(token.offset_to, 6);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
            }
        }
    }

    #[rstest]
    fn test_lindera_chinese_tokenizer_with_empty_string() {
        let mut tokenizer = LinderaChineseTokenizer::default();
        {
            let tokens = test_helper(&mut tokenizer, "");
            assert_eq!(tokens.len(), 0);
        }
        {
            let tokens = test_helper(&mut tokenizer, " ");
            assert_eq!(tokens.len(), 0);
        }
    }

    #[rstest]
    fn test_japanese_tokenizer_with_empty_string() {
        let mut tokenizer = LinderaJapaneseTokenizer::default();
        {
            let tokens = test_helper(&mut tokenizer, "");
            assert_eq!(tokens.len(), 0);
        }
        {
            let tokens = test_helper(&mut tokenizer, " ");
            assert_eq!(tokens.len(), 0);
        }
    }

    #[rstest]
    fn test_korean_tokenizer_with_empty_string() {
        let mut tokenizer = LinderaKoreanTokenizer::default();
        {
            let tokens = test_helper(&mut tokenizer, "");
            assert_eq!(tokens.len(), 0);
        }
        {
            let tokens = test_helper(&mut tokenizer, " ");
            assert_eq!(tokens.len(), 0);
        }
    }
}