/*
*
* IMPORTANT NOTICE:
* This file has been copied from Quickwit, an open source project, and is subject to the terms
* and conditions of the GNU Affero General Public License (AGPL) version 3.0.
* Please review the full licensing details at .
* By using this file, you agree to comply with the AGPL v3.0 terms.
*
*/
use lindera::dictionary::DictionaryKind;
use lindera::mode::Mode;
use lindera::token::Token as LinderaToken;
use lindera::tokenizer::Tokenizer as LinderaTokenizer;
use once_cell::sync::Lazy;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
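
/// Chinese tokenizer backed by Lindera's CC-CEDICT dictionary, built once on
/// first use and shared for the lifetime of the process.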
static CMN_TOKENIZER: Lazy<LinderaTokenizer> = Lazy::new(|| {
    let dictionary = lindera::dictionary::load_dictionary_from_kind(DictionaryKind::CcCedict)
        .expect("Lindera `CcCedict` dictionary must be present");
    LinderaTokenizer::new(lindera::segmenter::Segmenter::new(
        Mode::Normal,
        dictionary,
        None,
    ))
});
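
/// Japanese tokenizer backed by Lindera's IPADIC dictionary, built once on
/// first use and shared for the lifetime of the process.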
static JPN_TOKENIZER: Lazy<LinderaTokenizer> = Lazy::new(|| {
    let dictionary = lindera::dictionary::load_dictionary_from_kind(DictionaryKind::IPADIC)
        .expect("Lindera `IPADIC` dictionary must be present");
    LinderaTokenizer::new(lindera::segmenter::Segmenter::new(
        Mode::Normal,
        dictionary,
        None,
    ))
});
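
/// Korean tokenizer backed by Lindera's KoDic dictionary, built once on
/// first use and shared for the lifetime of the process.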
static KOR_TOKENIZER: Lazy<LinderaTokenizer> = Lazy::new(|| {
    let dictionary = lindera::dictionary::load_dictionary_from_kind(DictionaryKind::KoDic)
        .expect("Lindera `KoDic` dictionary must be present");
    LinderaTokenizer::new(lindera::segmenter::Segmenter::new(
        Mode::Normal,
        dictionary,
        None,
    ))
});
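
/// Tantivy [`Tokenizer`] for Chinese text. The `token` field is a scratch
/// buffer that the emitted token stream reuses for every token.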
#[derive(Clone, Default)]
pub struct LinderaChineseTokenizer {
    token: Token,
}
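
/// Tantivy [`Tokenizer`] for Japanese text. The `token` field is a scratch
/// buffer that the emitted token stream reuses for every token.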
#[derive(Clone, Default)]
pub struct LinderaJapaneseTokenizer {
    token: Token,
}
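
/// Tantivy [`Tokenizer`] for Korean text. The `token` field is a scratch
/// buffer that the emitted token stream reuses for every token.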
#[derive(Clone, Default)]
pub struct LinderaKoreanTokenizer {
    token: Token,
}

impl Tokenizer for LinderaChineseTokenizer {
    type TokenStream<'a> = MultiLanguageTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        if text.trim().is_empty() {
            return MultiLanguageTokenStream::Empty;
        }
        let lindera_token_stream = LinderaTokenStream {
            tokens: CMN_TOKENIZER
                .tokenize(text)
                .expect("Lindera Chinese tokenizer failed"),
            token: &mut self.token,
        };
        MultiLanguageTokenStream::Lindera(lindera_token_stream)
    }
}

impl Tokenizer for LinderaJapaneseTokenizer {
    type TokenStream<'a> = MultiLanguageTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        if text.trim().is_empty() {
            return MultiLanguageTokenStream::Empty;
        }
        let lindera_token_stream = LinderaTokenStream {
            tokens: JPN_TOKENIZER
                .tokenize(text)
                .expect("Lindera Japanese tokenizer failed"),
            token: &mut self.token,
        };
        MultiLanguageTokenStream::Lindera(lindera_token_stream)
    }
}

impl Tokenizer for LinderaKoreanTokenizer {
    type TokenStream<'a> = MultiLanguageTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        if text.trim().is_empty() {
            return MultiLanguageTokenStream::Empty;
        }
        let lindera_token_stream = LinderaTokenStream {
            tokens: KOR_TOKENIZER
                .tokenize(text)
                .expect("Lindera Korean tokenizer failed"),
            token: &mut self.token,
        };
        MultiLanguageTokenStream::Lindera(lindera_token_stream)
    }
}
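
/// Token stream produced by the tokenizers above: either empty (for blank or
/// whitespace-only input) or a stream of Lindera tokens.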
pub enum MultiLanguageTokenStream<'a> {
    Empty,
    Lindera(LinderaTokenStream<'a>),
}
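
/// Streams tokens produced by Lindera, copying each one into the shared
/// tantivy [`Token`] scratch buffer as the stream advances.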
pub struct LinderaTokenStream<'a> {
    pub tokens: Vec<LinderaToken<'a>>,
    pub token: &'a mut Token,
}

impl TokenStream for MultiLanguageTokenStream<'_> {
    fn advance(&mut self) -> bool {
        match self {
            MultiLanguageTokenStream::Empty => false,
            MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.advance(),
        }
    }

    fn token(&self) -> &Token {
        match self {
            MultiLanguageTokenStream::Empty => {
                panic!("Cannot call token() on an empty token stream.")
            }
            MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token(),
        }
    }

    fn token_mut(&mut self) -> &mut Token {
        match self {
            MultiLanguageTokenStream::Empty => {
                panic!("Cannot call token_mut() on an empty token stream.")
            }
            MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token_mut(),
        }
    }
}

impl TokenStream for LinderaTokenStream<'_> {
    fn advance(&mut self) -> bool {
        if self.tokens.is_empty() {
            return false;
        }
        let token = self.tokens.remove(0);
        self.token.text = token.text.to_string();
        self.token.offset_from = token.byte_start;
        self.token.offset_to = token.byte_end;
        self.token.position = token.position;
        self.token.position_length = token.position_length;
        true
    }

    fn token(&self) -> &Token {
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        self.token
    }
}
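
// Illustrative sketch (not part of the original file): how these tokenizers
// could be registered with a tantivy index so that text fields can reference
// them by name. The function name and the tokenizer names "lindera_zh",
// "lindera_ja", and "lindera_ko" are assumptions, not identifiers defined
// elsewhere in this crate.
#[allow(dead_code)]
fn register_lindera_tokenizers(index: &tantivy::Index) {
    // `TokenizerManager::register` accepts anything convertible into a
    // `TextAnalyzer`, which in recent tantivy versions includes plain
    // `Tokenizer` implementations such as these.
    index
        .tokenizers()
        .register("lindera_zh", LinderaChineseTokenizer::default());
    index
        .tokenizers()
        .register("lindera_ja", LinderaJapaneseTokenizer::default());
    index
        .tokenizers()
        .register("lindera_ko", LinderaKoreanTokenizer::default());
}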

#[cfg(test)]
mod tests {
    use super::*;
    use rstest::*;
    use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

    fn test_helper<T: Tokenizer>(tokenizer: &mut T, text: &str) -> Vec<Token> {
        let mut token_stream = tokenizer.token_stream(text);
        let mut tokens: Vec<Token> = vec![];
        while token_stream.advance() {
            tokens.push(token_stream.token().clone());
        }
        tokens
    }

    #[rstest]
    fn test_lindera_chinese_tokenizer() {
        let mut tokenizer = LinderaChineseTokenizer::default();
        let tokens = test_helper(
            &mut tokenizer,
            "地址1,包含無效的字元 (包括符號與不標準的asci阿爾發字元",
        );
        assert_eq!(tokens.len(), 19);
        {
            let token = &tokens[0];
            assert_eq!(token.text, "地址");
            assert_eq!(token.offset_from, 0);
            assert_eq!(token.offset_to, 6);
            assert_eq!(token.position, 0);
            assert_eq!(token.position_length, 1);
        }
    }

    #[rstest]
    fn test_japanese_tokenizer() {
        let mut tokenizer = LinderaJapaneseTokenizer::default();
        {
            let tokens = test_helper(&mut tokenizer, "すもももももももものうち");
            assert_eq!(tokens.len(), 7);
            {
                let token = &tokens[0];
                assert_eq!(token.text, "すもも");
                assert_eq!(token.offset_from, 0);
                assert_eq!(token.offset_to, 9);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
            }
        }
    }

    #[rstest]
    fn test_korean_tokenizer() {
        let mut tokenizer = LinderaKoreanTokenizer::default();
        {
            let tokens = test_helper(&mut tokenizer, "일본입니다. 매우 멋진 단어입니다.");
            assert_eq!(tokens.len(), 11);
            {
                let token = &tokens[0];
                assert_eq!(token.text, "일본");
                assert_eq!(token.offset_from, 0);
                assert_eq!(token.offset_to, 6);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
            }
        }
    }

    #[rstest]
    fn test_lindera_chinese_tokenizer_with_empty_string() {
        let mut tokenizer = LinderaChineseTokenizer::default();
        {
            let tokens = test_helper(&mut tokenizer, "");
            assert_eq!(tokens.len(), 0);
        }
        {
            let tokens = test_helper(&mut tokenizer, " ");
            assert_eq!(tokens.len(), 0);
        }
    }

    #[rstest]
    fn test_japanese_tokenizer_with_empty_string() {
        let mut tokenizer = LinderaJapaneseTokenizer::default();
        {
            let tokens = test_helper(&mut tokenizer, "");
            assert_eq!(tokens.len(), 0);
        }
        {
            let tokens = test_helper(&mut tokenizer, " ");
            assert_eq!(tokens.len(), 0);
        }
    }

    #[rstest]
    fn test_korean_tokenizer_with_empty_string() {
        let mut tokenizer = LinderaKoreanTokenizer::default();
        {
            let tokens = test_helper(&mut tokenizer, "");
            assert_eq!(tokens.len(), 0);
        }
        {
            let tokens = test_helper(&mut tokenizer, " ");
            assert_eq!(tokens.len(), 0);
        }
    }
}