// Copyright (c) 2023-2026 ParadeDB, Inc.
//
// This file is part of ParadeDB - Postgres for Search and Analytics
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
#![allow(deprecated)]
use std::fmt::Write;
#[cfg(feature = "icu")]
use crate::icu::ICUTokenizer;
use crate::ngram::NgramTokenizer;
use crate::{
cjk::ChineseTokenizer,
code::CodeTokenizer,
jieba::JiebaTokenizer,
lindera::{LinderaChineseTokenizer, LinderaJapaneseTokenizer, LinderaKoreanTokenizer},
token_length::TokenLengthFilter,
token_trim::TokenTrimFilter,
unicode_words::UnicodeWordsTokenizer,
};
use crate::chinese_convert::{ChineseConvertTokenizer, ConvertMode};
use anyhow::Result;
use once_cell::sync::Lazy;
use serde::de::{self, Deserializer};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use strum::AsRefStr;
use tantivy::tokenizer::{
AlphaNumOnlyFilter, AsciiFoldingFilter, Language, LowerCaser, RawTokenizer, RegexTokenizer,
SimpleTokenizer, Stemmer, StopWordFilter, TextAnalyzer, WhitespaceTokenizer,
};
#[derive(Serialize, Deserialize, Default, Clone, Debug, PartialEq, Eq)]
pub struct SearchTokenizerFilters {
    pub remove_short: Option<usize>,
    pub remove_long: Option<usize>,
    pub lowercase: Option<bool>,
    pub stemmer: Option<Language>,
    pub stopwords_language: Option<Language>,
    pub stopwords: Option<Vec<String>>,
    pub alpha_num_only: Option<bool>,
    pub ascii_folding: Option<bool>,
    pub trim: Option<bool>,
    pub normalizer: Option<SearchNormalizer>,
}
impl SearchTokenizerFilters {
    /// Returns a [`SearchTokenizerFilters`] instance that does not filter or otherwise
    /// mutate tokens.
    ///
    /// This should be used for declaring the "key field" in an index. It can also be used
    /// for other text fields that should not be tokenized.
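    ///
    /// A minimal sketch of the resulting configuration (ignored doctest, shown for
    /// illustration only):
    ///
    /// ```ignore
    /// let filters = SearchTokenizerFilters::keyword();
    /// assert_eq!(filters.lowercase, Some(false));
    /// assert_eq!(filters.normalizer, Some(SearchNormalizer::Raw));
    /// ```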
pub const fn keyword() -> &'static Self {
&SearchTokenizerFilters {
remove_short: None,
remove_long: None,
lowercase: Some(false),
stemmer: None,
stopwords_language: None,
stopwords: None,
ascii_folding: None,
alpha_num_only: None,
trim: None,
normalizer: Some(SearchNormalizer::Raw),
}
}
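    /// Like [`SearchTokenizerFilters::keyword`], but preserves the legacy
    /// `remove_long: Some(usize::MAX)` setting, presumably so that tokenizer names
    /// derived from these filters remain stable for existing indexes.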
pub const fn keyword_deprecated() -> &'static Self {
&SearchTokenizerFilters {
remove_short: None,
remove_long: Some(usize::MAX),
lowercase: Some(false),
stemmer: None,
stopwords_language: None,
stopwords: None,
ascii_folding: None,
alpha_num_only: None,
trim: None,
normalizer: Some(SearchNormalizer::Raw),
}
}
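    /// Extracts filter settings from a tokenizer's JSON configuration object. Absent
    /// keys are left as `None`; unrecognized keys are ignored.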
    fn from_json_value(value: &serde_json::Value) -> Result<Self> {
let mut filters = SearchTokenizerFilters::default();
if let Some(remove_long) = value.get("remove_long") {
filters.remove_long = Some(remove_long.as_u64().ok_or_else(|| {
anyhow::anyhow!(
"a 'remove_long' value passed to the pg_search tokenizer configuration \
must be of type u64, found: {remove_long:#?}"
)
})? as usize);
}
if let Some(remove_short) = value.get("remove_short") {
filters.remove_short = Some(remove_short.as_u64().ok_or_else(|| {
anyhow::anyhow!(
"a 'remove_short' value passed to the pg_search tokenizer configuration \
must be of type u64, found: {remove_short:#?}"
)
})? as usize);
}
if let Some(lowercase) = value.get("lowercase") {
filters.lowercase = Some(lowercase.as_bool().ok_or_else(|| {
anyhow::anyhow!(
"a 'lowercase' value passed to the pg_search tokenizer configuration \
must be of type bool, found: {lowercase:#?}"
)
})?);
};
if let Some(stemmer) = value.get("stemmer") {
filters.stemmer = Some(serde_json::from_value(stemmer.clone()).map_err(|_| {
anyhow::anyhow!("stemmer tokenizer requires a valid 'stemmer' field")
})?);
}
if let Some(stopwords_language) = value.get("stopwords_language") {
filters.stopwords_language = Some(
serde_json::from_value(stopwords_language.clone()).map_err(|e| {
anyhow::anyhow!(
"stopwords_language tokenizer requires a valid 'stopwords_language' field: {e}"
)
})?,
);
}
if let Some(stopwords) = value.get("stopwords") {
filters.stopwords = Some(serde_json::from_value(stopwords.clone()).map_err(|_| {
anyhow::anyhow!("stopwords tokenizer requires a valid 'stopwords' field")
})?);
}
        if let Some(alpha_num_only) = value.get("alpha_num_only") {
            filters.alpha_num_only = Some(alpha_num_only.as_bool().ok_or_else(|| {
                anyhow::anyhow!(
                    "an 'alpha_num_only' value passed to the pg_search tokenizer configuration \
                    must be of type bool, found: {alpha_num_only:#?}"
                )
            })?);
        }
        if let Some(ascii_folding) = value.get("ascii_folding") {
            filters.ascii_folding = Some(ascii_folding.as_bool().ok_or_else(|| {
                anyhow::anyhow!(
                    "an 'ascii_folding' value passed to the pg_search tokenizer configuration \
                    must be of type bool, found: {ascii_folding:#?}"
                )
            })?);
        }
if let Some(trim) = value.get("trim") {
filters.trim = Some(trim.as_bool().ok_or_else(|| {
anyhow::anyhow!(
"a 'trim' value passed to the pg_search tokenizer configuration \
must be of type bool, found: {trim:#?}"
)
})?);
}
Ok(filters)
}
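    /// Renders the configured filters as a bracketed, comma-separated suffix used to
    /// uniquify tokenizer names, e.g. `[remove_long=255,lowercase=false]`. Returns an
    /// empty string when no filters are set.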
fn name_suffix(&self) -> String {
let mut buffer = String::new();
let mut is_empty = true;
fn sep(is_empty: bool) -> &'static str {
if is_empty {
""
} else {
","
}
}
if let Some(value) = self.remove_short {
write!(buffer, "{}remove_short={value}", sep(is_empty))
.expect("Writing to String buffer should never fail");
is_empty = false;
}
if let Some(value) = self.remove_long {
write!(buffer, "{}remove_long={value}", sep(is_empty))
.expect("Writing to String buffer should never fail");
is_empty = false;
}
if let Some(value) = self.lowercase {
write!(buffer, "{}lowercase={value}", sep(is_empty))
.expect("Writing to String buffer should never fail");
is_empty = false;
}
if let Some(value) = self.stemmer {
write!(buffer, "{}stemmer={value:?}", sep(is_empty)).unwrap();
is_empty = false;
}
if let Some(value) = self.stopwords_language.as_ref() {
write!(buffer, "{}stopwords_language={value:?}", sep(is_empty)).unwrap();
is_empty = false;
}
if let Some(value) = self.stopwords.as_ref() {
write!(buffer, "{}stopwords={value:?}", sep(is_empty)).unwrap();
is_empty = false;
}
if let Some(value) = self.alpha_num_only {
write!(buffer, "{}alpha_num_only={value}", sep(is_empty)).unwrap();
is_empty = false;
}
if let Some(value) = self.ascii_folding {
write!(buffer, "{}ascii_folding={value}", sep(is_empty)).unwrap();
is_empty = false;
}
if is_empty {
"".into()
} else {
format!("[{buffer}]")
}
}
    fn token_length_filter(&self) -> Option<TokenLengthFilter> {
match (self.remove_short, self.remove_long) {
(None, None) => None,
(remove_short, remove_long) => Some(TokenLengthFilter::new(remove_short, remove_long)),
}
}
    fn lower_caser(&self) -> Option<LowerCaser> {
match self.lowercase {
Some(false) => None, // Only disable if explicitly requested.
_ => Some(LowerCaser),
}
}
    fn stemmer(&self) -> Option<Stemmer> {
self.stemmer.map(Stemmer::new)
}
    fn stopwords_language(&self) -> Option<StopWordFilter> {
match self.stopwords_language {
Some(language) => StopWordFilter::new(language),
None => None,
}
}
    fn stopwords(&self) -> Option<StopWordFilter> {
self.stopwords
.as_ref()
.map(|stopwords| StopWordFilter::remove(stopwords.clone()))
}
    fn alpha_num_only(&self) -> Option<AlphaNumOnlyFilter> {
match self.alpha_num_only {
Some(true) => Some(AlphaNumOnlyFilter), // Only enable if explicitly requested.
_ => None,
}
}
    fn ascii_folding(&self) -> Option<AsciiFoldingFilter> {
match self.ascii_folding {
Some(true) => Some(AsciiFoldingFilter), // Only enable if explicitly requested.
_ => None,
}
}
    fn trim_filter(&self) -> Option<TokenTrimFilter> {
match self.trim {
Some(true) => Some(TokenTrimFilter::new()), // Only enable if explicitly requested.
_ => None,
}
}
    fn normalizer(&self) -> Option<SearchNormalizer> {
self.normalizer
}
}
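// Chains the configured filters onto a base tokenizer to build a `TextAnalyzer`. Each
// filter accessor above returns an `Option`, and a `None` filter is a no-op, so the
// `.filter(...)` sequence below defines the canonical ordering of the filter chain for
// every tokenizer.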
macro_rules! add_filters {
($tokenizer:expr, $filters:expr $(, $extra_filter:expr )* $(,)?) => {{
tantivy::tokenizer::TextAnalyzer::builder($tokenizer)
.filter($filters.token_length_filter())
.filter($filters.trim_filter())
.filter($filters.lower_caser())
.filter($filters.stemmer())
.filter($filters.stopwords_language())
.filter($filters.stopwords())
.filter($filters.ascii_folding())
$(
.filter($extra_filter)
)*
.filter($filters.alpha_num_only())
.build()
}};
}
// Serde will pick a SearchTokenizer variant based on the value of the
// "type" key, which needs to match one of the variant names below.
// The "type" field will not be present on the deserialized value.
//
// Ensure that new variants are added to `from_json_value`. We don't use serde_json to ser/de the
// SearchTokenizer, because our bincode serialization format is incompatible
// with the "tagged" format we use in our public API.
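//
// For illustration, a hypothetical configuration such as:
//
//   {"type": "whitespace", "lowercase": true, "remove_long": 255}
//
// deserializes to `SearchTokenizer::WhiteSpace` with `lowercase: Some(true)` and
// `remove_long: Some(255)`.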
#[derive(Serialize, Clone, Debug, PartialEq, Eq, strum_macros::VariantNames, AsRefStr)]
#[strum(serialize_all = "snake_case")]
pub enum SearchTokenizer {
#[strum(serialize = "default")]
Simple(SearchTokenizerFilters),
Keyword,
#[deprecated(
since = "0.19.0",
note = "use the `SearchTokenizer::Keyword` variant instead"
)]
KeywordDeprecated,
#[deprecated(
since = "0.15.17",
note = "use the `SearchTokenizer::Keyword` variant instead"
)]
Raw(SearchTokenizerFilters),
LiteralNormalized(SearchTokenizerFilters),
WhiteSpace(SearchTokenizerFilters),
RegexTokenizer {
pattern: String,
filters: SearchTokenizerFilters,
},
ChineseCompatible(SearchTokenizerFilters),
SourceCode(SearchTokenizerFilters),
Ngram {
min_gram: usize,
max_gram: usize,
prefix_only: bool,
#[serde(default)]
positions: bool,
filters: SearchTokenizerFilters,
},
ChineseLindera(SearchTokenizerFilters),
JapaneseLindera(SearchTokenizerFilters),
KoreanLindera(SearchTokenizerFilters),
#[cfg(feature = "icu")]
#[strum(serialize = "icu")]
ICUTokenizer(SearchTokenizerFilters),
Jieba {
        chinese_convert: Option<ConvertMode>,
filters: SearchTokenizerFilters,
},
Lindera(LinderaLanguage, SearchTokenizerFilters),
UnicodeWordsDeprecated {
remove_emojis: bool,
filters: SearchTokenizerFilters,
},
UnicodeWords {
remove_emojis: bool,
filters: SearchTokenizerFilters,
},
}
#[derive(Default, Serialize, Clone, Debug, PartialEq, Eq, strum_macros::VariantNames, AsRefStr)]
pub enum LinderaLanguage {
#[default]
Unspecified,
Chinese,
Japanese,
Korean,
}
impl Default for SearchTokenizer {
fn default() -> Self {
Self::UnicodeWords {
remove_emojis: false,
filters: SearchTokenizerFilters::default(),
}
}
}
impl SearchTokenizer {
    pub fn from_json_value(value: &serde_json::Value) -> Result<Self> {
// We use the `type` field of a JSON object to distinguish the tokenizer variant.
        // Deserializing in this "tagged enum" fashion is not supported by bincode, which
// we use elsewhere for serialization, so we manually parse the JSON object here.
let tokenizer_type = value["type"]
.as_str()
.ok_or_else(|| anyhow::anyhow!("a 'type' must be passed in pg_search tokenizer configuration, not found in: {value:#?}"))?;
let filters = SearchTokenizerFilters::from_json_value(value)?;
match tokenizer_type {
"default" => Ok(SearchTokenizer::Simple(filters)),
"keyword" => Ok(SearchTokenizer::Keyword),
#[allow(deprecated)]
"raw" => Ok(SearchTokenizer::Raw(filters)),
"literal_normalized" => Ok(SearchTokenizer::LiteralNormalized(filters)),
"whitespace" => Ok(SearchTokenizer::WhiteSpace(filters)),
"regex" => {
let pattern: String =
serde_json::from_value(value["pattern"].clone()).map_err(|_| {
anyhow::anyhow!("regex tokenizer requires a string 'pattern' field")
})?;
Ok(SearchTokenizer::RegexTokenizer { pattern, filters })
}
"chinese_compatible" => Ok(SearchTokenizer::ChineseCompatible(filters)),
"source_code" => Ok(SearchTokenizer::SourceCode(filters)),
"ngram" => {
let min_gram: usize =
serde_json::from_value(value["min_gram"].clone()).map_err(|_| {
anyhow::anyhow!("ngram tokenizer requires an integer 'min_gram' field")
})?;
let max_gram: usize =
serde_json::from_value(value["max_gram"].clone()).map_err(|_| {
anyhow::anyhow!("ngram tokenizer requires an integer 'max_gram' field")
})?;
let prefix_only: bool = serde_json::from_value(value["prefix_only"].clone())
.map_err(|_| {
anyhow::anyhow!("ngram tokenizer requires a boolean 'prefix_only' field")
})?;
let positions: bool = value
.get("positions")
.and_then(|v| v.as_bool())
.unwrap_or(false);
Ok(SearchTokenizer::Ngram {
min_gram,
max_gram,
prefix_only,
positions,
filters,
})
}
"chinese_lindera" => Ok(SearchTokenizer::ChineseLindera(filters)),
"japanese_lindera" => Ok(SearchTokenizer::JapaneseLindera(filters)),
"korean_lindera" => Ok(SearchTokenizer::KoreanLindera(filters)),
#[cfg(feature = "icu")]
"icu" => Ok(SearchTokenizer::ICUTokenizer(filters)),
"jieba" => {
                let chinese_convert: Option<ConvertMode> = if value["chinese_convert"].is_null() {
None
} else {
Some(
serde_json::from_value(value["chinese_convert"].clone()).map_err(|_| {
anyhow::anyhow!(
"jieba tokenizer requires a string 'chinese_convert' field"
)
})?,
)
};
Ok(SearchTokenizer::Jieba {
chinese_convert,
filters,
})
}
"unicode_words" => {
                let remove_emojis: bool = serde_json::from_value(value["remove_emojis"].clone())
                    .map_err(|_| {
                        anyhow::anyhow!(
                            "unicode_words tokenizer requires a boolean 'remove_emojis' field"
                        )
                    })?;
Ok(SearchTokenizer::UnicodeWords {
remove_emojis,
filters,
})
}
_ => Err(anyhow::anyhow!(
"unknown tokenizer type: {}",
tokenizer_type
)),
}
}
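    /// Builds the tantivy [`TextAnalyzer`] for this tokenizer configuration, wiring the
    /// base tokenizer together with its configured filter chain.
    ///
    /// A minimal usage sketch (ignored doctest, for illustration):
    ///
    /// ```ignore
    /// let mut analyzer = SearchTokenizer::default().to_tantivy_tokenizer().unwrap();
    /// let mut stream = analyzer.token_stream("Hello World");
    /// ```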
    pub fn to_tantivy_tokenizer(&self) -> Option<TextAnalyzer> {
let analyzer = match self {
SearchTokenizer::Simple(filters) => {
add_filters!(SimpleTokenizer::default(), filters)
}
// the keyword tokenizer is a special case that does not have filters
SearchTokenizer::Keyword => TextAnalyzer::builder(RawTokenizer::default()).build(),
#[allow(deprecated)]
SearchTokenizer::KeywordDeprecated => {
TextAnalyzer::builder(RawTokenizer::default()).build()
}
SearchTokenizer::LiteralNormalized(filters) => {
add_filters!(RawTokenizer::default(), filters)
}
SearchTokenizer::WhiteSpace(filters) => {
add_filters!(WhitespaceTokenizer::default(), filters)
}
            // this tokenizer is deprecated because it's buggy: `filters.lower_caser()` applies a
            // default LowerCaser, which is the opposite of what a `raw` tokenizer should do.
            //
            // the `keyword` tokenizer was introduced to do the correct thing, which is to leave
            // the input tokens unmodified.
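            //
            // for example, with default filters `raw` turns the input "Hello World" into the
            // single token "hello world", while `keyword` leaves it as "Hello World".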
#[allow(deprecated)]
SearchTokenizer::Raw(filters) => {
add_filters!(RawTokenizer::default(), filters)
}
SearchTokenizer::RegexTokenizer { pattern, filters } => {
add_filters!(RegexTokenizer::new(pattern.as_str()).unwrap(), filters)
}
SearchTokenizer::Ngram {
min_gram,
max_gram,
prefix_only,
positions,
filters,
} => add_filters!(
NgramTokenizer::new(*min_gram, *max_gram, *prefix_only, *positions)
.unwrap_or_else(|e| panic!("{}", e)),
filters
),
SearchTokenizer::ChineseCompatible(filters) => {
add_filters!(ChineseTokenizer, filters)
}
SearchTokenizer::SourceCode(filters) => {
// for backwards compatibility, the source_code tokenizer defaults to ascii_folding
// if it's not explicitly set
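                // e.g. `{"type": "source_code"}` folds "über" to "uber", while
                // `{"type": "source_code", "ascii_folding": false}` leaves it intact.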
                if filters.ascii_folding.is_none() {
add_filters!(CodeTokenizer::default(), filters, AsciiFoldingFilter)
} else {
add_filters!(CodeTokenizer::default(), filters)
}
}
SearchTokenizer::ChineseLindera(filters)
| SearchTokenizer::Lindera(LinderaLanguage::Chinese, filters) => {
add_filters!(LinderaChineseTokenizer::default(), filters)
}
SearchTokenizer::JapaneseLindera(filters)
| SearchTokenizer::Lindera(LinderaLanguage::Japanese, filters) => {
add_filters!(LinderaJapaneseTokenizer::default(), filters)
}
SearchTokenizer::KoreanLindera(filters)
| SearchTokenizer::Lindera(LinderaLanguage::Korean, filters) => {
add_filters!(LinderaKoreanTokenizer::default(), filters)
}
#[cfg(feature = "icu")]
SearchTokenizer::ICUTokenizer(filters) => {
add_filters!(ICUTokenizer, filters)
}
SearchTokenizer::Jieba {
chinese_convert,
filters,
} => {
// If Chinese conversion is configured, perform the conversion before tokenization
if let Some(convert_mode) = chinese_convert {
let base_tokenizer = JiebaTokenizer::new();
let convert_tokenizer =
ChineseConvertTokenizer::new(base_tokenizer, *convert_mode);
add_filters!(convert_tokenizer, filters)
} else {
add_filters!(JiebaTokenizer::new(), filters)
}
}
SearchTokenizer::Lindera(LinderaLanguage::Unspecified, _) => {
panic!("LinderaStyle::Unspecified is not supported")
}
SearchTokenizer::UnicodeWords {
remove_emojis,
filters,
}
| SearchTokenizer::UnicodeWordsDeprecated {
remove_emojis,
filters,
} => {
add_filters!(UnicodeWordsTokenizer::new(*remove_emojis), filters)
}
};
Some(analyzer)
}
fn filters(&self) -> &SearchTokenizerFilters {
match self {
SearchTokenizer::Simple(filters) => filters,
SearchTokenizer::Keyword => SearchTokenizerFilters::keyword(),
#[allow(deprecated)]
SearchTokenizer::KeywordDeprecated => SearchTokenizerFilters::keyword_deprecated(),
#[allow(deprecated)]
SearchTokenizer::Raw(filters) => filters,
SearchTokenizer::LiteralNormalized(filters) => filters,
SearchTokenizer::WhiteSpace(filters) => filters,
SearchTokenizer::RegexTokenizer { filters, .. } => filters,
SearchTokenizer::ChineseCompatible(filters) => filters,
SearchTokenizer::SourceCode(filters) => filters,
SearchTokenizer::Ngram { filters, .. } => filters,
SearchTokenizer::ChineseLindera(filters) => filters,
SearchTokenizer::JapaneseLindera(filters) => filters,
SearchTokenizer::KoreanLindera(filters) => filters,
SearchTokenizer::Lindera(_, filters) => filters,
#[cfg(feature = "icu")]
SearchTokenizer::ICUTokenizer(filters) => filters,
SearchTokenizer::Jieba { filters, .. } => filters,
SearchTokenizer::UnicodeWordsDeprecated { filters, .. } => filters,
SearchTokenizer::UnicodeWords { filters, .. } => filters,
}
}
pub fn normalizer(&self) -> Option {
self.filters().normalizer()
}
}
pub static LANGUAGES: Lazy<HashMap<Language, &'static str>> = Lazy::new(|| {
let mut map = HashMap::new();
map.insert(Language::Arabic, "Arabic");
map.insert(Language::Danish, "Danish");
map.insert(Language::Dutch, "Dutch");
map.insert(Language::English, "English");
map.insert(Language::Finnish, "Finnish");
map.insert(Language::French, "French");
map.insert(Language::German, "German");
map.insert(Language::Greek, "Greek");
map.insert(Language::Hungarian, "Hungarian");
map.insert(Language::Italian, "Italian");
map.insert(Language::Norwegian, "Norwegian");
map.insert(Language::Polish, "Polish");
map.insert(Language::Portuguese, "Portuguese");
map.insert(Language::Romanian, "Romanian");
map.insert(Language::Russian, "Russian");
map.insert(Language::Spanish, "Spanish");
map.insert(Language::Swedish, "Swedish");
map.insert(Language::Tamil, "Tamil");
map.insert(Language::Turkish, "Turkish");
map
});
impl SearchTokenizer {
pub fn name(&self) -> String {
let filters_suffix = self.filters().name_suffix();
match self {
SearchTokenizer::Simple(_filters) => format!("default{filters_suffix}"),
SearchTokenizer::Keyword => format!("keyword{filters_suffix}"),
#[allow(deprecated)]
SearchTokenizer::KeywordDeprecated => format!("keyword{filters_suffix}"),
#[allow(deprecated)]
SearchTokenizer::Raw(_filters) => format!("raw{filters_suffix}"),
SearchTokenizer::LiteralNormalized(_filters) => {
format!("literal_normalized{filters_suffix}")
}
SearchTokenizer::WhiteSpace(_filters) => format!("whitespace{filters_suffix}"),
SearchTokenizer::RegexTokenizer { .. } => format!("regex{filters_suffix}"),
SearchTokenizer::ChineseCompatible(_filters) => {
format!("chinese_compatible{filters_suffix}")
}
SearchTokenizer::SourceCode(_filters) => format!("source_code{filters_suffix}"),
SearchTokenizer::Ngram {
min_gram,
max_gram,
prefix_only,
filters: _,
positions,
} => {
let positions_suffix = if *positions { "_positions:true" } else { "" };
format!(
"ngram_mingram:{min_gram}_maxgram:{max_gram}_prefixonly:{prefix_only}{positions_suffix}{filters_suffix}"
)
}
SearchTokenizer::ChineseLindera(_filters) => format!("chinese_lindera{filters_suffix}"),
SearchTokenizer::JapaneseLindera(_filters) => {
format!("japanese_lindera{filters_suffix}")
}
SearchTokenizer::KoreanLindera(_filters) => format!("korean_lindera{filters_suffix}"),
SearchTokenizer::Lindera(style, _filters) => match style {
LinderaLanguage::Unspecified => {
panic!("LinderaStyle::Unspecified is not supported")
}
LinderaLanguage::Chinese => format!("chinese_lindera{filters_suffix}"),
LinderaLanguage::Japanese => format!("japanese_lindera{filters_suffix}"),
LinderaLanguage::Korean => format!("korean_lindera{filters_suffix}"),
},
#[cfg(feature = "icu")]
SearchTokenizer::ICUTokenizer(_filters) => format!("icu{filters_suffix}"),
SearchTokenizer::Jieba {
chinese_convert,
filters: _,
} => {
if let Some(chinese_convert) = chinese_convert {
format!("jieba{chinese_convert:?}{filters_suffix}")
} else {
format!("jieba{filters_suffix}")
}
}
SearchTokenizer::UnicodeWordsDeprecated {
remove_emojis,
filters: _,
} => format!("remove_emojis:{remove_emojis}{filters_suffix}"),
SearchTokenizer::UnicodeWords {
remove_emojis,
filters: _,
} => format!("unicode_words_removeemojis:{remove_emojis}{filters_suffix}"),
}
}
}
impl<'de> Deserialize<'de> for SearchTokenizer {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let value = serde_json::Value::deserialize(deserializer)?;
SearchTokenizer::from_json_value(&value).map_err(de::Error::custom)
}
}
// Normalizers for fast fields
#[derive(Default, Copy, Clone, Deserialize, Serialize, Debug, PartialEq, Eq)]
pub enum SearchNormalizer {
#[serde(rename = "raw")]
#[default]
Raw,
#[serde(rename = "lowercase")]
Lowercase,
}
impl SearchNormalizer {
pub fn name(&self) -> &str {
match self {
SearchNormalizer::Raw => "raw",
SearchNormalizer::Lowercase => "lowercase",
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use rstest::*;
#[rstest]
fn test_search_tokenizer() {
let tokenizer = SearchTokenizer::Simple(SearchTokenizerFilters::default());
assert_eq!(tokenizer.name(), "default".to_string());
let json = r#"{
"type": "ngram",
"min_gram": 20,
"max_gram": 60,
"prefix_only": true,
"remove_long": 123,
"lowercase": false
}"#;
let tokenizer =
SearchTokenizer::from_json_value(&serde_json::from_str(json).unwrap()).unwrap();
assert_eq!(
tokenizer,
SearchTokenizer::Ngram {
min_gram: 20,
max_gram: 60,
prefix_only: true,
positions: false,
filters: SearchTokenizerFilters {
remove_short: None,
remove_long: Some(123),
lowercase: Some(false),
stemmer: None,
stopwords_language: None,
stopwords: None,
ascii_folding: None,
trim: None,
normalizer: None,
alpha_num_only: None,
}
}
);
}
#[rstest]
fn test_regexizer() {
let json = r#"{
"type": "regex",
"pattern": "a+b*",
"remove_long": 100
}"#;
let tokenizer = SearchTokenizer::RegexTokenizer {
pattern: "a+b*".to_string(),
filters: SearchTokenizerFilters {
remove_short: None,
remove_long: Some(100),
lowercase: None,
stemmer: None,
stopwords_language: None,
stopwords: None,
ascii_folding: None,
trim: None,
normalizer: None,
alpha_num_only: None,
},
};
assert_eq!(
tokenizer,
SearchTokenizer::from_json_value(&serde_json::from_str(json).unwrap()).unwrap()
);
}
#[rstest]
fn test_search_normalizer() {
assert_eq!(SearchNormalizer::Lowercase.name(), "lowercase");
assert_ne!(SearchNormalizer::Raw, SearchNormalizer::Lowercase);
}
#[rstest]
fn test_jieba_tokenizer_with_stopwords() {
use tantivy::tokenizer::TokenStream;
// Test Jieba tokenizer with custom stopwords including spaces and content words
let json = r#"{
"type": "jieba",
"stopwords": [" ", "花朵", "公园"]
}"#;
let tokenizer =
SearchTokenizer::from_json_value(&serde_json::from_str(json).unwrap()).unwrap();
assert_eq!(
tokenizer,
SearchTokenizer::Jieba {
chinese_convert: None,
filters: SearchTokenizerFilters {
remove_short: None,
remove_long: None,
lowercase: None,
stemmer: None,
stopwords_language: None,
stopwords: Some(vec![
" ".to_string(),
"花朵".to_string(),
"公园".to_string()
]),
ascii_folding: None,
trim: None,
normalizer: None,
alpha_num_only: None,
}
}
);
// Test that the tokenizer is created successfully
let mut analyzer = tokenizer.to_tantivy_tokenizer().unwrap();
// Test tokenizing text with spaces and content words that should be filtered out
let text = "我们 昨天 在 公园 里 看到 了 很多 美丽 的 花朵";
let mut token_stream = analyzer.token_stream(text);
let mut tokens = Vec::new();
while token_stream.advance() {
let token = token_stream.token();
tokens.push(token.text.clone());
}
// Verify that custom stopwords are filtered out (spaces, 花朵, 公园)
assert!(!tokens.contains(&" ".to_string()));
assert!(!tokens.contains(&"花朵".to_string()));
assert!(!tokens.contains(&"公园".to_string()));
// Verify that other words are still present
assert!(tokens.contains(&"我们".to_string()));
assert!(tokens.contains(&"昨天".to_string()));
assert!(tokens.contains(&"美丽".to_string()));
}
#[rstest]
fn test_jieba_tokenizer_with_language_stopwords() {
use tantivy::tokenizer::{Language, TokenStream};
// Test Jieba tokenizer with language-based stopwords
let json = r#"{
"type": "jieba",
"stopwords_language": "English"
}"#;
let tokenizer =
SearchTokenizer::from_json_value(&serde_json::from_str(json).unwrap()).unwrap();
assert_eq!(
tokenizer,
SearchTokenizer::Jieba {
chinese_convert: None,
filters: SearchTokenizerFilters {
remove_short: None,
remove_long: None,
lowercase: None,
stemmer: None,
stopwords_language: Some(Language::English),
stopwords: None,
ascii_folding: None,
trim: None,
normalizer: None,
alpha_num_only: None,
}
}
);
// Test that the tokenizer is created successfully
let mut analyzer = tokenizer.to_tantivy_tokenizer().unwrap();
// Test tokenizing mixed Chinese and English text
let text = "我喜欢在 the library 里读书 and learning";
let mut token_stream = analyzer.token_stream(text);
let mut tokens = Vec::new();
while token_stream.advance() {
let token = token_stream.token();
tokens.push(token.text.clone());
}
// Verify that English stopwords "the", "and" are filtered out
assert!(!tokens.contains(&"the".to_string()));
assert!(!tokens.contains(&"and".to_string()));
// Verify that other words are still present
assert!(tokens.contains(&"library".to_string()));
assert!(tokens.contains(&"读书".to_string()));
assert!(tokens.contains(&"learning".to_string()));
}
#[rstest]
fn test_jieba_tokenizer_with_trim_filter() {
use tantivy::tokenizer::TokenStream;
// Test Jieba tokenizer with trim filter
let json = r#"{
"type": "jieba",
"trim": true
}"#;
let tokenizer =
SearchTokenizer::from_json_value(&serde_json::from_str(json).unwrap()).unwrap();
assert_eq!(
tokenizer,
SearchTokenizer::Jieba {
chinese_convert: None,
filters: SearchTokenizerFilters {
remove_short: None,
remove_long: None,
lowercase: None,
stemmer: None,
stopwords_language: None,
stopwords: None,
ascii_folding: None,
trim: Some(true),
normalizer: None,
alpha_num_only: None,
}
}
);
// Test that the tokenizer is created successfully
let mut analyzer = tokenizer.to_tantivy_tokenizer().unwrap();
// Test tokenizing text with spaces (which Jieba may produce as separate tokens)
let text = "富裕 劳动力";
let mut token_stream = analyzer.token_stream(text);
let mut tokens = Vec::new();
while token_stream.advance() {
let token = token_stream.token();
tokens.push(token.text.clone());
}
// Verify that space tokens are filtered out
assert!(!tokens.contains(&" ".to_string()));
assert!(!tokens.iter().any(|t| t.trim().is_empty()));
// Verify that content words are still present
assert!(tokens.contains(&"富裕".to_string()));
assert!(tokens.contains(&"劳动".to_string()) || tokens.contains(&"劳动力".to_string()));
}
#[rstest]
fn test_korean_lindera_tokenizer_with_trim_filter() {
use tantivy::tokenizer::TokenStream;
// Test Korean Lindera tokenizer with trim filter
let json = r#"{
"type": "korean_lindera",
"trim": true
}"#;
let tokenizer =
SearchTokenizer::from_json_value(&serde_json::from_str(json).unwrap()).unwrap();
assert_eq!(
tokenizer,
SearchTokenizer::KoreanLindera(SearchTokenizerFilters {
remove_short: None,
remove_long: None,
lowercase: None,
stemmer: None,
stopwords_language: None,
stopwords: None,
ascii_folding: None,
trim: Some(true),
normalizer: None,
alpha_num_only: None,
})
);
// Test that the tokenizer is created successfully
let mut analyzer = tokenizer.to_tantivy_tokenizer().unwrap();
// Test tokenizing Korean text with spaces
// "아름다운 우리나라" (Beautiful our country)
let text = "아름다운 우리나라";
let mut token_stream = analyzer.token_stream(text);
let mut tokens = Vec::new();
while token_stream.advance() {
let token = token_stream.token();
tokens.push(token.text.clone());
}
// Verify that space tokens are filtered out
assert!(!tokens.contains(&" ".to_string()));
assert!(!tokens.iter().any(|t| t.trim().is_empty()));
// Verify that Korean words are still present
assert!(!tokens.is_empty());
}
#[rstest]
fn test_chinese_lindera_tokenizer_preserves_whitespace() {
use tantivy::tokenizer::TokenStream;
// Test Chinese Lindera tokenizer preserves whitespace by default
// (backward compatible behavior)
let json = r#"{
"type": "chinese_lindera"
}"#;
let tokenizer =
SearchTokenizer::from_json_value(&serde_json::from_str(json).unwrap()).unwrap();
// Test that the tokenizer is created successfully
let mut analyzer = tokenizer.to_tantivy_tokenizer().unwrap();
// Test tokenizing text with spaces
let text = "this is a test";
let mut token_stream = analyzer.token_stream(text);
let mut tokens = Vec::new();
while token_stream.advance() {
let token = token_stream.token();
tokens.push(token.text.clone());
}
// Verify that space tokens are preserved (backward compatible behavior)
assert!(tokens.contains(&" ".to_string()));
// Verify that words are still present
assert!(tokens.contains(&"this".to_string()));
assert!(tokens.contains(&"is".to_string()));
assert!(tokens.contains(&"a".to_string()));
assert!(tokens.contains(&"test".to_string()));
}
#[rstest]
fn test_trim_filter_with_multiple_tokenizers() {
use tantivy::tokenizer::TokenStream;
// Test that trim filter works across different tokenizers
// Test 1: Chinese Lindera tokenizer with trim filter
let json_lindera = r#"{
"type": "chinese_lindera",
"trim": true
}"#;
let tokenizer_lindera =
SearchTokenizer::from_json_value(&serde_json::from_str(json_lindera).unwrap()).unwrap();
let mut analyzer_lindera = tokenizer_lindera.to_tantivy_tokenizer().unwrap();
let text_lindera = "富裕 劳动力";
let mut token_stream_lindera = analyzer_lindera.token_stream(text_lindera);
let mut tokens_lindera = Vec::new();
while token_stream_lindera.advance() {
let token = token_stream_lindera.token();
tokens_lindera.push(token.text.clone());
}
// Verify no whitespace tokens
assert!(!tokens_lindera.contains(&" ".to_string()));
assert!(!tokens_lindera.iter().any(|t| t.trim().is_empty()));
assert!(!tokens_lindera.is_empty());
// Test 2: Chinese Compatible tokenizer with trim filter
let json_chinese = r#"{
"type": "chinese_compatible",
"trim": true
}"#;
let tokenizer_chinese =
SearchTokenizer::from_json_value(&serde_json::from_str(json_chinese).unwrap()).unwrap();
let mut analyzer_chinese = tokenizer_chinese.to_tantivy_tokenizer().unwrap();
let text_chinese = "中文 测试 文本";
let mut token_stream_chinese = analyzer_chinese.token_stream(text_chinese);
let mut tokens_chinese = Vec::new();
while token_stream_chinese.advance() {
let token = token_stream_chinese.token();
tokens_chinese.push(token.text.clone());
}
// Verify no whitespace tokens
assert!(!tokens_chinese.contains(&" ".to_string()));
assert!(!tokens_chinese.iter().any(|t| t.trim().is_empty()));
}
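    #[rstest]
    fn test_keyword_tokenizer_preserves_input() {
        use tantivy::tokenizer::TokenStream;
        // A minimal sketch added for illustration: the keyword tokenizer wraps tantivy's
        // RawTokenizer, which emits the entire input as a single, unmodified token.
        let tokenizer = SearchTokenizer::Keyword;
        let mut analyzer = tokenizer.to_tantivy_tokenizer().unwrap();
        let mut token_stream = analyzer.token_stream("Hello World");
        let mut tokens = Vec::new();
        while token_stream.advance() {
            tokens.push(token_stream.token().text.clone());
        }
        // No lowercasing, no splitting: the input comes back verbatim
        assert_eq!(tokens, vec!["Hello World".to_string()]);
    }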
}