// Copyright (c) 2023-2025 ParadeDB, Inc.
//
// This file is part of ParadeDB - Postgres for Search and Analytics
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

use std::fmt::Write;

#[cfg(feature = "icu")]
use crate::icu::ICUTokenizer;
use crate::{
    cjk::ChineseTokenizer,
    code::CodeTokenizer,
    lindera::{LinderaChineseTokenizer, LinderaJapaneseTokenizer, LinderaKoreanTokenizer},
    DEFAULT_REMOVE_TOKEN_LENGTH,
};
use anyhow::Result;
use serde::{Deserialize, Serialize};
use serde_json::json;
use strum::AsRefStr;
use tantivy::tokenizer::{
    AsciiFoldingFilter, Language, LowerCaser, NgramTokenizer, RawTokenizer, RegexTokenizer,
    RemoveLongFilter, SimpleTokenizer, Stemmer, TextAnalyzer, WhitespaceTokenizer,
};

#[derive(Serialize, Deserialize, Default, Clone, Debug, PartialEq, Eq)]
pub struct SearchTokenizerFilters {
    pub remove_long: Option<usize>,
    pub lowercase: Option<bool>,
    pub stemmer: Option<Language>,
}

impl SearchTokenizerFilters {
    /// Returns a [`SearchTokenizerFilters`] instance that effectively does not filter or
    /// otherwise mutate tokens.
    ///
    /// This should be used for declaring the "key field" in an index. It can also be used for
    /// other text fields that do not want tokenization.
    pub fn raw() -> Self {
        SearchTokenizerFilters {
            remove_long: Some(usize::MAX),
            lowercase: Some(false),
            stemmer: None,
        }
    }

    fn from_json_value(value: &serde_json::Value) -> Result<Self> {
        let mut filters = SearchTokenizerFilters::default();

        if let Some(remove_long) = value.get("remove_long") {
            filters.remove_long = Some(remove_long.as_u64().ok_or_else(|| {
                anyhow::anyhow!(
                    "a 'remove_long' value passed to the pg_search tokenizer configuration \
                     must be of type u64, found: {remove_long:#?}"
                )
            })?
                as usize);
        }

        if let Some(lowercase) = value.get("lowercase") {
            filters.lowercase = Some(lowercase.as_bool().ok_or_else(|| {
                anyhow::anyhow!(
                    "a 'lowercase' value passed to the pg_search tokenizer configuration \
                     must be of type bool, found: {lowercase:#?}"
                )
            })?);
        };

        if let Some(stemmer) = value.get("stemmer") {
            filters.stemmer = Some(serde_json::from_value(stemmer.clone()).map_err(|_| {
                anyhow::anyhow!("stemmer tokenizer requires a valid 'stemmer' field")
            })?);
        }

        Ok(filters)
    }

    fn to_json_value(&self, enclosing: &mut serde_json::Value) {
        let enclosing = enclosing.as_object_mut().expect("object value");
        if let Some(value) = self.remove_long {
            let v = serde_json::Value::Number(value.into());
            enclosing.insert("remove_long".to_string(), v);
        }
        if let Some(value) = self.lowercase {
            let v = serde_json::Value::Bool(value);
            enclosing.insert("lowercase".to_string(), v);
        }
    }

    fn name_suffix(&self) -> String {
        let mut buffer = String::new();
        let mut is_empty = true;

        fn sep(is_empty: bool) -> &'static str {
            if is_empty {
                ""
            } else {
                ","
            }
        }

        if let Some(value) = self.remove_long {
            write!(buffer, "{}remove_long={value}", sep(is_empty))
                .expect("Writing to String buffer should never fail");
            is_empty = false;
        }

        if let Some(value) = self.lowercase {
            write!(buffer, "{}lowercase={value}", sep(is_empty))
                .expect("Writing to String buffer should never fail");
            is_empty = false;
        }

        if let Some(value) = self.stemmer {
            write!(buffer, "{}stemmer={value:?}", sep(is_empty)).unwrap();
            is_empty = false;
        }

        if is_empty {
            "".into()
        } else {
            format!("[{buffer}]")
        }
    }

    fn remove_long_filter(&self) -> Option<RemoveLongFilter> {
        let limit = self.remove_long.unwrap_or(DEFAULT_REMOVE_TOKEN_LENGTH);
        Some(RemoveLongFilter::limit(limit))
    }

    fn lower_caser(&self) -> Option<LowerCaser> {
        match self.lowercase {
            Some(false) => None, // Only disable if explicitly requested.
            _ => Some(LowerCaser),
        }
    }

    fn stemmer(&self) -> Option<Stemmer> {
        self.stemmer.map(Stemmer::new)
    }
}

// Serde will pick a SearchTokenizer variant based on the value of the
// "type" key, which needs to match one of the variant names below.
// The "type" field will not be present on the deserialized value.
//
// Ensure that new variants are added to the `to_json_value` and
// `from_json_value` methods. We don't use serde_json to ser/de the
// SearchTokenizer, because our bincode serialization format is incompatible
// with the "tagged" format we use in our public API.
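//
// For example, a configuration such as
//
//   {"type": "ngram", "min_gram": 2, "max_gram": 3, "prefix_only": false, "lowercase": true}
//
// selects the `Ngram` variant, with the remaining keys parsed into `SearchTokenizerFilters`
// (the specific field values here are only illustrative; see the tests at the bottom of this
// file for the shapes actually exercised).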
#[derive(
    Serialize, Deserialize, Clone, Debug, PartialEq, Eq, strum_macros::VariantNames, AsRefStr,
)]
#[strum(serialize_all = "snake_case")]
pub enum SearchTokenizer {
    Default(SearchTokenizerFilters),
    Raw(SearchTokenizerFilters),
    EnStem(SearchTokenizerFilters),
    Stem {
        language: Language,
        filters: SearchTokenizerFilters,
    },
    Lowercase(SearchTokenizerFilters),
    WhiteSpace(SearchTokenizerFilters),
    RegexTokenizer {
        pattern: String,
        filters: SearchTokenizerFilters,
    },
    ChineseCompatible(SearchTokenizerFilters),
    SourceCode(SearchTokenizerFilters),
    Ngram {
        min_gram: usize,
        max_gram: usize,
        prefix_only: bool,
        filters: SearchTokenizerFilters,
    },
    ChineseLindera(SearchTokenizerFilters),
    JapaneseLindera(SearchTokenizerFilters),
    KoreanLindera(SearchTokenizerFilters),
    #[cfg(feature = "icu")]
    #[strum(serialize = "icu")]
    ICUTokenizer(SearchTokenizerFilters),
}

impl Default for SearchTokenizer {
    fn default() -> Self {
        Self::Default(SearchTokenizerFilters::default())
    }
}

impl SearchTokenizer {
    pub fn to_json_value(&self) -> serde_json::Value {
        let mut json = match self {
            SearchTokenizer::Default(_filters) => json!({ "type": "default" }),
            SearchTokenizer::Raw(_filters) => json!({ "type": "raw" }),
            SearchTokenizer::EnStem(_filters) => json!({ "type": "en_stem" }),
            SearchTokenizer::Stem {
                language,
                filters: _,
            } => json!({ "type": "stem", "language": language }),
            SearchTokenizer::Lowercase(_filters) => json!({ "type": "lowercase" }),
            SearchTokenizer::WhiteSpace(_filters) => json!({ "type": "whitespace" }),
            SearchTokenizer::RegexTokenizer {
                pattern,
                filters: _,
            } => {
                json!({ "type": "regex", "pattern": pattern })
            }
            SearchTokenizer::ChineseCompatible(_filters) => json!({ "type": "chinese_compatible" }),
            SearchTokenizer::SourceCode(_filters) => json!({ "type": "source_code" }),
            SearchTokenizer::Ngram {
                min_gram,
                max_gram,
                prefix_only,
                filters: _,
            } => json!({
                "type": "ngram",
                "min_gram": min_gram,
                "max_gram": max_gram,
                "prefix_only": prefix_only,
            }),
            SearchTokenizer::ChineseLindera(_filters) => json!({ "type": "chinese_lindera" }),
            SearchTokenizer::JapaneseLindera(_filters) => json!({ "type": "japanese_lindera" }),
            SearchTokenizer::KoreanLindera(_filters) => json!({ "type": "korean_lindera" }),
            #[cfg(feature = "icu")]
            SearchTokenizer::ICUTokenizer(_filters) => json!({ "type": "icu" }),
        };

        // Serialize filters to the enclosing json object.
        self.filters().to_json_value(&mut json);

        json
    }

    pub fn from_json_value(value: &serde_json::Value) -> Result<Self> {
        // We use the `type` field of a JSON object to distinguish the tokenizer variant.
        // Deserializing in this "tagged enum" fashion is not supported by bincode, which
        // we use elsewhere for serialization, so we manually parse the JSON object here.
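        //
        // As a concrete illustration (mirroring `test_regexizer` below), an input of
        //   {"type": "regex", "pattern": "a+b*", "remove_long": 100}
        // yields `SearchTokenizer::RegexTokenizer` with `remove_long: Some(100)` in its filters.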
        let tokenizer_type = value["type"]
            .as_str()
            .ok_or_else(|| anyhow::anyhow!("a 'type' must be passed in pg_search tokenizer configuration, not found in: {value:#?}"))?;

        let filters = SearchTokenizerFilters::from_json_value(value)?;

        match tokenizer_type {
            "default" => Ok(SearchTokenizer::Default(filters)),
            "raw" => Ok(SearchTokenizer::Raw(filters)),
            "en_stem" => Ok(SearchTokenizer::EnStem(filters)),
            "stem" => {
                let language: Language = serde_json::from_value(value["language"].clone())
                    .map_err(|_| {
                        anyhow::anyhow!("stem tokenizer requires a valid 'language' field")
                    })?;
                Ok(SearchTokenizer::Stem { language, filters })
            }
            "lowercase" => Ok(SearchTokenizer::Lowercase(filters)),
            "whitespace" => Ok(SearchTokenizer::WhiteSpace(filters)),
            "regex" => {
                let pattern: String =
                    serde_json::from_value(value["pattern"].clone()).map_err(|_| {
                        anyhow::anyhow!("regex tokenizer requires a string 'pattern' field")
                    })?;
                Ok(SearchTokenizer::RegexTokenizer { pattern, filters })
            }
            "chinese_compatible" => Ok(SearchTokenizer::ChineseCompatible(filters)),
            "source_code" => Ok(SearchTokenizer::SourceCode(filters)),
            "ngram" => {
                let min_gram: usize =
                    serde_json::from_value(value["min_gram"].clone()).map_err(|_| {
                        anyhow::anyhow!("ngram tokenizer requires an integer 'min_gram' field")
                    })?;
                let max_gram: usize =
                    serde_json::from_value(value["max_gram"].clone()).map_err(|_| {
                        anyhow::anyhow!("ngram tokenizer requires an integer 'max_gram' field")
                    })?;
                let prefix_only: bool = serde_json::from_value(value["prefix_only"].clone())
                    .map_err(|_| {
                        anyhow::anyhow!("ngram tokenizer requires a boolean 'prefix_only' field")
                    })?;
                Ok(SearchTokenizer::Ngram {
                    min_gram,
                    max_gram,
                    prefix_only,
                    filters,
                })
            }
            "chinese_lindera" => Ok(SearchTokenizer::ChineseLindera(filters)),
            "japanese_lindera" => Ok(SearchTokenizer::JapaneseLindera(filters)),
            "korean_lindera" => Ok(SearchTokenizer::KoreanLindera(filters)),
            #[cfg(feature = "icu")]
            "icu" => Ok(SearchTokenizer::ICUTokenizer(filters)),
            _ => Err(anyhow::anyhow!(
                "unknown tokenizer type: {}",
                tokenizer_type
            )),
        }
    }

    /// Builds the tantivy [`TextAnalyzer`] for this tokenizer: the variant's base tokenizer
    /// followed by its configured filters.
    pub fn to_tantivy_tokenizer(&self) -> Option<TextAnalyzer> {
        match self {
            SearchTokenizer::Default(filters) => Some(
                TextAnalyzer::builder(SimpleTokenizer::default())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(filters.stemmer())
                    .build(),
            ),
            SearchTokenizer::Raw(filters) => Some(
                TextAnalyzer::builder(RawTokenizer::default())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(filters.stemmer())
                    .build(),
            ),
            // Deprecated, use `raw` with `lowercase` filter instead
            SearchTokenizer::Lowercase(filters) => Some(
                TextAnalyzer::builder(RawTokenizer::default())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(filters.stemmer())
                    .build(),
            ),
            SearchTokenizer::WhiteSpace(filters) => Some(
                TextAnalyzer::builder(WhitespaceTokenizer::default())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(filters.stemmer())
                    .build(),
            ),
            SearchTokenizer::RegexTokenizer { pattern, filters } => Some(
                TextAnalyzer::builder(RegexTokenizer::new(pattern.as_str()).unwrap())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(filters.stemmer())
                    .build(),
            ),
            SearchTokenizer::Ngram {
                min_gram,
                max_gram,
                prefix_only,
                filters,
            } => Some(
                TextAnalyzer::builder(
                    NgramTokenizer::new(*min_gram, *max_gram, *prefix_only)
                        .expect("Ngram parameters should be valid parameters for NgramTokenizer"),
                )
                .filter(filters.remove_long_filter())
                .filter(filters.lower_caser())
                .filter(filters.stemmer())
                .build(),
            ),
            SearchTokenizer::ChineseCompatible(filters) => Some(
                TextAnalyzer::builder(ChineseTokenizer)
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(filters.stemmer())
                    .build(),
            ),
            SearchTokenizer::SourceCode(filters) => Some(
                TextAnalyzer::builder(CodeTokenizer::default())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(AsciiFoldingFilter)
                    .filter(filters.stemmer())
                    .build(),
            ),
            SearchTokenizer::ChineseLindera(filters) => Some(
                TextAnalyzer::builder(LinderaChineseTokenizer::default())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(filters.stemmer())
                    .build(),
            ),
            SearchTokenizer::JapaneseLindera(filters) => Some(
                TextAnalyzer::builder(LinderaJapaneseTokenizer::default())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(filters.stemmer())
                    .build(),
            ),
            SearchTokenizer::KoreanLindera(filters) => Some(
                TextAnalyzer::builder(LinderaKoreanTokenizer::default())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(filters.stemmer())
                    .build(),
            ),
            // Deprecated, use `stemmer` filter instead
            SearchTokenizer::EnStem(filters) => Some(
                TextAnalyzer::builder(SimpleTokenizer::default())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(Stemmer::new(Language::English))
                    .build(),
            ),
            // Deprecated, use `stemmer` filter instead
            SearchTokenizer::Stem { language, filters } => Some(
                TextAnalyzer::builder(SimpleTokenizer::default())
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(Stemmer::new(*language))
                    .build(),
            ),
            #[cfg(feature = "icu")]
            SearchTokenizer::ICUTokenizer(filters) => Some(
                TextAnalyzer::builder(ICUTokenizer)
                    .filter(filters.remove_long_filter())
                    .filter(filters.lower_caser())
                    .filter(filters.stemmer())
                    .build(),
            ),
        }
    }

    fn filters(&self) -> &SearchTokenizerFilters {
        match self {
            SearchTokenizer::Default(filters) => filters,
            SearchTokenizer::Raw(filters) => filters,
            SearchTokenizer::EnStem(filters) => filters,
            SearchTokenizer::Stem { filters, .. } => filters,
            SearchTokenizer::Lowercase(filters) => filters,
            SearchTokenizer::WhiteSpace(filters) => filters,
            SearchTokenizer::RegexTokenizer { filters, .. } => filters,
            SearchTokenizer::ChineseCompatible(filters) => filters,
            SearchTokenizer::SourceCode(filters) => filters,
            SearchTokenizer::Ngram { filters, ..
            } => filters,
            SearchTokenizer::ChineseLindera(filters) => filters,
            SearchTokenizer::JapaneseLindera(filters) => filters,
            SearchTokenizer::KoreanLindera(filters) => filters,
            #[cfg(feature = "icu")]
            SearchTokenizer::ICUTokenizer(filters) => filters,
        }
    }
}

pub fn language_to_str(lang: &Language) -> &str {
    match lang {
        Language::Arabic => "Arabic",
        Language::Danish => "Danish",
        Language::Dutch => "Dutch",
        Language::English => "English",
        Language::Finnish => "Finnish",
        Language::French => "French",
        Language::German => "German",
        Language::Greek => "Greek",
        Language::Hungarian => "Hungarian",
        Language::Italian => "Italian",
        Language::Norwegian => "Norwegian",
        Language::Portuguese => "Portuguese",
        Language::Romanian => "Romanian",
        Language::Russian => "Russian",
        Language::Spanish => "Spanish",
        Language::Swedish => "Swedish",
        Language::Tamil => "Tamil",
        Language::Turkish => "Turkish",
    }
}

impl SearchTokenizer {
    pub fn name(&self) -> String {
        let filters_suffix = self.filters().name_suffix();
        match self {
            SearchTokenizer::Default(_filters) => format!("default{filters_suffix}"),
            SearchTokenizer::Raw(_filters) => format!("raw{filters_suffix}"),
            SearchTokenizer::EnStem(_filters) => format!("en_stem{filters_suffix}"),
            SearchTokenizer::Stem {
                language,
                filters: _,
            } => {
                let language_suffix = language_to_str(language);
                format!("stem_{language_suffix}{filters_suffix}")
            }
            SearchTokenizer::Lowercase(_filters) => format!("lowercase{filters_suffix}"),
            SearchTokenizer::WhiteSpace(_filters) => format!("whitespace{filters_suffix}"),
            SearchTokenizer::RegexTokenizer { .. } => format!("regex{filters_suffix}"),
            SearchTokenizer::ChineseCompatible(_filters) => {
                format!("chinese_compatible{filters_suffix}")
            }
            SearchTokenizer::SourceCode(_filters) => format!("source_code{filters_suffix}"),
            SearchTokenizer::Ngram {
                min_gram,
                max_gram,
                prefix_only,
                filters: _,
            } => format!(
                "ngram_mingram:{min_gram}_maxgram:{max_gram}_prefixonly:{prefix_only}{filters_suffix}"
            ),
            SearchTokenizer::ChineseLindera(_filters) => {
                format!("chinese_lindera{filters_suffix}")
            }
            SearchTokenizer::JapaneseLindera(_filters) => {
                format!("japanese_lindera{filters_suffix}")
            }
            SearchTokenizer::KoreanLindera(_filters) => format!("korean_lindera{filters_suffix}"),
            #[cfg(feature = "icu")]
            SearchTokenizer::ICUTokenizer(_filters) => format!("icu{filters_suffix}"),
        }
    }
}

// Normalizers for fast fields
#[derive(Default, Copy, Clone, Deserialize, Serialize, Debug, PartialEq, Eq)]
pub enum SearchNormalizer {
    #[serde(rename = "raw")]
    #[default]
    Raw,
    #[serde(rename = "lowercase")]
    Lowercase,
}

impl SearchNormalizer {
    pub fn name(&self) -> &str {
        match self {
            SearchNormalizer::Raw => "raw",
            SearchNormalizer::Lowercase => "lowercase",
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use rstest::*;

    #[rstest]
    fn test_search_tokenizer() {
        let tokenizer = SearchTokenizer::default();
        assert_eq!(tokenizer.name(), "default".to_string());

        let tokenizer = SearchTokenizer::EnStem(SearchTokenizerFilters {
            remove_long: Some(999),
            lowercase: Some(true),
            stemmer: None,
        });
        assert_eq!(
            tokenizer.name(),
            "en_stem[remove_long=999,lowercase=true]".to_string()
        );

        let json = r#"{
            "type": "ngram",
            "min_gram": 20,
            "max_gram": 60,
            "prefix_only": true,
            "remove_long": 123,
            "lowercase": false
        }"#;
        let tokenizer =
            SearchTokenizer::from_json_value(&serde_json::from_str(json).unwrap()).unwrap();

        assert_eq!(
            tokenizer,
            SearchTokenizer::Ngram {
                min_gram: 20,
                max_gram: 60,
                prefix_only: true,
                filters: SearchTokenizerFilters {
                    remove_long: Some(123),
                    lowercase: Some(false),
                    stemmer: None
                }
            }
        );
    }

    #[rstest]
    fn test_regexizer() {
        let json = r#"{
            "type": "regex",
            "pattern": "a+b*",
            "remove_long": 100
        }"#;
        let tokenizer = SearchTokenizer::RegexTokenizer {
            pattern: "a+b*".to_string(),
            filters: SearchTokenizerFilters {
                remove_long: Some(100),
                lowercase: None,
                stemmer: None,
            },
        };

        assert_eq!(
            tokenizer,
            SearchTokenizer::from_json_value(&serde_json::from_str(json).unwrap()).unwrap()
        );
    }

    #[rstest]
    fn test_search_normalizer() {
        assert_eq!(SearchNormalizer::Lowercase.name(), "lowercase");
        assert_ne!(SearchNormalizer::Raw, SearchNormalizer::Lowercase);
    }
}
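
// A minimal round-trip sketch built only on the APIs defined above, kept separate from the
// main test module. It assumes that `to_json_value` followed by `from_json_value` preserves a
// tokenizer whose filters do not include a `stemmer` (which `to_json_value` does not emit).
// The module and test names are illustrative, not part of the existing suite.
#[cfg(test)]
mod roundtrip_sketch {
    use super::*;

    #[test]
    fn json_roundtrip_preserves_variant_and_filters() {
        let tokenizer = SearchTokenizer::WhiteSpace(SearchTokenizerFilters {
            remove_long: Some(50),
            lowercase: Some(true),
            stemmer: None,
        });

        // Serialize to the public "tagged" JSON form, e.g.
        // {"type": "whitespace", "remove_long": 50, "lowercase": true} ...
        let json = tokenizer.to_json_value();

        // ...and parse it back: the variant, remove_long, and lowercase settings survive,
        // and the derived tokenizer name reflects the same filters.
        let parsed = SearchTokenizer::from_json_value(&json).unwrap();
        assert_eq!(parsed, tokenizer);
        assert_eq!(parsed.name(), "whitespace[remove_long=50,lowercase=true]");
    }
}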