// Copyright (c) 2023-2026 ParadeDB, Inc.
//
// This file is part of ParadeDB - Postgres for Search and Analytics
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

/// `TokenLengthFilter` removes tokens whose UTF-8 byte length is shorter than a
/// given minimum or longer than a given maximum.
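///
/// A minimal usage sketch, mirroring the tests below; the choice of
/// `SimpleTokenizer` as the base tokenizer is only an illustrative assumption,
/// and the block is marked `ignore` because the crate path to this type is
/// omitted here:
///
/// ```rust,ignore
/// use tantivy::tokenizer::{SimpleTokenizer, TextAnalyzer};
///
/// // Keep only tokens whose UTF-8 length is between 3 and 20 bytes, inclusive.
/// let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
///     .filter(TokenLengthFilter::new(Some(3), Some(20)))
///     .build();
/// let mut stream = analyzer.token_stream("a sentence with a token");
/// while stream.advance() {
///     println!("{}", stream.token().text);
/// }
/// ```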
#[derive(Clone)]
pub struct TokenLengthFilter {
    min: Option<usize>,
    max: Option<usize>,
}

impl TokenLengthFilter {
    /// Creates a `TokenLengthFilter` given optional minimum and maximum token
    /// lengths, in bytes of the UTF-8 representation.
    pub fn new(min: Option<usize>, max: Option<usize>) -> TokenLengthFilter {
        TokenLengthFilter { min, max }
    }
}

impl<T> TokenLengthFilterStream<T> {
    /// Returns `true` if the token's byte length is within the configured bounds.
    fn predicate(&self, token: &Token) -> bool {
        if let Some(min) = self.min {
            if token.text.len() < min {
                return false;
            }
        }
        if let Some(max) = self.max {
            if token.text.len() > max {
                return false;
            }
        }
        true
    }
}

impl TokenFilter for TokenLengthFilter {
    type Tokenizer<T: Tokenizer> = TokenLengthFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> TokenLengthFilterWrapper<T> {
        TokenLengthFilterWrapper {
            min: self.min,
            max: self.max,
            inner: tokenizer,
        }
    }
}

#[derive(Clone)]
pub struct TokenLengthFilterWrapper<T: Tokenizer> {
    min: Option<usize>,
    max: Option<usize>,
    inner: T,
}

impl<T: Tokenizer> Tokenizer for TokenLengthFilterWrapper<T> {
    type TokenStream<'a> = TokenLengthFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        TokenLengthFilterStream {
            min: self.min,
            max: self.max,
            tail: self.inner.token_stream(text),
        }
    }
}

pub struct TokenLengthFilterStream<T> {
    min: Option<usize>,
    max: Option<usize>,
    tail: T,
}

impl<T: TokenStream> TokenStream for TokenLengthFilterStream<T> {
    fn advance(&mut self) -> bool {
        // Skip over tokens that fail the length predicate; stop when the
        // underlying stream is exhausted.
        while self.tail.advance() {
            if self.predicate(self.tail.token()) {
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}

#[cfg(test)]
mod tests {
    use super::TokenLengthFilter;
    use tantivy::tokenizer::{SimpleTokenizer, TextAnalyzer, Token};

    #[test]
    fn test_token_length() {
        let tokens = token_stream_helper(
            "a sentence with a veryveryveryveryveryveryveryveryveryveryveryveryverylong token",
            Some(3),
            Some(20),
        );
        let expected_tokens = vec![
            Token {
                offset_from: 2,
                offset_to: 10,
                position: 1,
                text: "sentence".to_owned(),
                position_length: 1,
            },
            Token {
                offset_from: 11,
                offset_to: 15,
                position: 2,
                text: "with".to_owned(),
                position_length: 1,
            },
            Token {
                offset_from: 75,
                offset_to: 80,
                position: 5,
                text: "token".to_owned(),
                position_length: 1,
            },
        ];
        assert_eq!(tokens, expected_tokens);

        let tokens = token_stream_helper(
            "a sentence with a veryveryveryveryveryveryveryveryveryveryveryveryverylong token",
            Some(5),
            None,
        );
        let expected_tokens = vec![
            Token {
                offset_from: 2,
                offset_to: 10,
                position: 1,
                text: "sentence".to_owned(),
                position_length: 1,
            },
            Token {
                offset_from: 18,
                offset_to: 74,
                position: 4,
                text: "veryveryveryveryveryveryveryveryveryveryveryveryverylong".to_owned(),
                position_length: 1,
            },
            Token {
                offset_from: 75,
                offset_to: 80,
                position: 5,
                text: "token".to_owned(),
                position_length: 1,
            },
        ];
        assert_eq!(tokens, expected_tokens);

        let tokens = token_stream_helper(
            "a sentence with a veryveryveryveryveryveryveryveryveryveryveryveryverylong token",
            None,
            Some(20),
        );
        let expected_tokens = vec![
            Token {
                offset_from: 0,
                offset_to: 1,
                position: 0,
                text: "a".to_owned(),
                position_length: 1,
            },
            Token {
                offset_from: 2,
                offset_to: 10,
                position: 1,
                text: "sentence".to_owned(),
                position_length: 1,
            },
            Token {
                offset_from: 11,
                offset_to: 15,
                position: 2,
                text: "with".to_owned(),
                position_length: 1,
            },
            Token {
                offset_from: 16,
                offset_to: 17,
                position: 3,
                text: "a".to_owned(),
                position_length: 1,
            },
            Token {
                offset_from: 75,
                offset_to: 80,
                position: 5,
                text: "token".to_owned(),
                position_length: 1,
            },
        ];
        assert_eq!(tokens, expected_tokens);
    }

    fn token_stream_helper(text: &str, min: Option<usize>, max: Option<usize>) -> Vec<Token> {
        let mut a = TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(TokenLengthFilter::new(min, max))
            .build();
        let mut token_stream = a.token_stream(text);
        let mut tokens: Vec<Token> = vec![];
        let mut add_token = |token: &Token| {
            tokens.push(token.clone());
        };
        token_stream.process(&mut add_token);
        tokens
    }
}