/*
*
* IMPORTANT NOTICE:
* This file has been copied from Quickwit, an open source project, and is subject to the terms
* and conditions of the GNU Affero General Public License (AGPL) version 3.0.
 * Please review the full licensing details at https://www.gnu.org/licenses/agpl-3.0.html.
* By using this file, you agree to comply with the AGPL v3.0 terms.
*
*/
use std::ops::Range;
use std::str::CharIndices;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
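/// Tokenizer for source code identifiers: it splits on non-alphanumeric
/// delimiters and on case transitions, so `PigCaféFactory2` yields `Pig`,
/// `Café`, `Factory`, and `2`. It can be registered with a tantivy index
/// through `TokenizerManager::register`.
///
/// A minimal usage sketch (mirroring the tests below; marked `ignore` because
/// the doc-test cannot assume this crate's public path):
/// ```rust,ignore
/// use tantivy::tokenizer::{TokenStream, Tokenizer};
///
/// let mut tokenizer = CodeTokenizer::default();
/// let mut stream = tokenizer.token_stream("TPigCafeFactory");
/// let mut texts = Vec::new();
/// while let Some(token) = stream.next() {
///     texts.push(token.text.clone());
/// }
/// assert_eq!(texts, vec!["T", "Pig", "Cafe", "Factory"]);
/// ```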
#[derive(Clone, Default)]
pub struct CodeTokenizer(Token);
impl Tokenizer for CodeTokenizer {
type TokenStream<'a> = CodeTokenStream<'a>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.0.reset();
CodeTokenStream {
chars: text.char_indices(),
state: CodeTokenStreamState::Empty,
text,
token: &mut self.0,
}
}
}
pub struct CodeTokenStream<'a> {
text: &'a str,
chars: CharIndices<'a>,
token: &'a mut Token,
state: CodeTokenStreamState,
}
impl TokenStream for CodeTokenStream<'_> {
fn advance(&mut self) -> bool {
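        // `Token::reset()` leaves `position` at `usize::MAX` in tantivy, so
        // the first `wrapping_add(1)` wraps it around to 0 for the first token.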
self.token.text.clear();
self.token.position = self.token.position.wrapping_add(1);
while let Some((next_char_offset, next_char)) = self.chars.next() {
match self.state.advance(next_char_offset, next_char) {
None => {}
Some(token_offsets) => {
self.update_token(token_offsets);
return true;
}
}
}
// No more chars.
match self.state.finalize() {
None => {}
Some(token_offsets) => {
self.update_token(token_offsets);
return true;
}
}
false
}
fn token(&self) -> &Token {
self.token
}
fn token_mut(&mut self) -> &mut Token {
self.token
}
}
impl CodeTokenStream<'_> {
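    /// Copies the token's text and byte offsets out of the source string.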
    fn update_token(&mut self, token_offsets: TokenOffsets) {
self.token.offset_from = token_offsets.start;
self.token.offset_to = token_offsets.end;
self.token
.text
.push_str(&self.text[token_offsets.start..token_offsets.end]);
}
}
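/// State machine for the tokenizer: either between tokens (`Empty`) or in the
/// middle of building one (`ProcessingChars`).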
enum CodeTokenStreamState {
Empty,
ProcessingChars(ProcessingCharsState),
}
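/// Bookkeeping for the token currently being built: its starting byte offset
/// and the last character consumed (offset, value, and character type).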
struct ProcessingCharsState {
is_first_char: bool,
start_offset: usize,
current_char: char,
current_char_offset: usize,
current_char_type: CharType,
}
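/// Byte-offset range of a token within the source text.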
type TokenOffsets = Range<usize>;
impl CodeTokenStreamState {
fn reset(&mut self) {
*self = CodeTokenStreamState::Empty;
}
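    /// Feeds one character into the state machine. Returns the byte-offset
    /// range of a completed token, or `None` while a token is still growing.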
    fn advance(&mut self, next_char_offset: usize, next_char: char) -> Option<TokenOffsets> {
let next_char_type = get_char_type(next_char);
match self {
Self::Empty => match next_char_type {
CharType::Delimiter => {
self.reset();
None
}
_ => {
*self = CodeTokenStreamState::ProcessingChars(ProcessingCharsState {
is_first_char: true,
start_offset: next_char_offset,
current_char_offset: next_char_offset,
current_char: next_char,
current_char_type: next_char_type,
});
None
}
},
Self::ProcessingChars(state) => {
match (state.current_char_type, next_char_type) {
(_, CharType::Delimiter) => {
let offsets = TokenOffsets {
start: state.start_offset,
end: state.current_char_offset + state.current_char.len_utf8(),
};
self.reset();
Some(offsets)
}
                    // We do not emit a token on an upper-to-lower transition when the
                    // uppercase char is the first one: `Ac` stays a single token.
                    // But for `ABCa` we emit `AB` and keep `Ca` for the next token.
(CharType::UpperCase, CharType::LowerCase) => {
if state.is_first_char {
state.is_first_char = false;
state.current_char_offset = next_char_offset;
state.current_char = next_char;
state.current_char_type = next_char_type;
None
} else {
let offsets = TokenOffsets {
start: state.start_offset,
end: state.current_char_offset,
};
state.is_first_char = false;
state.start_offset = state.current_char_offset;
state.current_char_offset = next_char_offset;
state.current_char = next_char;
state.current_char_type = next_char_type;
Some(offsets)
}
}
// Don't emit tokens on identical char types.
(CharType::UpperCase, CharType::UpperCase)
| (CharType::LowerCase, CharType::LowerCase)
| (CharType::Numeric, CharType::Numeric) => {
state.is_first_char = false;
state.current_char_offset = next_char_offset;
state.current_char = next_char;
None
}
_ => {
let offsets = TokenOffsets {
start: state.start_offset,
end: state.current_char_offset + state.current_char.len_utf8(),
};
state.is_first_char = true;
state.start_offset = next_char_offset;
state.current_char_offset = next_char_offset;
state.current_char = next_char;
state.current_char_type = next_char_type;
Some(offsets)
}
}
}
}
}
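    /// Flushes the token still in progress at end of input, if any.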
    fn finalize(&mut self) -> Option<TokenOffsets> {
match self {
Self::Empty => None,
Self::ProcessingChars(char_state) => {
let offsets = TokenOffsets {
start: char_state.start_offset,
end: char_state.current_char_offset + char_state.current_char.len_utf8(),
};
*self = Self::Empty;
Some(offsets)
}
}
}
}
/// Returns the type of the character:
/// - `UpperCase` for uppercase letters (`\p{Lu}`).
/// - `LowerCase` for lowercase letters (`\p{Ll}`).
/// - `Numeric` for numeric characters (`char::is_numeric`, broader than `\d`).
/// - `Delimiter` for all remaining characters.
fn get_char_type(c: char) -> CharType {
    if c.is_alphabetic() {
        if c.is_uppercase() {
            CharType::UpperCase
        } else {
            CharType::LowerCase
        }
    } else if c.is_numeric() {
        CharType::Numeric
    } else {
        CharType::Delimiter
    }
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum CharType {
    // Uppercase letters (regex `\p{Lu}`).
    UpperCase,
    // Lowercase letters (regex `\p{Ll}`).
    LowerCase,
    // Numeric characters (`char::is_numeric`, broader than regex `\d`).
    Numeric,
    // All other characters.
Delimiter,
}
#[cfg(test)]
mod tests {
use super::*;
use rstest::*;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
#[rstest]
fn test_code_tokenizer() {
let mut tokenizer = CodeTokenizer::default();
{
let mut token_stream = tokenizer.token_stream("PigCaféFactory2");
let mut res = Vec::new();
while let Some(tok) = token_stream.next() {
res.push(tok.clone());
}
let expected_tokens = vec![
Token {
offset_from: 0,
offset_to: 3,
position: 0,
text: "Pig".to_owned(),
position_length: 1,
},
Token {
offset_from: 3,
offset_to: 8,
position: 1,
text: "Café".to_owned(),
position_length: 1,
},
Token {
offset_from: 8,
offset_to: 15,
position: 2,
text: "Factory".to_owned(),
position_length: 1,
},
Token {
offset_from: 15,
offset_to: 16,
position: 3,
text: "2".to_owned(),
position_length: 1,
},
];
assert_eq!(res, expected_tokens);
}
{
let mut token_stream = tokenizer.token_stream("PIG_CAFE_FACTORY");
let mut res = Vec::new();
while let Some(tok) = token_stream.next() {
res.push(tok.clone());
}
let expected_tokens = vec![
Token {
offset_from: 0,
offset_to: 3,
position: 0,
text: "PIG".to_owned(),
position_length: 1,
},
Token {
offset_from: 4,
offset_to: 8,
position: 1,
text: "CAFE".to_owned(),
position_length: 1,
},
Token {
offset_from: 9,
offset_to: 16,
position: 2,
text: "FACTORY".to_owned(),
position_length: 1,
},
];
assert_eq!(res, expected_tokens);
}
{
let mut token_stream = tokenizer.token_stream("TPigCafeFactory");
let mut res = Vec::new();
while let Some(tok) = token_stream.next() {
res.push(tok.clone());
}
let expected_tokens = vec![
Token {
offset_from: 0,
offset_to: 1,
position: 0,
text: "T".to_owned(),
position_length: 1,
},
Token {
offset_from: 1,
offset_to: 4,
position: 1,
text: "Pig".to_owned(),
position_length: 1,
},
Token {
offset_from: 4,
offset_to: 8,
position: 2,
text: "Cafe".to_owned(),
position_length: 1,
},
Token {
offset_from: 8,
offset_to: 15,
position: 3,
text: "Factory".to_owned(),
position_length: 1,
},
];
assert_eq!(res, expected_tokens);
}
{
let mut token_stream = tokenizer.token_stream("PIG# Cafe@FACTORY");
let mut res = Vec::new();
while let Some(tok) = token_stream.next() {
res.push(tok.clone());
}
let expected_tokens = vec![
Token {
offset_from: 0,
offset_to: 3,
position: 0,
text: "PIG".to_owned(),
position_length: 1,
},
Token {
offset_from: 5,
offset_to: 9,
position: 1,
text: "Cafe".to_owned(),
position_length: 1,
},
Token {
offset_from: 10,
offset_to: 17,
position: 2,
text: "FACTORY".to_owned(),
position_length: 1,
},
];
assert_eq!(res, expected_tokens);
}
}
}