// Copyright (c) 2023-2026 ParadeDB, Inc.
//
// This file is part of ParadeDB - Postgres for Search and Analytics
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

//! Wrapper around tantivy_jieba::JiebaTokenizer that normalizes token positions.
//!
//! The upstream tantivy_jieba crate incorrectly sets token.position to character
//! offsets instead of sequential token ordinals. This causes phrase queries to
//! fail because PhraseQuery expects tokens at consecutive positions (0, 1, 2, ...).
//!
//! This wrapper intercepts the token stream and fixes positions to be sequential.
//!
//! TODO: See https://github.com/jiegec/tantivy-jieba/issues/24 about fixing this upstream.

use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

/// A wrapper around JiebaTokenizer that normalizes token positions to be sequential.
#[derive(Clone)]
pub struct JiebaTokenizer {
    inner: tantivy_jieba::JiebaTokenizer,
}

impl JiebaTokenizer {
    pub fn new() -> Self {
        Self {
            inner: tantivy_jieba::JiebaTokenizer {},
        }
    }
}

impl Default for JiebaTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer for JiebaTokenizer {
    type TokenStream<'a> = JiebaTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        let inner = self.inner.token_stream(text);
        JiebaTokenStream {
            inner,
            position: usize::MAX, // Will wrap to 0 on first advance
        }
    }
}

/// Token stream wrapper that fixes positions to be sequential.
pub struct JiebaTokenStream<'a> {
    inner: tantivy_jieba::JiebaTokenStream<'a>,
    position: usize,
}

impl TokenStream for JiebaTokenStream<'_> {
    fn advance(&mut self) -> bool {
        if self.inner.advance() {
            // Increment position sequentially instead of using character offsets
            self.position = self.position.wrapping_add(1);
            self.inner.token_mut().position = self.position;
            true
        } else {
            false
        }
    }

    fn token(&self) -> &Token {
        self.inner.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.inner.token_mut()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Collects (text, position) pairs produced by the wrapped tokenizer.
    fn tokenize(text: &str) -> Vec<(String, usize)> {
        let mut tokenizer = JiebaTokenizer::new();
        let mut stream = tokenizer.token_stream(text);
        let mut tokens = Vec::new();
        while stream.advance() {
            let token = stream.token();
            tokens.push((token.text.clone(), token.position));
        }
        tokens
    }

    #[test]
    fn test_sequential_positions() {
        // "我们都有光明的前途" should tokenize with sequential positions
        let tokens = tokenize("我们都有光明的前途");

        // Verify positions are sequential (0, 1, 2, 3, ...)
        for (i, (_, position)) in tokens.iter().enumerate() {
            assert_eq!(
                *position, i,
                "Expected position {} but got {} for token at index {}",
                i, position, i
            );
        }
    }

    #[test]
    fn test_phrase_compatible_positions() {
        // "转移就业" tokenizes to ["转移", "就业"]
        // With the fix, positions should be [0, 1] not [0, 2]
        let tokens = tokenize("转移就业");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].1, 0, "First token should be at position 0");
        assert_eq!(tokens[1].1, 1, "Second token should be at position 1");
    }
}
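
// The sketch below is illustrative, not ParadeDB's actual index wiring: it
// shows how this wrapper might be registered with a plain tantivy index so
// that a PhraseQuery over adjacent jieba tokens matches, which is the failure
// mode the position fix addresses. The registration name "jieba", the field
// name "body", and the 50 MB writer budget are assumptions chosen to mirror
// the unit tests above and tantivy's own examples.
#[cfg(test)]
mod phrase_query_sketch {
    use super::*;
    use tantivy::collector::Count;
    use tantivy::query::PhraseQuery;
    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
    use tantivy::{doc, Index, Term};

    #[test]
    fn test_phrase_query_matches_adjacent_tokens() {
        // Record positions on the field; phrase queries require them.
        let mut schema_builder = Schema::builder();
        let indexing = TextFieldIndexing::default()
            .set_tokenizer("jieba")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let options = TextOptions::default().set_indexing_options(indexing);
        let body = schema_builder.add_text_field("body", options);
        let index = Index::create_in_ram(schema_builder.build());

        // Register the wrapper (not the raw tantivy_jieba tokenizer) so the
        // index sees sequential positions.
        index.tokenizers().register("jieba", JiebaTokenizer::new());

        let mut writer = index.writer(50_000_000).unwrap();
        writer.add_document(doc!(body => "转移就业")).unwrap();
        writer.commit().unwrap();

        // "转移就业" indexes as ["转移", "就业"] at positions [0, 1], so a
        // phrase query for the two terms in order should match the document.
        // With the upstream character-offset positions it would not.
        let query = PhraseQuery::new(vec![
            Term::from_field_text(body, "转移"),
            Term::from_field_text(body, "就业"),
        ]);
        let searcher = index.reader().unwrap().searcher();
        let count = searcher.search(&query, &Count).unwrap();
        assert_eq!(count, 1);
    }
}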