tantivy/src/tokenizer/facet_tokenizer.rs

use super::{Token, TokenStream, Tokenizer};
use crate::schema::FACET_SEP_BYTE;

/// The `FacetTokenizer` processes a `Facet`'s binary representation
/// and emits one token for the facet itself and one for each of its
/// ancestors, including the root facet.
///
/// For instance, `/america/north_america/canada`
/// will emit the following four tokens, in this order:
/// - `/`
/// - `/america`
/// - `/america/north_america`
/// - `/america/north_america/canada`
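///
/// A minimal usage sketch (illustrative, not compiled as a doctest; it
/// assumes `FacetTokenizer`, `TokenStream` and `Tokenizer` are reachable
/// via `tantivy::tokenizer`, and `Facet` via `tantivy::schema`):
///
/// ```ignore
/// use tantivy::schema::Facet;
/// use tantivy::tokenizer::{FacetTokenizer, TokenStream, Tokenizer};
///
/// let facet = Facet::from("/america/north_america/canada");
/// let mut tokenizer = FacetTokenizer::default();
/// let mut stream = tokenizer.token_stream(facet.encoded_str());
/// // The root facet `/` comes first, then each longer prefix.
/// while stream.advance() {
///     let parent = Facet::from_encoded(stream.token().text.as_bytes().to_owned()).unwrap();
///     println!("{parent}");
/// }
/// ```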
#[derive(Clone, Default)]
pub struct FacetTokenizer {
    token: Token,
}
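
/// Internal state of the `FacetTokenStream` state machine.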
#[derive(Debug)]
enum State {
    /// The root facet `/` has not been emitted yet.
    RootFacetNotEmitted,
    /// The facet prefix up to `&text[..cursor]` has already been emitted.
    UpToPosition(usize),
    /// All tokens have been emitted; `advance` returns `false`.
    Terminated,
}

pub struct FacetTokenStream<'a> {
    text: &'a str,
    state: State,
    token: &'a mut Token,
}

impl Tokenizer for FacetTokenizer {
    type TokenStream<'a> = FacetTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> FacetTokenStream<'a> {
        self.token.reset();
        self.token.position = 0;
        FacetTokenStream {
            text,
            // The root facet has not been emitted yet.
            state: State::RootFacetNotEmitted,
            token: &mut self.token,
        }
    }
}

impl<'a> TokenStream for FacetTokenStream<'a> {
    fn advance(&mut self) -> bool {
        match self.state {
            State::RootFacetNotEmitted => {
                // Emit the root facet first: the token text is still empty,
                // and an empty encoded facet decodes to `/`.
                self.state = if self.text.is_empty() {
                    State::Terminated
                } else {
                    State::UpToPosition(0)
                };
                true
            }
            State::UpToPosition(cursor) => {
                // The token text is never cleared, so each call appends one
                // more segment: everything from the cursor up to (and
                // excluding) the next `FACET_SEP_BYTE`, or the end of text.
                let bytes: &[u8] = self.text.as_bytes();
                if let Some(next_sep_pos) = bytes[cursor + 1..]
                    .iter()
                    .cloned()
                    .position(|b| b == FACET_SEP_BYTE)
                    .map(|pos| cursor + 1 + pos)
                {
                    let facet_part = &self.text[cursor..next_sep_pos];
                    self.token.text.push_str(facet_part);
                    self.state = State::UpToPosition(next_sep_pos);
                } else {
                    let facet_part = &self.text[cursor..];
                    self.token.text.push_str(facet_part);
                    self.state = State::Terminated;
                }
                true
            }
            State::Terminated => false,
        }
    }

    fn token(&self) -> &Token {
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        self.token
    }
}

#[cfg(test)]
mod tests {
    use super::FacetTokenizer;
    use crate::schema::Facet;
    use crate::tokenizer::{Token, TokenStream, Tokenizer};

    #[test]
    fn test_facet_tokenizer() {
        let facet = Facet::from_path(vec!["top", "a", "b"]);
        let mut tokens = vec![];
        {
            let mut add_token = |token: &Token| {
                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
                tokens.push(format!("{facet}"));
            };
            FacetTokenizer::default()
                .token_stream(facet.encoded_str())
                .process(&mut add_token);
        }
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0], "/");
        assert_eq!(tokens[1], "/top");
        assert_eq!(tokens[2], "/top/a");
        assert_eq!(tokens[3], "/top/a/b");
    }

    #[test]
    fn test_facet_tokenizer_root_facets() {
        let facet = Facet::root();
        let mut tokens = vec![];
        {
            let mut add_token = |token: &Token| {
                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
                tokens.push(format!("{facet}"));
            };
            FacetTokenizer::default()
                .token_stream(facet.encoded_str()) // ok test
                .process(&mut add_token);
        }
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0], "/");
    }
}