use super::{Token, TokenStream, Tokenizer};
use crate::schema::FACET_SEP_BYTE;

/// The `FacetTokenizer` processes a `Facet`'s binary representation
/// and emits one token for the facet itself and one for each of its
/// ancestors, including the root facet.
///
/// For instance, `/america/north_america/canada`
/// will emit the four following tokens, in emission order:
/// - `/`
/// - `/america`
/// - `/america/north_america`
/// - `/america/north_america/canada`
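///
/// # Example
///
/// A minimal usage sketch mirroring the tests at the bottom of this file;
/// it assumes `FacetTokenizer` is re-exported from `tantivy::tokenizer`.
/// Each emitted token carries the *encoded* prefix, which is decoded back
/// into a `Facet` for display.
///
/// ```ignore
/// use tantivy::schema::Facet;
/// use tantivy::tokenizer::{FacetTokenizer, Token, TokenStream, Tokenizer};
///
/// let facet = Facet::from_path(vec!["america", "north_america", "canada"]);
/// let mut prefixes: Vec<String> = Vec::new();
/// FacetTokenizer::default()
///     .token_stream(facet.encoded_str())
///     .process(&mut |token: &Token| {
///         // Decode the accumulated encoded prefix back into a `Facet`.
///         let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
///         prefixes.push(facet.to_string());
///     });
/// assert_eq!(
///     prefixes,
///     vec!["/", "/america", "/america/north_america", "/america/north_america/canada"]
/// );
/// ```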
#[derive(Clone, Default)]
pub struct FacetTokenizer {
    token: Token,
}

#[derive(Debug)]
enum State {
    RootFacetNotEmitted,
    UpToPosition(usize), //< we already emitted the facet prefixes up to &text[..cursor]
    Terminated,
}
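// Walkthrough for the encoded facet "top\0a\0b" (i.e. `/top/a/b`), where `\0`
// is FACET_SEP_BYTE and `token.text` accumulates across calls to `advance()`:
//
//   RootFacetNotEmitted -> emits ""           (decodes to `/`)
//   UpToPosition(0)     -> emits "top"        (decodes to `/top`)
//   UpToPosition(3)     -> emits "top\0a"     (decodes to `/top/a`)
//   UpToPosition(5)     -> emits "top\0a\0b"  (decodes to `/top/a/b`)
//   Terminated          -> `advance()` returns false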

pub struct FacetTokenStream<'a> {
    text: &'a str,
    state: State,
    token: &'a mut Token,
}

impl Tokenizer for FacetTokenizer {
    type TokenStream<'a> = FacetTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> FacetTokenStream<'a> {
        self.token.reset();
        self.token.position = 0;
        FacetTokenStream {
            text,
            state: State::RootFacetNotEmitted, //< no byte of `text` has been processed yet.
            token: &mut self.token,
        }
    }
}

impl<'a> TokenStream for FacetTokenStream<'a> {
    fn advance(&mut self) -> bool {
        match self.state {
            State::RootFacetNotEmitted => {
                // Emit the root facet: `token.text` is still empty, which is
                // the encoded representation of `/`.
                self.state = if self.text.is_empty() {
                    State::Terminated
                } else {
                    State::UpToPosition(0)
                };
                true
            }
            State::UpToPosition(cursor) => {
                // Extend the accumulated prefix up to (but excluding) the next
                // separator byte, or to the end of the text if none remains.
                let bytes: &[u8] = self.text.as_bytes();
                if let Some(next_sep_pos) = bytes[cursor + 1..]
                    .iter()
                    .cloned()
                    .position(|b| b == FACET_SEP_BYTE)
                    .map(|pos| cursor + 1 + pos)
                {
                    let facet_part = &self.text[cursor..next_sep_pos];
                    self.token.text.push_str(facet_part);
                    self.state = State::UpToPosition(next_sep_pos);
                } else {
                    let facet_part = &self.text[cursor..];
                    self.token.text.push_str(facet_part);
                    self.state = State::Terminated;
                }
                true
            }
            State::Terminated => false,
        }
    }

    fn token(&self) -> &Token {
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        self.token
    }
}

#[cfg(test)]
mod tests {

    use super::FacetTokenizer;
    use crate::schema::Facet;
    use crate::tokenizer::{Token, TokenStream, Tokenizer};

    #[test]
    fn test_facet_tokenizer() {
        let facet = Facet::from_path(vec!["top", "a", "b"]);
        let mut tokens = vec![];
        {
            let mut add_token = |token: &Token| {
                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
                tokens.push(format!("{facet}"));
            };
            FacetTokenizer::default()
                .token_stream(facet.encoded_str())
                .process(&mut add_token);
        }
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0], "/");
        assert_eq!(tokens[1], "/top");
        assert_eq!(tokens[2], "/top/a");
        assert_eq!(tokens[3], "/top/a/b");
    }

    #[test]
    fn test_facet_tokenizer_root_facets() {
        let facet = Facet::root();
        let mut tokens = vec![];
        {
            let mut add_token = |token: &Token| {
                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
                tokens.push(format!("{facet}"));
            };
            FacetTokenizer::default()
                .token_stream(facet.encoded_str()) // ok test
                .process(&mut add_token);
        }
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0], "/");
    }
}