mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-30 07:00:41 +00:00
Removes TokenStream chain. (#1283)
This change is mostly motivated by the introduction of JSON objects. We need to be able to inject a position object in order to shift token positions.
This commit is contained in:
@@ -2,7 +2,7 @@ use std::cmp::Ordering;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain};
|
||||
use crate::tokenizer::{Token, TokenStream};
|
||||
|
||||
/// Struct representing pre-tokenized text
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
|
||||
@@ -40,32 +40,6 @@ impl From<PreTokenizedString> for PreTokenizedStream {
|
||||
}
|
||||
}
|
||||
|
||||
impl PreTokenizedStream {
|
||||
/// Creates a TokenStream from PreTokenizedString array
|
||||
pub fn chain_tokenized_strings<'a>(
|
||||
tok_strings: &'a [&'a PreTokenizedString],
|
||||
) -> BoxTokenStream {
|
||||
if tok_strings.len() == 1 {
|
||||
PreTokenizedStream::from((*tok_strings[0]).clone()).into()
|
||||
} else {
|
||||
let mut offsets = vec![];
|
||||
let mut total_offset = 0;
|
||||
for &tok_string in tok_strings {
|
||||
offsets.push(total_offset);
|
||||
if let Some(last_token) = tok_string.tokens.last() {
|
||||
total_offset += last_token.offset_to;
|
||||
}
|
||||
}
|
||||
// TODO remove the string cloning.
|
||||
let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
|
||||
.iter()
|
||||
.map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
|
||||
.collect();
|
||||
TokenStreamChain::new(offsets, token_streams).into()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TokenStream for PreTokenizedStream {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.current_token += 1;
|
||||
@@ -125,68 +99,4 @@ mod tests {
|
||||
}
|
||||
assert!(!token_stream.advance());
|
||||
}
|
||||
|
||||
#[test]
fn test_chain_tokenized_strings() {
    // Every token in this test has a `position_length` of 1, so only the
    // remaining four fields vary between tokens.
    let make_token = |offset_from, offset_to, position, text: &str| Token {
        offset_from,
        offset_to,
        position,
        text: String::from(text),
        position_length: 1,
    };

    let tok_text = PreTokenizedString {
        text: String::from("A a"),
        tokens: vec![make_token(0, 1, 0, "A"), make_token(2, 3, 1, "a")],
    };

    // The same pre-tokenized string is chained with itself.
    let chain_parts = vec![&tok_text, &tok_text];

    let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);

    // Tokens expected from the chained stream, in order: the first two match
    // the input as-is, the last two are the same tokens with shifted
    // offsets and positions.
    let expected_tokens = vec![
        make_token(0, 1, 0, "A"),
        make_token(2, 3, 1, "a"),
        make_token(3, 4, 3, "A"),
        make_token(5, 6, 4, "a"),
    ];

    for expected_token in &expected_tokens {
        assert!(token_stream.advance());
        assert_eq!(token_stream.token(), expected_token);
    }
    assert!(!token_stream.advance());
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user