Removes TokenStream chain. (#1283)

This change is mostly motivated by the introduction of JSON objects.

We need to be able to inject a position object in order to apply the
position shift.
This commit is contained in:
Paul Masurel
2022-02-21 09:51:27 +09:00
committed by GitHub
parent cef145790c
commit 4dc80cfa25
7 changed files with 37 additions and 238 deletions

View File

@@ -2,7 +2,7 @@ use std::cmp::Ordering;
use serde::{Deserialize, Serialize};
use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain};
use crate::tokenizer::{Token, TokenStream};
/// Struct representing pre-tokenized text
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
@@ -40,32 +40,6 @@ impl From<PreTokenizedString> for PreTokenizedStream {
}
}
impl PreTokenizedStream {
/// Creates a TokenStream from PreTokenizedString array
pub fn chain_tokenized_strings<'a>(
tok_strings: &'a [&'a PreTokenizedString],
) -> BoxTokenStream {
if tok_strings.len() == 1 {
PreTokenizedStream::from((*tok_strings[0]).clone()).into()
} else {
let mut offsets = vec![];
let mut total_offset = 0;
for &tok_string in tok_strings {
offsets.push(total_offset);
if let Some(last_token) = tok_string.tokens.last() {
total_offset += last_token.offset_to;
}
}
// TODO remove the string cloning.
let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
.iter()
.map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
.collect();
TokenStreamChain::new(offsets, token_streams).into()
}
}
}
impl TokenStream for PreTokenizedStream {
fn advance(&mut self) -> bool {
self.current_token += 1;
@@ -125,68 +99,4 @@ mod tests {
}
assert!(!token_stream.advance());
}
/// Chains the same two-token pre-tokenized string with itself and checks that
/// the tokens of the second copy come out with shifted offsets (0..1 -> 3..4,
/// 2..3 -> 5..6) and shifted positions (0 -> 3, 1 -> 4).
#[test]
fn test_chain_tokenized_strings() {
    // Every token in this test has a position length of 1.
    let make_token = |offset_from, offset_to, position, text: &str| Token {
        offset_from,
        offset_to,
        position,
        text: String::from(text),
        position_length: 1,
    };

    let tok_text = PreTokenizedString {
        text: String::from("A a"),
        tokens: vec![make_token(0, 1, 0, "A"), make_token(2, 3, 1, "a")],
    };

    let chain_parts = vec![&tok_text, &tok_text];
    let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);

    let expected_tokens = vec![
        // First copy: emitted unchanged.
        make_token(0, 1, 0, "A"),
        make_token(2, 3, 1, "a"),
        // Second copy: offsets and positions are shifted by the chain.
        make_token(3, 4, 3, "A"),
        make_token(5, 6, 4, "a"),
    ];

    for expected_token in expected_tokens.iter() {
        assert!(token_stream.advance());
        assert_eq!(token_stream.token(), expected_token);
    }
    // The chain must be exhausted once every expected token has been seen.
    assert!(!token_stream.advance());
}
}