Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-27 20:42:54 +00:00)

Compare commits: 5 commits (raphael_op...kkoziara-p)
| Author | SHA1 | Date |
|---|---|---|
| | 69e8495724 | |
| | 7ddc6041a6 | |
| | d87b7f230d | |
| | 20d2235d4d | |
| | faaecad476 | |
CHANGELOG.md

@@ -10,6 +10,8 @@ Tantivy 0.11.0
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
- Added handling of pre-tokenized text fields (#642), which will enable users to
  load tokens created outside tantivy. See usage in examples/pre_tokenized_text. (@kkoziara)
- Fix crash when committing multiple times with deleted documents. #681 (@brainlock)

## How to update?
examples/pre_tokenized_text.rs (new file, 140 lines)
@@ -0,0 +1,140 @@
// # Pre-tokenized text example
//
// This example shows how to use pre-tokenized text. Sometimes you might
// want to index and search through text which is already split into
// tokens by some external tool.
//
// In this example we will:
// - use the tantivy tokenizer to create tokens and load them directly into tantivy,
// - import tokenized text straight from JSON,
// - perform a search on documents with pre-tokenized text.

use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};

use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;

fn pre_tokenize_text(text: &str) -> Vec<Token> {
    let mut token_stream = SimpleTokenizer.token_stream(text);
    let mut tokens = vec![];
    while token_stream.advance() {
        tokens.push(token_stream.token().clone());
    }
    tokens
}

fn main() -> tantivy::Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();

    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);

    let schema = schema_builder.build();

    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    // We can create a document manually, by setting the fields
    // one by one in a Document object.
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let title_text = "The Old Man and the Sea";
    let body_text = "He was an old man who fished alone in a skiff in the Gulf Stream";

    // Content of our first document.
    // We create a `PreTokenizedString`, which contains the original text and a vector of tokens.
    let title_tok = PreTokenizedString {
        text: String::from(title_text),
        tokens: pre_tokenize_text(title_text),
    };

    println!(
        "Original text: \"{}\" and tokens: {:?}",
        title_tok.text, title_tok.tokens
    );

    let body_tok = PreTokenizedString {
        text: String::from(body_text),
        tokens: pre_tokenize_text(body_text),
    };

    // Now let's create a document and add our `PreTokenizedString` using
    // the `add_pre_tokenized_text` method of `Document`.
    let mut old_man_doc = Document::default();
    old_man_doc.add_pre_tokenized_text(title, &title_tok);
    old_man_doc.add_pre_tokenized_text(body, &body_tok);

    // ... now let's just add it to the IndexWriter.
    index_writer.add_document(old_man_doc);

    // Pre-tokenized text can also be fed in as JSON.
    let short_man_json = r#"{
        "title":[{
            "text":"The Old Man",
            "tokens":[
                {"offset_from":0,"offset_to":3,"position":0,"text":"The","position_length":1},
                {"offset_from":4,"offset_to":7,"position":1,"text":"Old","position_length":1},
                {"offset_from":8,"offset_to":11,"position":2,"text":"Man","position_length":1}
            ]
        }]
    }"#;

    let short_man_doc = schema.parse_document(&short_man_json)?;

    index_writer.add_document(short_man_doc);

    // Let's commit changes.
    index_writer.commit()?;

    // ... and now is the time to query our index.

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

    // We want to get documents with the token "Man"; we will use a TermQuery to do it.
    // Using PreTokenizedString means the tokens are stored as-is, avoiding stemming
    // and lowercasing, which preserves full words in their original form.
    let query = TermQuery::new(
        Term::from_field_text(title, "Man"),
        IndexRecordOption::Basic,
    );

    let (top_docs, count) = searcher
        .search(&query, &(TopDocs::with_limit(2), Count))
        .unwrap();

    assert_eq!(count, 2);

    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("Document: {}", schema.to_json(&retrieved_doc));
    }

    // In contrast to the previous query, when we search for the "man" term we
    // should get no results, as it's not one of the indexed tokens. SimpleTokenizer
    // only splits text on whitespace / punctuation.

    let query = TermQuery::new(
        Term::from_field_text(title, "man"),
        IndexRecordOption::Basic,
    );

    let (_top_docs, count) = searcher
        .search(&query, &(TopDocs::with_limit(2), Count))
        .unwrap();

    assert_eq!(count, 0);

    Ok(())
}
src/indexer/segment_writer.rs

@@ -13,7 +13,8 @@ use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::FacetTokenizer;
use crate::tokenizer::{TokenStream, Tokenizer};
use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer};
use crate::DocId;
use crate::Opstamp;
use crate::Result;

@@ -158,26 +159,43 @@ impl SegmentWriter {
                }
            }
            FieldType::Str(_) => {
                let num_tokens = if let Some(ref mut tokenizer) =
                    self.tokenizers[field.field_id() as usize]
                {
                    let texts: Vec<&str> = field_values
                        .iter()
                        .flat_map(|field_value| match *field_value.value() {
                            Value::Str(ref text) => Some(text.as_str()),
                            _ => None,
                        })
                        .collect();
                    if texts.is_empty() {
                        0
                    } else {
                        let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
                        self.multifield_postings
                            .index_text(doc_id, field, &mut token_stream)
                let mut token_streams: Vec<Box<dyn TokenStream>> = vec![];
                let mut offsets = vec![];
                let mut total_offset = 0;

                for field_value in field_values {
                    match field_value.value() {
                        Value::PreTokStr(tok_str) => {
                            offsets.push(total_offset);
                            if let Some(last_token) = tok_str.tokens.last() {
                                total_offset += last_token.offset_to;
                            }
                            token_streams
                                .push(Box::new(PreTokenizedStream::from(tok_str.clone())));
                        }
                        Value::Str(ref text) => {
                            if let Some(ref mut tokenizer) =
                                self.tokenizers[field.field_id() as usize]
                            {
                                offsets.push(total_offset);
                                total_offset += text.len();

                                token_streams.push(tokenizer.token_stream(text));
                            }
                        }
                        _ => (),
                    }
                } else {
                }

                let num_tokens = if token_streams.is_empty() {
                    0
                } else {
                    let mut token_stream: Box<dyn TokenStream> =
                        Box::new(TokenStreamChain::new(offsets, token_streams));
                    self.multifield_postings
                        .index_text(doc_id, field, &mut token_stream)
                };

                self.fieldnorms_writer.record(doc_id, field, num_tokens);
            }
            FieldType::U64(ref int_option) => {
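The new indexing path above builds one token stream per field value and chains them with `TokenStreamChain`, recording a starting offset for each value so that tokens from later values land after the previous value's last token. A minimal standalone sketch of that offset bookkeeping for pre-tokenized values (the helper function is illustrative and not part of this diff):

```rust
use tantivy::tokenizer::PreTokenizedString;

// Illustrative: compute the starting offset of each chained value, advancing
// by the `offset_to` of the previous value's last token, as the loop above
// does for `Value::PreTokStr` field values.
fn chain_offsets(values: &[PreTokenizedString]) -> Vec<usize> {
    let mut offsets = Vec::with_capacity(values.len());
    let mut total_offset = 0;
    for value in values {
        offsets.push(total_offset);
        if let Some(last_token) = value.tokens.last() {
            total_offset += last_token.offset_to;
        }
    }
    offsets
}
```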
src/schema/document.rs

@@ -1,6 +1,7 @@
use super::*;
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use itertools::Itertools;
use std::io::{self, Read, Write};

@@ -78,6 +79,16 @@ impl Document {
        self.add(FieldValue::new(field, value));
    }

    /// Add a pre-tokenized text field.
    pub fn add_pre_tokenized_text(
        &mut self,
        field: Field,
        pre_tokenized_text: &PreTokenizedString,
    ) {
        let value = Value::PreTokStr(pre_tokenized_text.clone());
        self.add(FieldValue::new(field, value));
    }

    /// Add a u64 field
    pub fn add_u64(&mut self, field: Field, value: u64) {
        self.add(FieldValue::new(field, Value::U64(value)));
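For reference, a condensed sketch of calling the new method, mirroring the usage in examples/pre_tokenized_text.rs above (the field and token values are illustrative):

```rust
use tantivy::schema::{Document, Field};
use tantivy::tokenizer::{PreTokenizedString, Token};

// Illustrative: attach a hand-built PreTokenizedString to a document field.
fn add_pre_tokenized_title(doc: &mut Document, title: Field) {
    let title_tok = PreTokenizedString {
        text: String::from("The Old Man"),
        tokens: vec![Token {
            offset_from: 0,
            offset_to: 3,
            position: 0,
            text: String::from("The"),
            position_length: 1,
        }],
    };
    doc.add_pre_tokenized_text(title, &title_tok);
}
```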
src/schema/field_type.rs

@@ -1,11 +1,11 @@
use base64::decode;

use crate::schema::{IntOptions, TextOptions};

use crate::schema::Facet;
use crate::schema::IndexRecordOption;
use crate::schema::TextFieldIndexing;
use crate::schema::Value;
use crate::schema::{IntOptions, TextOptions};
use crate::tokenizer::PreTokenizedString;
use serde_json::Value as JsonValue;

/// Possible error that may occur while parsing a field value

@@ -169,6 +169,28 @@ impl FieldType {
                    Err(ValueParsingError::TypeError(msg))
                }
            },
            JsonValue::Object(_) => match *self {
                FieldType::Str(_) => {
                    if let Ok(tok_str_val) =
                        serde_json::from_value::<PreTokenizedString>(json.clone())
                    {
                        Ok(Value::PreTokStr(tok_str_val))
                    } else {
                        let msg = format!(
                            "Json value {:?} cannot be translated to PreTokenizedString.",
                            json
                        );
                        Err(ValueParsingError::TypeError(msg))
                    }
                }
                _ => {
                    let msg = format!(
                        "Json value not supported error {:?}. Expected {:?}",
                        json, self
                    );
                    Err(ValueParsingError::TypeError(msg))
                }
            },
            _ => {
                let msg = format!(
                    "Json value not supported error {:?}. Expected {:?}",

@@ -184,7 +206,9 @@ impl FieldType {
mod tests {
    use super::FieldType;
    use crate::schema::field_type::ValueParsingError;
    use crate::schema::TextOptions;
    use crate::schema::Value;
    use crate::tokenizer::{PreTokenizedString, Token};

    #[test]
    fn test_bytes_value_from_json() {

@@ -205,4 +229,71 @@ mod tests {
            _ => panic!("Expected parse failure for invalid base64"),
        }
    }

    #[test]
    fn test_pre_tok_str_value_from_json() {
        let pre_tokenized_string_json = r#"{
  "text": "The Old Man",
  "tokens": [
    {
      "offset_from": 0,
      "offset_to": 3,
      "position": 0,
      "text": "The",
      "position_length": 1
    },
    {
      "offset_from": 4,
      "offset_to": 7,
      "position": 1,
      "text": "Old",
      "position_length": 1
    },
    {
      "offset_from": 8,
      "offset_to": 11,
      "position": 2,
      "text": "Man",
      "position_length": 1
    }
  ]
}"#;

        let expected_value = Value::PreTokStr(PreTokenizedString {
            text: String::from("The Old Man"),
            tokens: vec![
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: String::from("The"),
                    position_length: 1,
                },
                Token {
                    offset_from: 4,
                    offset_to: 7,
                    position: 1,
                    text: String::from("Old"),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 11,
                    position: 2,
                    text: String::from("Man"),
                    position_length: 1,
                },
            ],
        });

        let deserialized_value = FieldType::Str(TextOptions::default())
            .value_from_json(&serde_json::from_str(pre_tokenized_string_json).unwrap())
            .unwrap();

        assert_eq!(deserialized_value, expected_value);

        let serialized_value_json = serde_json::to_string_pretty(&expected_value).unwrap();

        assert_eq!(serialized_value_json, pre_tokenized_string_json);
    }
}
src/schema/value.rs

@@ -1,4 +1,5 @@
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};

@@ -10,6 +11,8 @@ use std::{cmp::Ordering, fmt};
pub enum Value {
    /// The str type is used for any text information.
    Str(String),
    /// Pre-tokenized str type.
    PreTokStr(PreTokenizedString),
    /// Unsigned 64-bits Integer `u64`
    U64(u64),
    /// Signed 64-bits Integer `i64`

@@ -29,6 +32,7 @@ impl Ord for Value {
    fn cmp(&self, other: &Self) -> Ordering {
        match (self, other) {
            (Value::Str(l), Value::Str(r)) => l.cmp(r),
            (Value::PreTokStr(l), Value::PreTokStr(r)) => l.cmp(r),
            (Value::U64(l), Value::U64(r)) => l.cmp(r),
            (Value::I64(l), Value::I64(r)) => l.cmp(r),
            (Value::Date(l), Value::Date(r)) => l.cmp(r),

@@ -44,6 +48,8 @@ impl Ord for Value {
            }
            (Value::Str(_), _) => Ordering::Less,
            (_, Value::Str(_)) => Ordering::Greater,
            (Value::PreTokStr(_), _) => Ordering::Less,
            (_, Value::PreTokStr(_)) => Ordering::Greater,
            (Value::U64(_), _) => Ordering::Less,
            (_, Value::U64(_)) => Ordering::Greater,
            (Value::I64(_), _) => Ordering::Less,

@@ -65,6 +71,7 @@ impl Serialize for Value {
    {
        match *self {
            Value::Str(ref v) => serializer.serialize_str(v),
            Value::PreTokStr(ref v) => v.serialize(serializer),
            Value::U64(u) => serializer.serialize_u64(u),
            Value::I64(u) => serializer.serialize_i64(u),
            Value::F64(u) => serializer.serialize_f64(u),

@@ -124,6 +131,15 @@ impl Value {
        }
    }

    /// Returns the tokenized text, provided the value is of the `PreTokStr` type.
    /// (Returns None if the value is not of the `PreTokStr` type).
    pub fn tokenized_text(&self) -> Option<&PreTokenizedString> {
        match *self {
            Value::PreTokStr(ref tok_text) => Some(tok_text),
            _ => None,
        }
    }

    /// Returns the u64-value, provided the value is of the `U64` type.
    ///
    /// # Panics

@@ -221,6 +237,7 @@ mod binary_serialize {
    use super::Value;
    use crate::common::{f64_to_u64, u64_to_f64, BinarySerializable};
    use crate::schema::Facet;
    use crate::tokenizer::PreTokenizedString;
    use chrono::{TimeZone, Utc};
    use std::io::{self, Read, Write};

@@ -231,6 +248,11 @@ mod binary_serialize {
    const BYTES_CODE: u8 = 4;
    const DATE_CODE: u8 = 5;
    const F64_CODE: u8 = 6;
    const EXT_CODE: u8 = 7;

    // extended types

    const TOK_STR_CODE: u8 = 0;

    impl BinarySerializable for Value {
        fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {

@@ -239,6 +261,18 @@ mod binary_serialize {
                    TEXT_CODE.serialize(writer)?;
                    text.serialize(writer)
                }
                Value::PreTokStr(ref tok_str) => {
                    EXT_CODE.serialize(writer)?;
                    TOK_STR_CODE.serialize(writer)?;
                    if let Ok(text) = serde_json::to_string(tok_str) {
                        text.serialize(writer)
                    } else {
                        Err(io::Error::new(
                            io::ErrorKind::Other,
                            "Failed to dump Value::PreTokStr(_) to json.",
                        ))
                    }
                }
                Value::U64(ref val) => {
                    U64_CODE.serialize(writer)?;
                    val.serialize(writer)

@@ -290,6 +324,30 @@ mod binary_serialize {
            }
            HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
            BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),
            EXT_CODE => {
                let ext_type_code = u8::deserialize(reader)?;
                match ext_type_code {
                    TOK_STR_CODE => {
                        let str_val = String::deserialize(reader)?;
                        if let Ok(value) = serde_json::from_str::<PreTokenizedString>(&str_val)
                        {
                            Ok(Value::PreTokStr(value))
                        } else {
                            Err(io::Error::new(
                                io::ErrorKind::Other,
                                "Failed to parse string data as Value::PreTokStr(_).",
                            ))
                        }
                    }
                    _ => Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        format!(
                            "No extended field type is associated with code {:?}",
                            ext_type_code
                        ),
                    )),
                }
            }
            _ => Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("No field type is associated with code {:?}", type_code),
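Both directions of the PreTokStr codec above lean on serde_json for the payload: the writer stores the struct as a JSON string after the two tag bytes, and the reader parses that string back. A minimal sketch of that payload round trip in isolation (illustrative; it does not reproduce tantivy's BinarySerializable framing):

```rust
use tantivy::tokenizer::PreTokenizedString;

// Illustrative: the binary codec stores the PreTokenizedString as a JSON string,
// so encoding and decoding the payload is plain serde_json.
fn json_round_trip(value: &PreTokenizedString) -> serde_json::Result<PreTokenizedString> {
    let json = serde_json::to_string(value)?;
    serde_json::from_str(&json)
}
```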
src/tokenizer/mod.rs

@@ -136,6 +136,7 @@ mod simple_tokenizer;
mod stemmer;
mod stop_word_filter;
mod token_stream_chain;
mod tokenized_string;
mod tokenizer;
mod tokenizer_manager;

@@ -152,7 +153,9 @@ pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub use self::tokenizer::BoxedTokenizer;

pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

pub use self::tokenizer_manager::TokenizerManager;

/// Maximum authorized len (in bytes) for a token.
src/tokenizer/tokenized_string.rs (new file, 191 lines)
@@ -0,0 +1,191 @@
use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
use std::cmp::Ordering;

/// Struct representing pre-tokenized text
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct PreTokenizedString {
    /// Original text
    pub text: String,
    /// Tokens derived from the text
    pub tokens: Vec<Token>,
}

impl Ord for PreTokenizedString {
    fn cmp(&self, other: &Self) -> Ordering {
        self.text.cmp(&other.text)
    }
}

impl PartialOrd for PreTokenizedString {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

/// TokenStream implementation which wraps PreTokenizedString
pub struct PreTokenizedStream {
    tokenized_string: PreTokenizedString,
    current_token: i64,
}

impl From<PreTokenizedString> for PreTokenizedStream {
    fn from(s: PreTokenizedString) -> PreTokenizedStream {
        PreTokenizedStream {
            tokenized_string: s,
            current_token: -1,
        }
    }
}

impl PreTokenizedStream {
    /// Creates a TokenStream from PreTokenizedString array
    pub fn chain_tokenized_strings<'a>(
        tok_strings: &'a [&'a PreTokenizedString],
    ) -> Box<dyn TokenStream + 'a> {
        if tok_strings.len() == 1 {
            return Box::new(PreTokenizedStream::from((*tok_strings[0]).clone()));
        }
        let mut offsets = vec![];
        let mut total_offset = 0;
        for &tok_string in tok_strings {
            offsets.push(total_offset);
            if let Some(last_token) = tok_string.tokens.last() {
                total_offset += last_token.offset_to;
            }
        }
        let token_streams: Vec<_> = tok_strings
            .iter()
            .map(|tok_string| PreTokenizedStream::from((*tok_string).clone()))
            .collect();
        Box::new(TokenStreamChain::new(offsets, token_streams))
    }
}

impl TokenStream for PreTokenizedStream {
    fn advance(&mut self) -> bool {
        if self.current_token >= self.tokenized_string.tokens.len() as i64 - 1 {
            // This was our last token.
            return false;
        }
        self.current_token += 1;
        true
    }

    fn token(&self) -> &Token {
        assert!(
            self.current_token >= 0,
            "TokenStream not initialized. You should call advance() at least once."
        );
        &self.tokenized_string.tokens[self.current_token as usize]
    }

    fn token_mut(&mut self) -> &mut Token {
        assert!(
            self.current_token >= 0,
            "TokenStream not initialized. You should call advance() at least once."
        );
        &mut self.tokenized_string.tokens[self.current_token as usize]
    }
}

#[cfg(test)]
mod tests {

    use super::*;

    use crate::tokenizer::Token;

    #[test]
    fn test_tokenized_stream() {
        let tok_text = PreTokenizedString {
            text: String::from("A a"),
            tokens: vec![
                Token {
                    offset_from: 0,
                    offset_to: 1,
                    position: 0,
                    text: String::from("A"),
                    position_length: 1,
                },
                Token {
                    offset_from: 2,
                    offset_to: 3,
                    position: 1,
                    text: String::from("a"),
                    position_length: 1,
                },
            ],
        };

        let mut tok_stream = PreTokenizedStream::from(tok_text.clone());

        let mut i = 0;
        while tok_stream.advance() {
            assert!(*tok_stream.token() == tok_text.tokens[i]);
            i += 1;
        }
    }

    #[test]
    fn test_chain_tokenized_strings() {
        let tok_text = PreTokenizedString {
            text: String::from("A a"),
            tokens: vec![
                Token {
                    offset_from: 0,
                    offset_to: 1,
                    position: 0,
                    text: String::from("A"),
                    position_length: 1,
                },
                Token {
                    offset_from: 2,
                    offset_to: 3,
                    position: 1,
                    text: String::from("a"),
                    position_length: 1,
                },
            ],
        };

        let chain_parts = vec![&tok_text, &tok_text];

        let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);

        let expected_tokens = vec![
            Token {
                offset_from: 0,
                offset_to: 1,
                position: 0,
                text: String::from("A"),
                position_length: 1,
            },
            Token {
                offset_from: 2,
                offset_to: 3,
                position: 1,
                text: String::from("a"),
                position_length: 1,
            },
            Token {
                offset_from: 3,
                offset_to: 4,
                position: 3,
                text: String::from("A"),
                position_length: 1,
            },
            Token {
                offset_from: 5,
                offset_to: 6,
                position: 4,
                text: String::from("a"),
                position_length: 1,
            },
        ];
        for expected_token in expected_tokens {
            assert!(token_stream.advance());
            assert_eq!(token_stream.token(), &expected_token);
        }
        assert!(!token_stream.advance());
    }
}
src/tokenizer/tokenizer.rs

@@ -4,7 +4,7 @@ use crate::tokenizer::TokenStreamChain;
use std::borrow::{Borrow, BorrowMut};

/// Token
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct Token {
    /// Offset (byte index) of the first character of the token.
    /// Offsets shall not be modified by token filters.