Introducing a wrapper struct instead of Boxed<BoxableTokenizer> (#631)

Closes #629
This commit is contained in:
Paul Masurel
2019-08-15 16:37:04 +09:00
committed by GitHub
parent b3b0138b82
commit 039c0a0863
7 changed files with 73 additions and 55 deletions

View File

@@ -6,7 +6,11 @@ Tantivy 0.11.0
- Better handling of hyphens in query parser. (#609)
- Better handling of whitespaces.
- Closes #498 - add support for Elastic-style unbounded range queries for alphanumeric types eg. "title:>hello", "weight:>=70.5", "height:<200" (@petr-tik)
- API change around `Box<dyn BoxedTokenizer>`. See details in #629.
## How to update?
`Box<dyn BoxedTokenizer>` has been replaced by a `BoxedTokenizer` struct.
Tantivy 0.10.1
=====================

View File

@@ -173,11 +173,11 @@ impl Index {
}
/// Helper to access the tokenizer associated to a specific field.
pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<dyn BoxedTokenizer>> {
pub fn tokenizer_for_field(&self, field: Field) -> Result<BoxedTokenizer> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers();
let tokenizer_name_opt: Option<Box<dyn BoxedTokenizer>> = match field_type {
let tokenizer_name_opt: Option<BoxedTokenizer> = match field_type {
FieldType::Str(text_options) => text_options
.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())

View File

@@ -49,7 +49,7 @@ pub struct SegmentWriter {
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<Box<dyn BoxedTokenizer>>>,
tokenizers: Vec<Option<BoxedTokenizer>>,
}
impl SegmentWriter {

View File

@@ -63,7 +63,7 @@ impl FragmentCandidate {
fn try_add_token(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
self.stop_offset = token.offset_to;
if let Some(score) = terms.get(&token.text.to_lowercase()) {
if let Some(&score) = terms.get(&token.text.to_lowercase()) {
self.score += score;
self.highlighted
.push(HighlightSection::new(token.offset_from, token.offset_to));
@@ -142,7 +142,7 @@ impl Snippet {
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
/// has to be a valid string.
fn search_fragments<'a>(
tokenizer: &dyn BoxedTokenizer,
tokenizer: &BoxedTokenizer,
text: &'a str,
terms: &BTreeMap<String, f32>,
max_num_chars: usize,
@@ -150,7 +150,6 @@ fn search_fragments<'a>(
let mut token_stream = tokenizer.token_stream(text);
let mut fragment = FragmentCandidate::new(0);
let mut fragments: Vec<FragmentCandidate> = vec![];
while let Some(next) = token_stream.next() {
if (next.offset_to - fragment.start_offset) > max_num_chars {
if fragment.score > 0.0 {
@@ -254,7 +253,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// ```
pub struct SnippetGenerator {
terms_text: BTreeMap<String, f32>,
tokenizer: Box<dyn BoxedTokenizer>,
tokenizer: BoxedTokenizer,
field: Field,
max_num_chars: usize,
}
@@ -316,12 +315,8 @@ impl SnippetGenerator {
/// Generates a snippet for the given text.
pub fn snippet(&self, text: &str) -> Snippet {
let fragment_candidates = search_fragments(
&*self.tokenizer,
&text,
&self.terms_text,
self.max_num_chars,
);
let fragment_candidates =
search_fragments(&self.tokenizer, &text, &self.terms_text, self.max_num_chars);
select_best_fragment_combination(&fragment_candidates[..], &text)
}
}
@@ -331,7 +326,7 @@ mod tests {
use super::{search_fragments, select_best_fragment_combination};
use crate::query::QueryParser;
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
use crate::tokenizer::{box_tokenizer, SimpleTokenizer};
use crate::tokenizer::SimpleTokenizer;
use crate::Index;
use crate::SnippetGenerator;
use maplit::btreemap;
@@ -355,12 +350,12 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let terms = btreemap! {
String::from("rust") => 1.0,
String::from("language") => 0.9
};
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100);
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 100);
assert_eq!(fragments.len(), 7);
{
let first = &fragments[0];
@@ -382,13 +377,13 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_scored_fragment() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
{
let terms = btreemap! {
String::from("rust") =>1.0f32,
String::from("language") => 0.9f32
};
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20);
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
{
let first = &fragments[0];
assert_eq!(first.score, 1.0);
@@ -397,13 +392,13 @@ Survey in 2016, 2017, and 2018."#;
let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems")
}
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
{
let terms = btreemap! {
String::from("rust") =>0.9f32,
String::from("language") => 1.0f32
};
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20);
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
//assert_eq!(fragments.len(), 7);
{
let first = &fragments[0];
@@ -417,14 +412,14 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_in_second_fragment() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let text = "a b c d e f g";
let mut terms = BTreeMap::new();
terms.insert(String::from("c"), 1.0);
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 1);
{
@@ -441,14 +436,14 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_with_term_at_the_end_of_fragment() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let text = "a b c d e f f g";
let mut terms = BTreeMap::new();
terms.insert(String::from("f"), 1.0);
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 2);
{
@@ -465,7 +460,7 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_with_second_fragment_has_the_highest_score() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let text = "a b c d e f g";
@@ -473,7 +468,7 @@ Survey in 2016, 2017, and 2018."#;
terms.insert(String::from("f"), 1.0);
terms.insert(String::from("a"), 0.9);
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 7);
assert_eq!(fragments.len(), 2);
{
@@ -490,14 +485,14 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_with_term_not_in_text() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let text = "a b c d";
let mut terms = BTreeMap::new();
terms.insert(String::from("z"), 1.0);
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 0);
@@ -508,12 +503,12 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_with_no_terms() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let text = "a b c d";
let terms = BTreeMap::new();
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], &text);

View File

@@ -155,7 +155,6 @@ pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub(crate) use self::tokenizer::box_tokenizer;
pub use self::tokenizer::BoxedTokenizer;
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

View File

@@ -56,8 +56,6 @@ pub trait Tokenizer<'a>: Sized + Clone {
/// # Example
///
/// ```rust
/// # extern crate tantivy;
///
/// use tantivy::tokenizer::*;
///
/// # fn main() {
@@ -80,7 +78,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
}
/// A boxed tokenizer
pub trait BoxedTokenizer: Send + Sync {
trait BoxedTokenizerTrait: Send + Sync {
/// Tokenize a `&str`
fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
@@ -92,7 +90,41 @@ pub trait BoxedTokenizer: Send + Sync {
fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b>;
/// Return a boxed clone of the tokenizer
fn boxed_clone(&self) -> Box<dyn BoxedTokenizer>;
fn boxed_clone(&self) -> BoxedTokenizer;
}
/// A type-erased tokenizer.
///
/// This struct wraps a `Box<dyn BoxedTokenizerTrait>`, so a tokenizer can be stored,
/// cloned and used without naming its concrete type.
/// Build one from a concrete tokenizer with `BoxedTokenizer::from(tokenizer)`
/// or `tokenizer.into()`.
pub struct BoxedTokenizer(Box<dyn BoxedTokenizerTrait>);
/// Converts any tokenizer that is `'static + Send + Sync` into a `BoxedTokenizer`.
impl<T> From<T> for BoxedTokenizer
where
T: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
/// Moves `tokenizer` onto the heap and erases its concrete type.
fn from(tokenizer: T) -> BoxedTokenizer {
// `BoxableTokenizer` is the private adapter implementing `BoxedTokenizerTrait`.
BoxedTokenizer(Box::new(BoxableTokenizer(tokenizer)))
}
}
impl BoxedTokenizer {
/// Tokenize a `&str`.
///
/// The call is forwarded to the wrapped tokenizer.
pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
self.0.token_stream(text)
}
/// Tokenize an array of `&str`.
///
/// The resulting `TokenStream` is equivalent to what would be obtained if the `&str`s were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent an accidental `PhraseQuery` from matching across two terms.
pub fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b> {
self.0.token_stream_texts(texts)
}
}
/// Clones the tokenizer by asking the wrapped trait object for a boxed copy of itself.
impl Clone for BoxedTokenizer {
fn clone(&self) -> BoxedTokenizer {
// Delegates to `BoxedTokenizerTrait::boxed_clone`, which already returns a `BoxedTokenizer`.
self.0.boxed_clone()
}
}
#[derive(Clone)]
@@ -100,7 +132,7 @@ struct BoxableTokenizer<A>(A)
where
A: for<'a> Tokenizer<'a> + Send + Sync;
impl<A> BoxedTokenizer for BoxableTokenizer<A>
impl<A> BoxedTokenizerTrait for BoxableTokenizer<A>
where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
@@ -125,18 +157,11 @@ where
}
}
fn boxed_clone(&self) -> Box<dyn BoxedTokenizer> {
Box::new(self.clone())
fn boxed_clone(&self) -> BoxedTokenizer {
self.0.clone().into()
}
}
pub(crate) fn box_tokenizer<A>(a: A) -> Box<dyn BoxedTokenizer>
where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
Box::new(BoxableTokenizer(a))
}
impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
@@ -161,7 +186,6 @@ impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
/// # Example
///
/// ```
/// extern crate tantivy;
/// use tantivy::tokenizer::*;
///
/// # fn main() {
@@ -203,7 +227,6 @@ pub trait TokenStream {
/// and `.token()`.
///
/// ```
/// # extern crate tantivy;
/// # use tantivy::tokenizer::*;
/// #
/// # fn main() {

View File

@@ -1,4 +1,3 @@
use crate::tokenizer::box_tokenizer;
use crate::tokenizer::stemmer::Language;
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::LowerCaser;
@@ -8,7 +7,6 @@ use crate::tokenizer::SimpleTokenizer;
use crate::tokenizer::Stemmer;
use crate::tokenizer::Tokenizer;
use std::collections::HashMap;
use std::ops::Deref;
use std::sync::{Arc, RwLock};
/// The tokenizer manager serves as a store for
@@ -25,16 +23,16 @@ use std::sync::{Arc, RwLock};
/// search engine.
#[derive(Clone)]
pub struct TokenizerManager {
tokenizers: Arc<RwLock<HashMap<String, Box<dyn BoxedTokenizer>>>>,
tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}
impl TokenizerManager {
/// Registers a new tokenizer associated with a given name.
pub fn register<A>(&self, tokenizer_name: &str, tokenizer: A)
where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
A: Into<BoxedTokenizer>,
{
let boxed_tokenizer = box_tokenizer(tokenizer);
let boxed_tokenizer = tokenizer.into();
self.tokenizers
.write()
.expect("Acquiring the lock should never fail")
@@ -42,13 +40,12 @@ impl TokenizerManager {
}
/// Accessing a tokenizer given its name.
pub fn get(&self, tokenizer_name: &str) -> Option<Box<dyn BoxedTokenizer>> {
pub fn get(&self, tokenizer_name: &str) -> Option<BoxedTokenizer> {
self.tokenizers
.read()
.expect("Acquiring the lock should never fail")
.get(tokenizer_name)
.map(Deref::deref)
.map(BoxedTokenizer::boxed_clone)
.cloned()
}
}