cargo fmt

This commit is contained in:
Paul Masurel
2017-11-26 11:02:02 +09:00
parent f30ec9b36b
commit 974c321153
23 changed files with 236 additions and 223 deletions

View File

@@ -36,12 +36,12 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
let mut schema_builder = SchemaBuilder::default();
// Our first field is title.
// We want full-text search for it, and we also want
// We want full-text search for it, and we also want
// to be able to retrieve the document after the search.
//
//
// TEXT | STORED is some syntactic sugar to describe
// that.
//
//
// `TEXT` means the field should be tokenized and indexed,
// along with its term frequency and term positions.
//
@@ -52,11 +52,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
schema_builder.add_text_field("title", TEXT | STORED);
// Our second field is body.
// We want full-text search for it, but we do not
// We want full-text search for it, but we do not
// need to be able to retrieve it
// for our application.
// for our application.
//
// We can make our index lighter
// We can make our index lighter
// by omitting the `STORED` flag.
schema_builder.add_text_field("body", TEXT);

View File

@@ -38,7 +38,7 @@ pub struct Index {
directory: ManagedDirectory,
schema: Schema,
searcher_pool: Arc<Pool<Searcher>>,
tokenizers: TokenizerManager
tokenizers: TokenizerManager,
}
@@ -259,7 +259,7 @@ impl Clone for Index {
directory: self.directory.clone(),
schema: self.schema.clone(),
searcher_pool: self.searcher_pool.clone(),
tokenizers: self.tokenizers.clone()
tokenizers: self.tokenizers.clone(),
}
}
}

View File

@@ -138,11 +138,7 @@ impl InvertedIndexReader {
/// For instance, requesting `IndexRecordOption::Freq` for a
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
/// with `DocId`s and frequencies.
pub fn read_postings(
&self,
term: &Term,
option: IndexRecordOption,
) -> Option<SegmentPostings> {
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(term));

View File

@@ -36,7 +36,6 @@ pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
}
impl Segment {
/// Returns the index the segment belongs to.
pub fn index(&self) -> &Index {
&self.index

View File

@@ -269,10 +269,8 @@ impl IndexMerger {
let field_entry = self.schema.get_field_entry(indexed_field);
// ... set the segment postings option for the new field.
let segment_postings_option = field_entry
.field_type()
.get_index_record_option()
.expect(
let segment_postings_option =
field_entry.field_type().get_index_record_option().expect(
"Encountered a field that is not supposed to be
indexed. Have you modified the schema?",
);
@@ -405,9 +403,11 @@ mod tests {
fn test_index_merger_no_deletes() {
let mut schema_builder = schema::SchemaBuilder::default();
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs))
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast();
@@ -539,9 +539,9 @@ mod tests {
fn test_index_merger_with_deletes() {
let mut schema_builder = schema::SchemaBuilder::default();
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqs))
.set_indexing_options(TextFieldIndexing::default().set_index_option(
IndexRecordOption::WithFreqs,
))
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast();

View File

@@ -31,7 +31,7 @@ pub struct SegmentWriter<'a> {
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FastFieldsWriter,
doc_opstamps: Vec<u64>,
tokenizers: Vec<Option<Box<BoxedTokenizer>>>
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
}
@@ -57,40 +57,40 @@ impl<'a> SegmentWriter<'a> {
/// the flushing behavior as a buffer limit
/// - segment: The segment being written
/// - schema
pub fn for_segment(heap: &'a Heap,
table_bits: usize,
mut segment: Segment,
schema: &Schema)
-> Result<SegmentWriter<'a>> {
pub fn for_segment(
heap: &'a Heap,
table_bits: usize,
mut segment: Segment,
schema: &Schema,
) -> Result<SegmentWriter<'a>> {
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
let tokenizers = schema.fields()
let tokenizers = schema
.fields()
.iter()
.map(|field_entry| field_entry.field_type())
.map(|field_type| {
match field_type {
&FieldType::Str(ref text_options) => {
text_options
.get_indexing_options()
.and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer();
segment.index().tokenizers().get(tokenizer_name)
})
}
_ => None,
.map(|field_type| match field_type {
&FieldType::Str(ref text_options) => {
text_options.get_indexing_options().and_then(
|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer();
segment.index().tokenizers().get(tokenizer_name)
},
)
}
_ => None,
})
.collect();
Ok(SegmentWriter {
heap: heap,
max_doc: 0,
multifield_postings: multifield_postings,
fieldnorms_writer: create_fieldnorms_writer(schema),
segment_serializer: segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
tokenizers: tokenizers,
})
heap: heap,
max_doc: 0,
multifield_postings: multifield_postings,
fieldnorms_writer: create_fieldnorms_writer(schema),
segment_serializer: segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
tokenizers: tokenizers,
})
}
/// Lay on disk the current content of the `SegmentWriter`
@@ -147,23 +147,25 @@ impl<'a> SegmentWriter<'a> {
FieldType::Str(_) => {
let num_tokens =
if let Some(ref mut tokenizer) = self.tokenizers[field.0 as usize] {
let texts: Vec<&str> = field_values.iter()
.flat_map(|field_value| {
match field_value.value() {
&Value::Str(ref text) => Some(text.as_str()),
_ => None
}
let texts: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match field_value.value() {
&Value::Str(ref text) => Some(text.as_str()),
_ => None,
})
.collect();
let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
self.multifield_postings.index_text(doc_id, field, &mut token_stream)
}
else {
self.multifield_postings.index_text(
doc_id,
field,
&mut token_stream,
)
} else {
0
};
self.fieldnorms_writer
.get_field_writer(field)
.map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64));
self.fieldnorms_writer.get_field_writer(field).map(
|field_norms_writer| field_norms_writer.add_val(num_tokens as u64),
);
}
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {

View File

@@ -22,9 +22,8 @@ fn posting_from_field_entry<'a>(
match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
text_options
.get_indexing_options()
.map(|indexing_options| {
match indexing_options.index_option() {
.get_indexing_options()
.map(|indexing_options| match indexing_options.index_option() {
IndexRecordOption::Basic => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
@@ -34,11 +33,10 @@ fn posting_from_field_entry<'a>(
IndexRecordOption::WithFreqsAndPositions => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
}
}
})
.unwrap_or_else(|| {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
})
})
.unwrap_or_else(|| {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
})
}
FieldType::U64(_) |
FieldType::I64(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap),
@@ -149,27 +147,29 @@ pub trait PostingsWriter {
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut FieldSerializer,
heap: &Heap)
-> io::Result<()>;
fn serialize(
&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;
/// Tokenize a text and subscribe all of its tokens.
fn index_text<'a>(&mut self,
term_index: &mut HashMap,
doc_id: DocId,
field: Field,
token_stream: &mut TokenStream,
heap: &Heap)
-> u32 {
fn index_text<'a>(
&mut self,
term_index: &mut HashMap,
doc_id: DocId,
field: Field,
token_stream: &mut TokenStream,
heap: &Heap,
) -> u32 {
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
let mut sink = |token: &Token| {
term.set_text(token.text.as_str());
self.suscribe(term_index, doc_id, token.position as u32, &term, heap);
};
token_stream.process(&mut sink)
}
}
@@ -197,7 +197,6 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
}
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
fn suscribe(
&mut self,
term_index: &mut HashMap,

View File

@@ -509,10 +509,8 @@ mod tests {
let inverted_index = segment_reader.inverted_index(int_field);
let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term).unwrap();
let mut block_segments = inverted_index.read_block_postings_from_terminfo(
&term_info,
IndexRecordOption::Basic,
);
let mut block_segments =
inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic);
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());

View File

@@ -133,9 +133,11 @@ impl<'a> FieldSerializer<'a> {
FieldType::Str(ref text_options) => {
if let Some(ref text_indexing_options) = text_options.get_indexing_options() {
let index_option = text_indexing_options.index_option();
(index_option.is_termfreq_enabled(), index_option.is_position_enabled())
}
else {
(
index_option.is_termfreq_enabled(),
index_option.is_position_enabled(),
)
} else {
(false, false)
}
}

View File

@@ -85,9 +85,11 @@ impl QueryParser {
/// * schema - index Schema
/// * default_fields - fields used to search if no field is specifically defined
/// in the query.
pub fn new(schema: Schema,
default_fields: Vec<Field>,
tokenizer_manager: TokenizerManager) -> QueryParser {
pub fn new(
schema: Schema,
default_fields: Vec<Field>,
tokenizer_manager: TokenizerManager,
) -> QueryParser {
QueryParser {
schema,
default_fields,
@@ -100,12 +102,8 @@ impl QueryParser {
/// * an index
/// * a set of default fields - used to search if no field is specifically defined
/// in the query.
pub fn for_index(index: Index,
default_fields: Vec<Field>) -> QueryParser {
QueryParser::new(
index.schema(),
default_fields,
index.tokenizers().clone())
pub fn for_index(index: Index, default_fields: Vec<Field>) -> QueryParser {
QueryParser::new(index.schema(), default_fields, index.tokenizers().clone())
}
/// Set the default way to compose queries to a conjunction.
@@ -181,17 +179,20 @@ impl QueryParser {
Ok(Some(LogicalLiteral::Term(term)))
}
FieldType::Str(ref str_options) => {
if let Some(option) = str_options.get_indexing_options() {
let mut tokenizer = self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| {
QueryParserError::UnknownTokenizer(field_entry.name().to_string(), option.tokenizer().to_string())
})?;
if let Some(option) = str_options.get_indexing_options() {
let mut tokenizer = self.tokenizer_manager.get(option.tokenizer()).ok_or_else(
|| {
QueryParserError::UnknownTokenizer(
field_entry.name().to_string(),
option.tokenizer().to_string(),
)
},
)?;
let mut terms: Vec<Term> = Vec::new();
let mut token_stream = tokenizer.token_stream(phrase);
token_stream.process(&mut |token| {
let term = Term::from_field_text(field, &token.text);
terms.push(term);
let term = Term::from_field_text(field, &token.text);
terms.push(term);
});
if terms.is_empty() {
Ok(None)
@@ -202,10 +203,11 @@ impl QueryParser {
} else {
Ok(Some(LogicalLiteral::Phrase(terms)))
}
}
else {
} else {
// This should have been seen earlier really.
Err(QueryParserError::FieldNotIndexed(field_entry.name().to_string()))
Err(QueryParserError::FieldNotIndexed(
field_entry.name().to_string(),
))
}
}
}
@@ -238,13 +240,11 @@ impl QueryParser {
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
}
UserInputAST::Not(subquery) => {
let (occur, logical_sub_queries) =
self.compute_logical_ast_with_occur(*subquery)?;
let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries))
}
UserInputAST::Must(subquery) => {
let (occur, logical_sub_queries) =
self.compute_logical_ast_with_occur(*subquery)?;
let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
}
UserInputAST::Leaf(literal) => {

View File

@@ -46,9 +46,11 @@ impl FieldType {
pub fn get_index_record_option(&self) -> Option<IndexRecordOption> {
match *self {
FieldType::Str(ref text_options) => {
text_options
.get_indexing_options()
.map(|indexing_options| indexing_options.index_option())
text_options.get_indexing_options().map(
|indexing_options| {
indexing_options.index_option()
},
)
}
FieldType::U64(ref int_options) |
FieldType::I64(ref int_options) => {

View File

@@ -5,10 +5,13 @@
/// It is both used to:
///
/// * describe in the schema the amount of information
/// that should be retained during indexing (See [TextFieldIndexing.html.set_index_option](../schema/struct.TextFieldIndexing.html#method.set_index_option))
/// that should be retained during indexing (See
/// [TextFieldIndexing.html.set_index_option](
/// ../schema/struct.TextFieldIndexing.html#method.set_index_option))
/// * to request for a given
/// amount of information to be decoded as one goes through a posting list.
/// (See [InvertedIndexReader.read_postings](../struct.InvertedIndexReader.html#method.read_postings))
/// (See [InvertedIndexReader.read_postings](
/// ../struct.InvertedIndexReader.html#method.read_postings))
///
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)]
pub enum IndexRecordOption {
@@ -63,4 +66,4 @@ impl IndexRecordOption {
IndexRecordOption::WithFreqsAndPositions => true,
}
}
}
}

View File

@@ -40,8 +40,8 @@ let schema = schema_builder.build();
We can split the problem of generating a search result page into two phases :
* identifying the list of 10 or so documents to be displayed (Conceptually `query -> doc_ids[]`)
* for each of these documents, retrieving the information required to generate the search results page.
(`doc_ids[] -> Document[]`)
* for each of these documents, retrieving the information required to generate
the search results page. (`doc_ids[] -> Document[]`)
In the first phase, the ability to search for documents by the given field is determined by the
[`TextIndexingOptions`](enum.TextIndexingOptions.html) of our [`TextOptions`]

View File

@@ -224,22 +224,16 @@ impl Schema {
match *json_value {
JsonValue::Array(ref json_items) => {
for json_item in json_items {
let value =
field_type
.value_from_json(json_item)
.map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
})?;
let value = field_type.value_from_json(json_item).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
})?;
doc.add(FieldValue::new(field, value));
}
}
_ => {
let value =
field_type
.value_from_json(json_value)
.map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
})?;
let value = field_type.value_from_json(json_value).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
})?;
doc.add(FieldValue::new(field, value));
}

View File

@@ -44,7 +44,7 @@ impl Default for TextOptions {
}
#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]
#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]
pub struct TextFieldIndexing {
record: IndexRecordOption,
tokenizer: Cow<'static, str>,
@@ -88,22 +88,20 @@ impl TextFieldIndexing {
/// The field will be untokenized and indexed
pub const STRING: TextOptions = TextOptions {
indexing: Some(
TextFieldIndexing {
tokenizer: Cow::Borrowed("raw"),
record: IndexRecordOption::Basic,
}),
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("raw"),
record: IndexRecordOption::Basic,
}),
stored: false,
};
/// The field will be tokenized and indexed
pub const TEXT: TextOptions = TextOptions {
indexing: Some(
TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),
record: IndexRecordOption::WithFreqsAndPositions,
}),
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),
record: IndexRecordOption::WithFreqsAndPositions,
}),
stored: false,
};
@@ -149,7 +147,10 @@ mod tests {
match field_entry.field_type() {
&FieldType::Str(ref text_options) => {
assert!(text_options.get_indexing_options().is_some());
assert_eq!(text_options.get_indexing_options().unwrap().tokenizer(), "default");
assert_eq!(
text_options.get_indexing_options().unwrap().tokenizer(),
"default"
);
}
_ => {
panic!("");
@@ -164,5 +165,3 @@ mod tests {
assert!(IndexRecordOption::WithFreqs > IndexRecordOption::Basic);
}
}

View File

@@ -5,7 +5,8 @@ use super::{TokenFilter, TokenStream, Token};
pub struct LowerCaser;
impl<TailTokenStream> TokenFilter<TailTokenStream> for LowerCaser
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
type ResultTokenStream = LowerCaserTokenStream<TailTokenStream>;
@@ -15,7 +16,8 @@ impl<TailTokenStream> TokenFilter<TailTokenStream> for LowerCaser
}
pub struct LowerCaserTokenStream<TailTokenStream>
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
tail: TailTokenStream,
}
@@ -42,7 +44,8 @@ impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
}
impl<TailTokenStream> LowerCaserTokenStream<TailTokenStream>
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
LowerCaserTokenStream { tail: tail }

View File

@@ -163,7 +163,9 @@ mod test {
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.text.clone()); };
en_tokenizer.token_stream("Hello, happy tax payer!").process(&mut add_token);
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);
assert_eq!(&tokens[0], "Hello, happy tax payer!");
@@ -178,7 +180,9 @@ mod test {
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.text.clone()); };
en_tokenizer.token_stream("Hello, happy tax payer!").process(&mut add_token);
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_eq!(&tokens[0], "hello");
@@ -191,11 +195,13 @@ mod test {
fn test_jp_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("ja").unwrap();
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.text.clone()); };
en_tokenizer.token_stream("野菜食べないとやばい!").process(&mut add_token);
en_tokenizer
.token_stream("野菜食べないとやばい!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 5);
assert_eq!(&tokens[0], "野菜");

View File

@@ -18,7 +18,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
offset_from: 0,
offset_to: text.len(),
position: 0,
text: text.to_string()
text: text.to_string(),
};
RawTokenStream {
token: token,
@@ -32,8 +32,7 @@ impl TokenStream for RawTokenStream {
if self.has_token {
self.has_token = false;
true
}
else {
} else {
false
}
}

View File

@@ -1,7 +1,7 @@
use super::{TokenFilter, TokenStream, Token};
/// `RemoveLongFilter` removes tokens that are longer
/// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation).
///
/// It is especially useful when indexing unconstrained content.
@@ -19,15 +19,17 @@ impl RemoveLongFilter {
}
impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
fn predicate(&self, token: &Token) -> bool {
token.text.len() < self.token_length_limit
}
fn wrap(token_length_limit: usize,
tail: TailTokenStream)
-> RemoveLongFilterStream<TailTokenStream> {
fn wrap(
token_length_limit: usize,
tail: TailTokenStream,
) -> RemoveLongFilterStream<TailTokenStream> {
RemoveLongFilterStream {
token_length_limit: token_length_limit,
tail: tail,
@@ -37,7 +39,8 @@ impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;
@@ -47,7 +50,8 @@ impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
}
pub struct RemoveLongFilterStream<TailTokenStream>
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
token_length_limit: usize,
tail: TailTokenStream,

View File

@@ -14,7 +14,8 @@ impl Stemmer {
}
impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
@@ -26,7 +27,8 @@ impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
pub struct StemmerTokenStream<TailTokenStream>
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
tail: TailTokenStream,
stemmer: rust_stemmers::Stemmer,
@@ -45,7 +47,7 @@ impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
fn advance(&mut self) -> bool {
if self.tail.advance() {
// TODO remove allocation
// TODO remove allocation
let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
self.token_mut().text.clear();
self.token_mut().text.push_str(&stemmed_str);
@@ -57,11 +59,13 @@ impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
}
impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
fn wrap(stemmer: rust_stemmers::Stemmer,
tail: TailTokenStream)
-> StemmerTokenStream<TailTokenStream> {
fn wrap(
stemmer: rust_stemmers::Stemmer,
tail: TailTokenStream,
) -> StemmerTokenStream<TailTokenStream> {
StemmerTokenStream {
tail: tail,
stemmer: stemmer,

View File

@@ -9,11 +9,14 @@ pub struct TokenStreamChain<TTokenStream: TokenStream> {
}
impl<'a, TTokenStream> TokenStreamChain<TTokenStream>
where TTokenStream: TokenStream {
pub fn new(offsets: Vec<usize>,
token_streams: Vec<TTokenStream>) -> TokenStreamChain<TTokenStream> {
impl<'a, TTokenStream> TokenStreamChain<TTokenStream>
where
TTokenStream: TokenStream,
{
pub fn new(
offsets: Vec<usize>,
token_streams: Vec<TTokenStream>,
) -> TokenStreamChain<TTokenStream> {
TokenStreamChain {
offsets: offsets,
stream_idx: 0,
@@ -25,7 +28,9 @@ impl<'a, TTokenStream> TokenStreamChain<TTokenStream>
}
impl<'a, TTokenStream> TokenStream for TokenStreamChain<TTokenStream>
where TTokenStream: TokenStream {
where
TTokenStream: TokenStream,
{
fn advance(&mut self) -> bool {
while self.stream_idx < self.token_streams.len() {
let token_stream = &mut self.token_streams[self.stream_idx];
@@ -38,8 +43,7 @@ impl<'a, TTokenStream> TokenStream for TokenStreamChain<TTokenStream>
self.token.text.clear();
self.token.text.push_str(token.text.as_str());
return true;
}
else {
} else {
self.stream_idx += 1;
self.position_shift = self.token.position + 2;
}

View File

@@ -11,7 +11,7 @@ pub struct Token {
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
/// Position, expressed in number of tokens.
@@ -43,7 +43,6 @@ impl Default for Token {
///
/// This API may change to use associated types.
pub trait Tokenizer<'a>: Sized + Clone {
/// Type associated with the resulting token stream.
type TokenStreamImpl: TokenStream;
@@ -71,7 +70,8 @@ pub trait Tokenizer<'a>: Sized + Clone {
/// ```
///
fn filter<NewFilter>(self, new_filter: NewFilter) -> ChainTokenizer<NewFilter, Self>
where NewFilter: TokenFilter<<Self as Tokenizer<'a>>::TokenStreamImpl>
where
NewFilter: TokenFilter<<Self as Tokenizer<'a>>::TokenStreamImpl>,
{
ChainTokenizer {
head: new_filter,
@@ -87,9 +87,14 @@ pub trait BoxedTokenizer: Send + Sync {
}
#[derive(Clone)]
struct BoxableTokenizer<A>(A) where A: for <'a> Tokenizer<'a> + Send + Sync;
struct BoxableTokenizer<A>(A)
where
A: for<'a> Tokenizer<'a> + Send + Sync;
impl<A> BoxedTokenizer for BoxableTokenizer<A> where A: 'static + Send + Sync + for <'a> Tokenizer<'a> {
impl<A> BoxedTokenizer for BoxableTokenizer<A>
where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
fn token_stream<'a>(&self, text: &'a str) -> Box<TokenStream + 'a> {
box self.0.token_stream(text)
}
@@ -98,20 +103,15 @@ impl<A> BoxedTokenizer for BoxableTokenizer<A> where A: 'static + Send + Sync +
assert!(texts.len() > 0);
if texts.len() == 1 {
box self.0.token_stream(texts[0])
}
else {
let mut offsets = vec!();
} else {
let mut offsets = vec![];
let mut total_offset = 0;
for &text in texts {
offsets.push(total_offset);
total_offset += text.len();
}
let token_streams: Vec<_> = texts
.iter()
.map(|text| {
self.0.token_stream(text)
})
.collect();
let token_streams: Vec<_> =
texts.iter().map(|text| self.0.token_stream(text)).collect();
box TokenStreamChain::new(offsets, token_streams)
}
}
@@ -122,7 +122,9 @@ impl<A> BoxedTokenizer for BoxableTokenizer<A> where A: 'static + Send + Sync +
}
pub fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer>
where A: 'static + Send + Sync + for <'a> Tokenizer<'a> {
where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
box BoxableTokenizer(a)
}
@@ -211,13 +213,14 @@ pub struct ChainTokenizer<HeadTokenFilterFactory, TailTokenizer> {
impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a>
for ChainTokenizer<HeadTokenFilterFactory, TailTokenizer>
where HeadTokenFilterFactory: TokenFilter<TailTokenizer::TokenStreamImpl>,
TailTokenizer: Tokenizer<'a>
where
HeadTokenFilterFactory: TokenFilter<TailTokenizer::TokenStreamImpl>,
TailTokenizer: Tokenizer<'a>,
{
type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
let tail_token_stream = self.tail.token_stream(text );
let tail_token_stream = self.tail.token_stream(text);
self.head.transform(tail_token_stream)
}
}

View File

@@ -18,15 +18,18 @@ use tokenizer::Stemmer;
/// By default, it is populated with the following tokenizers.
///
/// * raw : does not process nor tokenize the text.
/// * default : Chops the text according to whitespace and punctuation, removes tokens that are too long, and lowercases them
/// * default : Chops the text according to whitespace and
/// punctuation, removes tokens that are too long, and lowercases them
#[derive(Clone)]
pub struct TokenizerManager {
tokenizers: Arc< RwLock<HashMap<String, Box<BoxedTokenizer> >> >
tokenizers: Arc<RwLock<HashMap<String, Box<BoxedTokenizer>>>>,
}
impl TokenizerManager {
pub fn register<A>(&self, tokenizer_name: &str, tokenizer: A)
where A: 'static + Send + Sync + for <'a> Tokenizer<'a> {
where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
let boxed_tokenizer = box_tokenizer(tokenizer);
self.tokenizers
.write()
@@ -39,9 +42,7 @@ impl TokenizerManager {
.read()
.expect("Acquiring the lock should never fail")
.get(tokenizer_name)
.map(|boxed_tokenizer| {
boxed_tokenizer.boxed_clone()
})
.map(|boxed_tokenizer| boxed_tokenizer.boxed_clone())
}
}
@@ -52,27 +53,22 @@ impl Default for TokenizerManager {
/// - en_stem
/// - ja
fn default() -> TokenizerManager {
let manager = TokenizerManager {
tokenizers: Arc::new(RwLock::new(HashMap::new()))
};
manager.register("raw",
RawTokenizer
let manager = TokenizerManager { tokenizers: Arc::new(RwLock::new(HashMap::new())) };
manager.register("raw", RawTokenizer);
manager.register(
"default",
SimpleTokenizer.filter(RemoveLongFilter::limit(40)).filter(
LowerCaser,
),
);
manager.register("default",
manager.register(
"en_stem",
SimpleTokenizer
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new()),
);
manager.register("en_stem",
SimpleTokenizer
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new())
);
manager.register("ja",
JapaneseTokenizer
.filter(RemoveLongFilter::limit(40))
);
manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40)));
manager
}
}
}