mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 13:10:41 +00:00
cargo fmt
This commit is contained in:
@@ -36,12 +36,12 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
// Our first field is title.
|
||||
// We want full-text search for it, and we also want
|
||||
// We want full-text search for it, and we also want
|
||||
// to be able to retrieve the document after the search.
|
||||
//
|
||||
//
|
||||
// TEXT | STORED is some syntactic sugar to describe
|
||||
// that.
|
||||
//
|
||||
//
|
||||
// `TEXT` means the field should be tokenized and indexed,
|
||||
// along with its term frequency and term positions.
|
||||
//
|
||||
@@ -52,11 +52,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
schema_builder.add_text_field("title", TEXT | STORED);
|
||||
|
||||
// Our second field is body.
|
||||
// We want full-text search for it, but we do not
|
||||
// We want full-text search for it, but we do not
|
||||
// need to be able to retrieve it
|
||||
// for our application.
|
||||
// for our application.
|
||||
//
|
||||
// We can make our index lighter and
|
||||
// We can make our index lighter and
|
||||
// by omitting `STORED` flag.
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ pub struct Index {
|
||||
directory: ManagedDirectory,
|
||||
schema: Schema,
|
||||
searcher_pool: Arc<Pool<Searcher>>,
|
||||
tokenizers: TokenizerManager
|
||||
tokenizers: TokenizerManager,
|
||||
}
|
||||
|
||||
|
||||
@@ -259,7 +259,7 @@ impl Clone for Index {
|
||||
directory: self.directory.clone(),
|
||||
schema: self.schema.clone(),
|
||||
searcher_pool: self.searcher_pool.clone(),
|
||||
tokenizers: self.tokenizers.clone()
|
||||
tokenizers: self.tokenizers.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,11 +138,7 @@ impl InvertedIndexReader {
|
||||
/// For instance, requesting `IndexRecordOption::Freq` for a
|
||||
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
|
||||
/// with `DocId`s and frequencies.
|
||||
pub fn read_postings(
|
||||
&self,
|
||||
term: &Term,
|
||||
option: IndexRecordOption,
|
||||
) -> Option<SegmentPostings> {
|
||||
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
|
||||
@@ -36,7 +36,6 @@ pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
|
||||
}
|
||||
|
||||
impl Segment {
|
||||
|
||||
/// Returns the index the segment belongs to.
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
|
||||
@@ -269,10 +269,8 @@ impl IndexMerger {
|
||||
let field_entry = self.schema.get_field_entry(indexed_field);
|
||||
|
||||
// ... set the segment postings option for the new field.
|
||||
let segment_postings_option = field_entry
|
||||
.field_type()
|
||||
.get_index_record_option()
|
||||
.expect(
|
||||
let segment_postings_option =
|
||||
field_entry.field_type().get_index_record_option().expect(
|
||||
"Encountered a field that is not supposed to be
|
||||
indexed. Have you modified the schema?",
|
||||
);
|
||||
@@ -405,9 +403,11 @@ mod tests {
|
||||
fn test_index_merger_no_deletes() {
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let text_fieldtype = schema::TextOptions::default()
|
||||
.set_indexing_options(TextFieldIndexing::default()
|
||||
.set_tokenizer("default")
|
||||
.set_index_option(IndexRecordOption::WithFreqs))
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_tokenizer("default")
|
||||
.set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let score_fieldtype = schema::IntOptions::default().set_fast();
|
||||
@@ -539,9 +539,9 @@ mod tests {
|
||||
fn test_index_merger_with_deletes() {
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let text_fieldtype = schema::TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::WithFreqs))
|
||||
.set_indexing_options(TextFieldIndexing::default().set_index_option(
|
||||
IndexRecordOption::WithFreqs,
|
||||
))
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let score_fieldtype = schema::IntOptions::default().set_fast();
|
||||
|
||||
@@ -31,7 +31,7 @@ pub struct SegmentWriter<'a> {
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FastFieldsWriter,
|
||||
doc_opstamps: Vec<u64>,
|
||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>
|
||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
||||
}
|
||||
|
||||
|
||||
@@ -57,40 +57,40 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// the flushing behavior as a buffer limit
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(heap: &'a Heap,
|
||||
table_bits: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema)
|
||||
-> Result<SegmentWriter<'a>> {
|
||||
pub fn for_segment(
|
||||
heap: &'a Heap,
|
||||
table_bits: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema,
|
||||
) -> Result<SegmentWriter<'a>> {
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
|
||||
let tokenizers = schema.fields()
|
||||
let tokenizers = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| field_entry.field_type())
|
||||
.map(|field_type| {
|
||||
match field_type {
|
||||
&FieldType::Str(ref text_options) => {
|
||||
text_options
|
||||
.get_indexing_options()
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
segment.index().tokenizers().get(tokenizer_name)
|
||||
})
|
||||
}
|
||||
_ => None,
|
||||
.map(|field_type| match field_type {
|
||||
&FieldType::Str(ref text_options) => {
|
||||
text_options.get_indexing_options().and_then(
|
||||
|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
segment.index().tokenizers().get(tokenizer_name)
|
||||
},
|
||||
)
|
||||
}
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
Ok(SegmentWriter {
|
||||
heap: heap,
|
||||
max_doc: 0,
|
||||
multifield_postings: multifield_postings,
|
||||
fieldnorms_writer: create_fieldnorms_writer(schema),
|
||||
segment_serializer: segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
tokenizers: tokenizers,
|
||||
})
|
||||
heap: heap,
|
||||
max_doc: 0,
|
||||
multifield_postings: multifield_postings,
|
||||
fieldnorms_writer: create_fieldnorms_writer(schema),
|
||||
segment_serializer: segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
tokenizers: tokenizers,
|
||||
})
|
||||
}
|
||||
|
||||
/// Lay on disk the current content of the `SegmentWriter`
|
||||
@@ -147,23 +147,25 @@ impl<'a> SegmentWriter<'a> {
|
||||
FieldType::Str(_) => {
|
||||
let num_tokens =
|
||||
if let Some(ref mut tokenizer) = self.tokenizers[field.0 as usize] {
|
||||
let texts: Vec<&str> = field_values.iter()
|
||||
.flat_map(|field_value| {
|
||||
match field_value.value() {
|
||||
&Value::Str(ref text) => Some(text.as_str()),
|
||||
_ => None
|
||||
}
|
||||
let texts: Vec<&str> = field_values
|
||||
.iter()
|
||||
.flat_map(|field_value| match field_value.value() {
|
||||
&Value::Str(ref text) => Some(text.as_str()),
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
|
||||
self.multifield_postings.index_text(doc_id, field, &mut token_stream)
|
||||
}
|
||||
else {
|
||||
self.multifield_postings.index_text(
|
||||
doc_id,
|
||||
field,
|
||||
&mut token_stream,
|
||||
)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
self.fieldnorms_writer
|
||||
.get_field_writer(field)
|
||||
.map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64));
|
||||
self.fieldnorms_writer.get_field_writer(field).map(
|
||||
|field_norms_writer| field_norms_writer.add_val(num_tokens as u64),
|
||||
);
|
||||
}
|
||||
FieldType::U64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
|
||||
@@ -22,9 +22,8 @@ fn posting_from_field_entry<'a>(
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
text_options
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| {
|
||||
match indexing_options.index_option() {
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| match indexing_options.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
@@ -34,11 +33,10 @@ fn posting_from_field_entry<'a>(
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|| {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
})
|
||||
})
|
||||
.unwrap_or_else(|| {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
})
|
||||
}
|
||||
FieldType::U64(_) |
|
||||
FieldType::I64(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap),
|
||||
@@ -149,27 +147,29 @@ pub trait PostingsWriter {
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(&self,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()>;
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
|
||||
/// Tokenize a text and subscribe all of its tokens.
|
||||
fn index_text<'a>(&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut TokenStream,
|
||||
heap: &Heap)
|
||||
-> u32 {
|
||||
fn index_text<'a>(
|
||||
&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut TokenStream,
|
||||
heap: &Heap,
|
||||
) -> u32 {
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
let mut sink = |token: &Token| {
|
||||
term.set_text(token.text.as_str());
|
||||
self.suscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
};
|
||||
|
||||
|
||||
token_stream.process(&mut sink)
|
||||
}
|
||||
}
|
||||
@@ -197,7 +197,6 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
|
||||
fn suscribe(
|
||||
&mut self,
|
||||
term_index: &mut HashMap,
|
||||
|
||||
@@ -509,10 +509,8 @@ mod tests {
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap();
|
||||
let mut block_segments = inverted_index.read_block_postings_from_terminfo(
|
||||
&term_info,
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let mut block_segments =
|
||||
inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic);
|
||||
let mut offset: u32 = 0u32;
|
||||
// checking that the block before calling advance is empty
|
||||
assert!(block_segments.docs().is_empty());
|
||||
|
||||
@@ -133,9 +133,11 @@ impl<'a> FieldSerializer<'a> {
|
||||
FieldType::Str(ref text_options) => {
|
||||
if let Some(ref text_indexing_options) = text_options.get_indexing_options() {
|
||||
let index_option = text_indexing_options.index_option();
|
||||
(index_option.is_termfreq_enabled(), index_option.is_position_enabled())
|
||||
}
|
||||
else {
|
||||
(
|
||||
index_option.is_termfreq_enabled(),
|
||||
index_option.is_position_enabled(),
|
||||
)
|
||||
} else {
|
||||
(false, false)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,9 +85,11 @@ impl QueryParser {
|
||||
/// * schema - index Schema
|
||||
/// * default_fields - fields used to search if no field is specifically defined
|
||||
/// in the query.
|
||||
pub fn new(schema: Schema,
|
||||
default_fields: Vec<Field>,
|
||||
tokenizer_manager: TokenizerManager) -> QueryParser {
|
||||
pub fn new(
|
||||
schema: Schema,
|
||||
default_fields: Vec<Field>,
|
||||
tokenizer_manager: TokenizerManager,
|
||||
) -> QueryParser {
|
||||
QueryParser {
|
||||
schema,
|
||||
default_fields,
|
||||
@@ -100,12 +102,8 @@ impl QueryParser {
|
||||
/// * an index
|
||||
/// * a set of default - fields used to search if no field is specifically defined
|
||||
/// in the query.
|
||||
pub fn for_index(index: Index,
|
||||
default_fields: Vec<Field>) -> QueryParser {
|
||||
QueryParser::new(
|
||||
index.schema(),
|
||||
default_fields,
|
||||
index.tokenizers().clone())
|
||||
pub fn for_index(index: Index, default_fields: Vec<Field>) -> QueryParser {
|
||||
QueryParser::new(index.schema(), default_fields, index.tokenizers().clone())
|
||||
}
|
||||
|
||||
/// Set the default way to compose queries to a conjunction.
|
||||
@@ -181,17 +179,20 @@ impl QueryParser {
|
||||
Ok(Some(LogicalLiteral::Term(term)))
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
if let Some(option) = str_options.get_indexing_options() {
|
||||
let mut tokenizer = self.tokenizer_manager
|
||||
.get(option.tokenizer())
|
||||
.ok_or_else(|| {
|
||||
QueryParserError::UnknownTokenizer(field_entry.name().to_string(), option.tokenizer().to_string())
|
||||
})?;
|
||||
if let Some(option) = str_options.get_indexing_options() {
|
||||
let mut tokenizer = self.tokenizer_manager.get(option.tokenizer()).ok_or_else(
|
||||
|| {
|
||||
QueryParserError::UnknownTokenizer(
|
||||
field_entry.name().to_string(),
|
||||
option.tokenizer().to_string(),
|
||||
)
|
||||
},
|
||||
)?;
|
||||
let mut terms: Vec<Term> = Vec::new();
|
||||
let mut token_stream = tokenizer.token_stream(phrase);
|
||||
token_stream.process(&mut |token| {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
terms.push(term);
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
terms.push(term);
|
||||
});
|
||||
if terms.is_empty() {
|
||||
Ok(None)
|
||||
@@ -202,10 +203,11 @@ impl QueryParser {
|
||||
} else {
|
||||
Ok(Some(LogicalLiteral::Phrase(terms)))
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
// This should have been seen earlier really.
|
||||
Err(QueryParserError::FieldNotIndexed(field_entry.name().to_string()))
|
||||
Err(QueryParserError::FieldNotIndexed(
|
||||
field_entry.name().to_string(),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -238,13 +240,11 @@ impl QueryParser {
|
||||
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
|
||||
}
|
||||
UserInputAST::Not(subquery) => {
|
||||
let (occur, logical_sub_queries) =
|
||||
self.compute_logical_ast_with_occur(*subquery)?;
|
||||
let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
|
||||
Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries))
|
||||
}
|
||||
UserInputAST::Must(subquery) => {
|
||||
let (occur, logical_sub_queries) =
|
||||
self.compute_logical_ast_with_occur(*subquery)?;
|
||||
let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
|
||||
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
|
||||
}
|
||||
UserInputAST::Leaf(literal) => {
|
||||
|
||||
@@ -46,9 +46,11 @@ impl FieldType {
|
||||
pub fn get_index_record_option(&self) -> Option<IndexRecordOption> {
|
||||
match *self {
|
||||
FieldType::Str(ref text_options) => {
|
||||
text_options
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| indexing_options.index_option())
|
||||
text_options.get_indexing_options().map(
|
||||
|indexing_options| {
|
||||
indexing_options.index_option()
|
||||
},
|
||||
)
|
||||
}
|
||||
FieldType::U64(ref int_options) |
|
||||
FieldType::I64(ref int_options) => {
|
||||
|
||||
@@ -5,10 +5,13 @@
|
||||
/// It is both used to:
|
||||
///
|
||||
/// * describe in the schema the amount of information
|
||||
/// that should be retained during indexing (See [TextFieldIndexing.html.set_index_option](../schema/struct.TextFieldIndexing.html#method.set_index_option))
|
||||
/// that should be retained during indexing (See
|
||||
/// [TextFieldIndexing.html.set_index_option](
|
||||
/// ../schema/struct.TextFieldIndexing.html#method.set_index_option))
|
||||
/// * to request for a given
|
||||
/// amount of information to be decoded as one goes through a posting list.
|
||||
/// (See [InvertedIndexReader.read_postings](../struct.InvertedIndexReader.html#method.read_postings))
|
||||
/// (See [InvertedIndexReader.read_postings](
|
||||
/// ../struct.InvertedIndexReader.html#method.read_postings))
|
||||
///
|
||||
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum IndexRecordOption {
|
||||
@@ -63,4 +66,4 @@ impl IndexRecordOption {
|
||||
IndexRecordOption::WithFreqsAndPositions => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,8 +40,8 @@ let schema = schema_builder.build();
|
||||
We can split the problem of generating a search result page into two phases :
|
||||
|
||||
* identifying the list of 10 or so documents to be displayed (Conceptually `query -> doc_ids[]`)
|
||||
* for each of these documents, retrieving the information required to generate the search results page.
|
||||
(`doc_ids[] -> Document[]`)
|
||||
* for each of these documents, retrieving the information required to generate
|
||||
the search results page. (`doc_ids[] -> Document[]`)
|
||||
|
||||
In the first phase, the ability to search for documents by the given field is determined by the
|
||||
[`TextIndexingOptions`](enum.TextIndexingOptions.html) of our [`TextOptions`]
|
||||
|
||||
@@ -224,22 +224,16 @@ impl Schema {
|
||||
match *json_value {
|
||||
JsonValue::Array(ref json_items) => {
|
||||
for json_item in json_items {
|
||||
let value =
|
||||
field_type
|
||||
.value_from_json(json_item)
|
||||
.map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
})?;
|
||||
let value = field_type.value_from_json(json_item).map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
})?;
|
||||
doc.add(FieldValue::new(field, value));
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let value =
|
||||
field_type
|
||||
.value_from_json(json_value)
|
||||
.map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
})?;
|
||||
let value = field_type.value_from_json(json_value).map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
})?;
|
||||
doc.add(FieldValue::new(field, value));
|
||||
}
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ impl Default for TextOptions {
|
||||
}
|
||||
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]
|
||||
pub struct TextFieldIndexing {
|
||||
record: IndexRecordOption,
|
||||
tokenizer: Cow<'static, str>,
|
||||
@@ -88,22 +88,20 @@ impl TextFieldIndexing {
|
||||
|
||||
/// The field will be untokenized and indexed
|
||||
pub const STRING: TextOptions = TextOptions {
|
||||
indexing: Some(
|
||||
TextFieldIndexing {
|
||||
tokenizer: Cow::Borrowed("raw"),
|
||||
record: IndexRecordOption::Basic,
|
||||
}),
|
||||
indexing: Some(TextFieldIndexing {
|
||||
tokenizer: Cow::Borrowed("raw"),
|
||||
record: IndexRecordOption::Basic,
|
||||
}),
|
||||
stored: false,
|
||||
};
|
||||
|
||||
|
||||
/// The field will be tokenized and indexed
|
||||
pub const TEXT: TextOptions = TextOptions {
|
||||
indexing: Some(
|
||||
TextFieldIndexing {
|
||||
tokenizer: Cow::Borrowed("default"),
|
||||
record: IndexRecordOption::WithFreqsAndPositions,
|
||||
}),
|
||||
indexing: Some(TextFieldIndexing {
|
||||
tokenizer: Cow::Borrowed("default"),
|
||||
record: IndexRecordOption::WithFreqsAndPositions,
|
||||
}),
|
||||
stored: false,
|
||||
};
|
||||
|
||||
@@ -149,7 +147,10 @@ mod tests {
|
||||
match field_entry.field_type() {
|
||||
&FieldType::Str(ref text_options) => {
|
||||
assert!(text_options.get_indexing_options().is_some());
|
||||
assert_eq!(text_options.get_indexing_options().unwrap().tokenizer(), "default");
|
||||
assert_eq!(
|
||||
text_options.get_indexing_options().unwrap().tokenizer(),
|
||||
"default"
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
panic!("");
|
||||
@@ -164,5 +165,3 @@ mod tests {
|
||||
assert!(IndexRecordOption::WithFreqs > IndexRecordOption::Basic);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,8 @@ use super::{TokenFilter, TokenStream, Token};
|
||||
pub struct LowerCaser;
|
||||
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for LowerCaser
|
||||
where TailTokenStream: TokenStream
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = LowerCaserTokenStream<TailTokenStream>;
|
||||
|
||||
@@ -15,7 +16,8 @@ impl<TailTokenStream> TokenFilter<TailTokenStream> for LowerCaser
|
||||
}
|
||||
|
||||
pub struct LowerCaserTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
@@ -42,7 +44,8 @@ impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
|
||||
}
|
||||
|
||||
impl<TailTokenStream> LowerCaserTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
|
||||
LowerCaserTokenStream { tail: tail }
|
||||
|
||||
@@ -163,7 +163,9 @@ mod test {
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| { tokens.push(token.text.clone()); };
|
||||
en_tokenizer.token_stream("Hello, happy tax payer!").process(&mut add_token);
|
||||
en_tokenizer
|
||||
.token_stream("Hello, happy tax payer!")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 1);
|
||||
assert_eq!(&tokens[0], "Hello, happy tax payer!");
|
||||
@@ -178,7 +180,9 @@ mod test {
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| { tokens.push(token.text.clone()); };
|
||||
en_tokenizer.token_stream("Hello, happy tax payer!").process(&mut add_token);
|
||||
en_tokenizer
|
||||
.token_stream("Hello, happy tax payer!")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 4);
|
||||
assert_eq!(&tokens[0], "hello");
|
||||
@@ -191,11 +195,13 @@ mod test {
|
||||
fn test_jp_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let en_tokenizer = tokenizer_manager.get("ja").unwrap();
|
||||
|
||||
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| { tokens.push(token.text.clone()); };
|
||||
en_tokenizer.token_stream("野菜食べないとやばい!").process(&mut add_token);
|
||||
en_tokenizer
|
||||
.token_stream("野菜食べないとやばい!")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 5);
|
||||
assert_eq!(&tokens[0], "野菜");
|
||||
|
||||
@@ -18,7 +18,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
|
||||
offset_from: 0,
|
||||
offset_to: text.len(),
|
||||
position: 0,
|
||||
text: text.to_string()
|
||||
text: text.to_string(),
|
||||
};
|
||||
RawTokenStream {
|
||||
token: token,
|
||||
@@ -32,8 +32,7 @@ impl TokenStream for RawTokenStream {
|
||||
if self.has_token {
|
||||
self.has_token = false;
|
||||
true
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::{TokenFilter, TokenStream, Token};
|
||||
|
||||
|
||||
/// `RemoveLongFilter` removes tokens that are longer
|
||||
/// `RemoveLongFilter` removes tokens that are longer
|
||||
/// than a given number of bytes (in UTF-8 representation).
|
||||
///
|
||||
/// It is especially useful when indexing unconstrained content.
|
||||
@@ -19,15 +19,17 @@ impl RemoveLongFilter {
|
||||
}
|
||||
|
||||
impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
token.text.len() < self.token_length_limit
|
||||
}
|
||||
|
||||
fn wrap(token_length_limit: usize,
|
||||
tail: TailTokenStream)
|
||||
-> RemoveLongFilterStream<TailTokenStream> {
|
||||
fn wrap(
|
||||
token_length_limit: usize,
|
||||
tail: TailTokenStream,
|
||||
) -> RemoveLongFilterStream<TailTokenStream> {
|
||||
RemoveLongFilterStream {
|
||||
token_length_limit: token_length_limit,
|
||||
tail: tail,
|
||||
@@ -37,7 +39,8 @@ impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
|
||||
|
||||
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
|
||||
where TailTokenStream: TokenStream
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;
|
||||
|
||||
@@ -47,7 +50,8 @@ impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
|
||||
}
|
||||
|
||||
pub struct RemoveLongFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
token_length_limit: usize,
|
||||
tail: TailTokenStream,
|
||||
|
||||
@@ -14,7 +14,8 @@ impl Stemmer {
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
|
||||
where TailTokenStream: TokenStream
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
|
||||
|
||||
@@ -26,7 +27,8 @@ impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
|
||||
|
||||
|
||||
pub struct StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
tail: TailTokenStream,
|
||||
stemmer: rust_stemmers::Stemmer,
|
||||
@@ -45,7 +47,7 @@ impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.tail.advance() {
|
||||
// TODO remove allocation
|
||||
// TODO remove allocation
|
||||
let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
|
||||
self.token_mut().text.clear();
|
||||
self.token_mut().text.push_str(&stemmed_str);
|
||||
@@ -57,11 +59,13 @@ impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
|
||||
}
|
||||
|
||||
impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn wrap(stemmer: rust_stemmers::Stemmer,
|
||||
tail: TailTokenStream)
|
||||
-> StemmerTokenStream<TailTokenStream> {
|
||||
fn wrap(
|
||||
stemmer: rust_stemmers::Stemmer,
|
||||
tail: TailTokenStream,
|
||||
) -> StemmerTokenStream<TailTokenStream> {
|
||||
StemmerTokenStream {
|
||||
tail: tail,
|
||||
stemmer: stemmer,
|
||||
|
||||
@@ -9,11 +9,14 @@ pub struct TokenStreamChain<TTokenStream: TokenStream> {
|
||||
}
|
||||
|
||||
|
||||
impl<'a, TTokenStream> TokenStreamChain<TTokenStream>
|
||||
where TTokenStream: TokenStream {
|
||||
|
||||
pub fn new(offsets: Vec<usize>,
|
||||
token_streams: Vec<TTokenStream>) -> TokenStreamChain<TTokenStream> {
|
||||
impl<'a, TTokenStream> TokenStreamChain<TTokenStream>
|
||||
where
|
||||
TTokenStream: TokenStream,
|
||||
{
|
||||
pub fn new(
|
||||
offsets: Vec<usize>,
|
||||
token_streams: Vec<TTokenStream>,
|
||||
) -> TokenStreamChain<TTokenStream> {
|
||||
TokenStreamChain {
|
||||
offsets: offsets,
|
||||
stream_idx: 0,
|
||||
@@ -25,7 +28,9 @@ impl<'a, TTokenStream> TokenStreamChain<TTokenStream>
|
||||
}
|
||||
|
||||
impl<'a, TTokenStream> TokenStream for TokenStreamChain<TTokenStream>
|
||||
where TTokenStream: TokenStream {
|
||||
where
|
||||
TTokenStream: TokenStream,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.stream_idx < self.token_streams.len() {
|
||||
let token_stream = &mut self.token_streams[self.stream_idx];
|
||||
@@ -38,8 +43,7 @@ impl<'a, TTokenStream> TokenStream for TokenStreamChain<TTokenStream>
|
||||
self.token.text.clear();
|
||||
self.token.text.push_str(token.text.as_str());
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
self.stream_idx += 1;
|
||||
self.position_shift = self.token.position + 2;
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ pub struct Token {
|
||||
/// Offsets shall not be modified by token filters.
|
||||
pub offset_from: usize,
|
||||
/// Offset (byte index) of the last character of the token + 1.
|
||||
/// The text that generated the token should be obtained by
|
||||
/// The text that generated the token should be obtained by
|
||||
/// &text[token.offset_from..token.offset_to]
|
||||
pub offset_to: usize,
|
||||
/// Position, expressed in number of tokens.
|
||||
@@ -43,7 +43,6 @@ impl Default for Token {
|
||||
///
|
||||
/// This API may change to use associated types.
|
||||
pub trait Tokenizer<'a>: Sized + Clone {
|
||||
|
||||
/// Type associated to the resulting token stream.
|
||||
type TokenStreamImpl: TokenStream;
|
||||
|
||||
@@ -71,7 +70,8 @@ pub trait Tokenizer<'a>: Sized + Clone {
|
||||
/// ```
|
||||
///
|
||||
fn filter<NewFilter>(self, new_filter: NewFilter) -> ChainTokenizer<NewFilter, Self>
|
||||
where NewFilter: TokenFilter<<Self as Tokenizer<'a>>::TokenStreamImpl>
|
||||
where
|
||||
NewFilter: TokenFilter<<Self as Tokenizer<'a>>::TokenStreamImpl>,
|
||||
{
|
||||
ChainTokenizer {
|
||||
head: new_filter,
|
||||
@@ -87,9 +87,14 @@ pub trait BoxedTokenizer: Send + Sync {
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct BoxableTokenizer<A>(A) where A: for <'a> Tokenizer<'a> + Send + Sync;
|
||||
struct BoxableTokenizer<A>(A)
|
||||
where
|
||||
A: for<'a> Tokenizer<'a> + Send + Sync;
|
||||
|
||||
impl<A> BoxedTokenizer for BoxableTokenizer<A> where A: 'static + Send + Sync + for <'a> Tokenizer<'a> {
|
||||
impl<A> BoxedTokenizer for BoxableTokenizer<A>
|
||||
where
|
||||
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
|
||||
{
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Box<TokenStream + 'a> {
|
||||
box self.0.token_stream(text)
|
||||
}
|
||||
@@ -98,20 +103,15 @@ impl<A> BoxedTokenizer for BoxableTokenizer<A> where A: 'static + Send + Sync +
|
||||
assert!(texts.len() > 0);
|
||||
if texts.len() == 1 {
|
||||
box self.0.token_stream(texts[0])
|
||||
}
|
||||
else {
|
||||
let mut offsets = vec!();
|
||||
} else {
|
||||
let mut offsets = vec![];
|
||||
let mut total_offset = 0;
|
||||
for &text in texts {
|
||||
offsets.push(total_offset);
|
||||
total_offset += text.len();
|
||||
}
|
||||
let token_streams: Vec<_> = texts
|
||||
.iter()
|
||||
.map(|text| {
|
||||
self.0.token_stream(text)
|
||||
})
|
||||
.collect();
|
||||
let token_streams: Vec<_> =
|
||||
texts.iter().map(|text| self.0.token_stream(text)).collect();
|
||||
box TokenStreamChain::new(offsets, token_streams)
|
||||
}
|
||||
}
|
||||
@@ -122,7 +122,9 @@ impl<A> BoxedTokenizer for BoxableTokenizer<A> where A: 'static + Send + Sync +
|
||||
}
|
||||
|
||||
pub fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer>
|
||||
where A: 'static + Send + Sync + for <'a> Tokenizer<'a> {
|
||||
where
|
||||
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
|
||||
{
|
||||
box BoxableTokenizer(a)
|
||||
}
|
||||
|
||||
@@ -211,13 +213,14 @@ pub struct ChainTokenizer<HeadTokenFilterFactory, TailTokenizer> {
|
||||
|
||||
impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a>
|
||||
for ChainTokenizer<HeadTokenFilterFactory, TailTokenizer>
|
||||
where HeadTokenFilterFactory: TokenFilter<TailTokenizer::TokenStreamImpl>,
|
||||
TailTokenizer: Tokenizer<'a>
|
||||
where
|
||||
HeadTokenFilterFactory: TokenFilter<TailTokenizer::TokenStreamImpl>,
|
||||
TailTokenizer: Tokenizer<'a>,
|
||||
{
|
||||
type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
|
||||
|
||||
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
let tail_token_stream = self.tail.token_stream(text );
|
||||
let tail_token_stream = self.tail.token_stream(text);
|
||||
self.head.transform(tail_token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,15 +18,18 @@ use tokenizer::Stemmer;
|
||||
/// By default, it is populated with the following tokenizers.
|
||||
///
|
||||
/// * raw : does not process nor tokenize the text.
|
||||
/// * default : Chops the text on according to whitespace and punctuation, removes tokens that are too long, lowercases
|
||||
/// * default : Chops the text according to whitespace and
|
||||
/// punctuation, removes tokens that are too long, lowercases
|
||||
#[derive(Clone)]
|
||||
pub struct TokenizerManager {
|
||||
tokenizers: Arc< RwLock<HashMap<String, Box<BoxedTokenizer> >> >
|
||||
tokenizers: Arc<RwLock<HashMap<String, Box<BoxedTokenizer>>>>,
|
||||
}
|
||||
|
||||
impl TokenizerManager {
|
||||
pub fn register<A>(&self, tokenizer_name: &str, tokenizer: A)
|
||||
where A: 'static + Send + Sync + for <'a> Tokenizer<'a> {
|
||||
where
|
||||
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
|
||||
{
|
||||
let boxed_tokenizer = box_tokenizer(tokenizer);
|
||||
self.tokenizers
|
||||
.write()
|
||||
@@ -39,9 +42,7 @@ impl TokenizerManager {
|
||||
.read()
|
||||
.expect("Acquiring the lock should never fail")
|
||||
.get(tokenizer_name)
|
||||
.map(|boxed_tokenizer| {
|
||||
boxed_tokenizer.boxed_clone()
|
||||
})
|
||||
.map(|boxed_tokenizer| boxed_tokenizer.boxed_clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,27 +53,22 @@ impl Default for TokenizerManager {
|
||||
/// - en_stem
|
||||
/// - ja
|
||||
fn default() -> TokenizerManager {
|
||||
let manager = TokenizerManager {
|
||||
tokenizers: Arc::new(RwLock::new(HashMap::new()))
|
||||
};
|
||||
manager.register("raw",
|
||||
RawTokenizer
|
||||
let manager = TokenizerManager { tokenizers: Arc::new(RwLock::new(HashMap::new())) };
|
||||
manager.register("raw", RawTokenizer);
|
||||
manager.register(
|
||||
"default",
|
||||
SimpleTokenizer.filter(RemoveLongFilter::limit(40)).filter(
|
||||
LowerCaser,
|
||||
),
|
||||
);
|
||||
manager.register("default",
|
||||
manager.register(
|
||||
"en_stem",
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new()),
|
||||
);
|
||||
manager.register("en_stem",
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new())
|
||||
);
|
||||
manager.register("ja",
|
||||
JapaneseTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
);
|
||||
manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40)));
|
||||
manager
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user