Cargo Format (#420)

This commit is contained in:
Paul Masurel
2018-09-15 07:44:22 +09:00
committed by GitHub
parent 0ba1cf93f7
commit 37e4280c0a
71 changed files with 697 additions and 650 deletions

View File

@@ -10,7 +10,6 @@
// - search for the best document matchings "sea whale"
// - retrieve the best document original content.
extern crate tempdir;
// ---
@@ -235,9 +234,7 @@ fn main() -> tantivy::Result<()> {
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())
}
use tempdir::TempDir;

View File

@@ -3,7 +3,6 @@
// In this example, we'll see how to define a tokenizer pipeline
// by aligning a bunch of `TokenFilter`.
#[macro_use]
extern crate tantivy;
use tantivy::collector::TopCollector;
@@ -12,7 +11,6 @@ use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
//

View File

@@ -11,10 +11,9 @@
#[macro_use]
extern crate tantivy;
use tantivy::collector::TopCollector;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::query::TermQuery;
// A simple helper function to fetch a single document
// given its id from our index.
@@ -31,7 +30,7 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Op
let mut top_collector = TopCollector::with_limit(1);
searcher.search(&term_query, &mut top_collector)?;
if let Some(doc_address) = top_collector.docs().first() {
if let Some(doc_address) = top_collector.docs().first() {
let doc = searcher.doc(doc_address)?;
Ok(Some(doc))
} else {
@@ -41,7 +40,6 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Op
}
fn main() -> tantivy::Result<()> {
// # Defining the schema
//
// Check out the *basic_search* example if this makes
@@ -126,7 +124,6 @@ fn main() -> tantivy::Result<()> {
isbn => "978-9176370711",
));
// You are guaranteed that your clients will only observe your index in
// the state it was in after a commit.
// In this example, your search engine will at no point be missing the *Frankenstein* document.
@@ -143,4 +140,4 @@ fn main() -> tantivy::Result<()> {
);
Ok(())
}
}

View File

@@ -22,60 +22,60 @@ use tantivy::schema::*;
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the
// sake of this example
let index_path = TempDir::new("tantivy_facet_example_dir")?;
let mut schema_builder = SchemaBuilder::default();
// Let's create a temporary directory for the
// sake of this example
let index_path = TempDir::new("tantivy_facet_example_dir")?;
let mut schema_builder = SchemaBuilder::default();
schema_builder.add_text_field("name", TEXT | STORED);
schema_builder.add_text_field("name", TEXT | STORED);
// this is our faceted field
schema_builder.add_facet_field("tags");
// this is our faceted field
schema_builder.add_facet_field("tags");
let schema = schema_builder.build();
let schema = schema_builder.build();
let index = Index::create_in_dir(&index_path, schema.clone())?;
let index = Index::create_in_dir(&index_path, schema.clone())?;
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
let name = schema.get_field("name").unwrap();
let tags = schema.get_field("tags").unwrap();
let name = schema.get_field("name").unwrap();
let tags = schema.get_field("tags").unwrap();
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
index_writer.add_document(doc!(
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
index_writer.add_document(doc!(
name => "the ditch",
tags => Facet::from("/pools/north")
));
index_writer.add_document(doc!(
index_writer.add_document(doc!(
name => "little stacey",
tags => Facet::from("/pools/south")
));
index_writer.commit()?;
index_writer.commit()?;
index.load_searchers()?;
index.load_searchers()?;
let searcher = index.searcher();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(tags);
facet_collector.add_facet("/pools");
let mut facet_collector = FacetCollector::for_field(tags);
facet_collector.add_facet("/pools");
searcher.search(&AllQuery, &mut facet_collector).unwrap();
searcher.search(&AllQuery, &mut facet_collector).unwrap();
let counts = facet_collector.harvest();
// This lists all of the facet counts
let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect();
assert_eq!(
facets,
vec![
(&Facet::from("/pools/north"), 1),
(&Facet::from("/pools/south"), 1)
]
);
let counts = facet_collector.harvest();
// This lists all of the facet counts
let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect();
assert_eq!(
facets,
vec![
(&Facet::from("/pools/north"), 1),
(&Facet::from("/pools/south"), 1),
]
);
Ok(())
Ok(())
}
use tempdir::TempDir;

View File

@@ -7,18 +7,15 @@
// the list of documents containing a term, getting
// its term frequency, and accessing its positions.
// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::{DocSet, DocId, Postings};
use tantivy::{DocId, DocSet, Postings};
fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the
// example. Check the `basic_search` example for more information.
let mut schema_builder = SchemaBuilder::default();
@@ -47,7 +44,6 @@ fn main() -> tantivy::Result<()> {
// there is actually only one segment here, but let's iterate through the list
// anyway)
for segment_reader in searcher.segment_readers() {
// A segment contains different data structure.
// Inverted index stands for the combination of
// - the term dictionary
@@ -58,19 +54,18 @@ fn main() -> tantivy::Result<()> {
// Let's go through all docs containing the term `title:the` and access their position
let term_the = Term::from_field_text(title, "the");
// This segment posting object is like a cursor over the documents matching the term.
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies
// and positions.
//
// If you don't need all this information, you may get better performance by decompressing less
// information.
if let Some(mut segment_postings) = inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions) {
if let Some(mut segment_postings) =
inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)
{
// this buffer will be used to request for positions
let mut positions: Vec<u32> = Vec::with_capacity(100);
while segment_postings.advance() {
// the number of time the term appears in the document.
let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once.
@@ -98,7 +93,6 @@ fn main() -> tantivy::Result<()> {
}
}
// A `Term` is a text token associated with a field.
// Let's go through all docs containing the term `title:the` and access their position
let term_the = Term::from_field_text(title, "the");
@@ -111,7 +105,6 @@ fn main() -> tantivy::Result<()> {
// Also, for some VERY specific high performance use case like an OLAP analysis of logs,
// you can get better performance by accessing directly the blocks of doc ids.
for segment_reader in searcher.segment_readers() {
// A segment contains different data structure.
// Inverted index stands for the combination of
// - the term dictionary
@@ -124,7 +117,9 @@ fn main() -> tantivy::Result<()> {
//
// If you don't need all this information, you may get better performance by decompressing less
// information.
if let Some(mut block_segment_postings) = inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic) {
if let Some(mut block_segment_postings) =
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
{
while block_segment_postings.advance() {
// Once again these docs MAY contains deleted documents as well.
let docs = block_segment_postings.docs();
@@ -136,4 +131,3 @@ fn main() -> tantivy::Result<()> {
Ok(())
}

View File

@@ -66,6 +66,6 @@ fn main() -> tantivy::Result<()> {
println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
println!("snippet: {}", snippet.to_html());
}
Ok(())
}

View File

@@ -22,59 +22,59 @@ use tantivy::tokenizer::*;
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search`
let mut schema_builder = SchemaBuilder::default();
// this example assumes you understand the content in `basic_search`
let mut schema_builder = SchemaBuilder::default();
// This configures your custom options for how tantivy will
// store and process your content in the index; The key
// to note is that we are setting the tokenizer to `stoppy`
// which will be defined and registered below.
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
// This configures your custom options for how tantivy will
// store and process your content in the index; The key
// to note is that we are setting the tokenizer to `stoppy`
// which will be defined and registered below.
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
// Our first field is title.
schema_builder.add_text_field("title", text_options);
// Our first field is title.
schema_builder.add_text_field("title", text_options);
// Our second field is body.
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
schema_builder.add_text_field("body", text_options);
// Our second field is body.
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
schema_builder.add_text_field("body", text_options);
let schema = schema_builder.build();
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema.clone());
// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = SimpleTokenizer
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]));
// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = SimpleTokenizer
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]));
index.tokenizers().register("stoppy", tokenizer);
index.tokenizers().register("stoppy", tokenizer);
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
index_writer.add_document(doc!(
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
index_writer.add_document(doc!(
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
@@ -86,7 +86,7 @@ fn main() -> tantivy::Result<()> {
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
index_writer.add_document(doc!(
title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
@@ -94,28 +94,28 @@ fn main() -> tantivy::Result<()> {
increasing confidence in the success of my undertaking."
));
index_writer.commit()?;
index_writer.commit()?;
index.load_searchers()?;
index.load_searchers()?;
let searcher = index.searcher();
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, body]);
let query_parser = QueryParser::for_index(&index, vec![title, body]);
// stop words are applied on the query as well.
// The following will be equivalent to `title:frankenstein`
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
// stop words are applied on the query as well.
// The following will be equivalent to `title:frankenstein`
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
let mut top_collector = TopCollector::with_limit(10);
let mut top_collector = TopCollector::with_limit(10);
searcher.search(&*query, &mut top_collector)?;
searcher.search(&*query, &mut top_collector)?;
let doc_addresses = top_collector.docs();
let doc_addresses = top_collector.docs();
for doc_address in doc_addresses {
let retrieved_doc = searcher.doc(&doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
for doc_address in doc_addresses {
let retrieved_doc = searcher.doc(&doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())
}
Ok(())
}

View File

@@ -342,16 +342,19 @@ impl FacetCollector {
pub fn harvest(mut self) -> FacetCounts {
self.finalize_segment();
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
let collapsed_facet_ords: Vec<&[u64]> = self
.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_ords[..])
.collect();
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
let collapsed_facet_counts: Vec<&[u64]> = self
.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_counts[..])
.collect();
let facet_streams = self.segment_counters
let facet_streams = self
.segment_counters
.iter()
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
.collect::<Vec<_>>();
@@ -402,7 +405,8 @@ impl Collector for FacetCollector {
fn collect(&mut self, doc: DocId, _: Score) {
let facet_reader: &mut FacetReader = unsafe {
&mut *self.ff_reader
&mut *self
.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.get()
@@ -476,9 +480,8 @@ impl FacetCounts {
heap.push(Hit { count, facet });
}
let mut lowest_count: u64 = heap.peek().map(|hit| hit.count)
.unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value
// is never used in that case.
let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value
// is never used in that case.
for (facet, count) in it {
if count > lowest_count {
@@ -619,7 +622,13 @@ mod tests {
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.map(|mut doc| { doc.add_facet(facet_field, &format!("/facet/{}", thread_rng().sample(&uniform) )); doc})
.map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(&uniform)),
);
doc
})
.collect();
thread_rng().shuffle(&mut docs[..]);

View File

@@ -1,8 +1,8 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use DocAddress;
use DocId;
use SegmentLocalId;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
/// Contains a feature (field, score, etc.) of a document along with the document address.
///
@@ -139,9 +139,9 @@ impl<T: PartialOrd + Clone> TopCollector<T> {
#[cfg(test)]
mod tests {
use super::*;
use DocId;
use Score;
use super::*;
#[test]
fn test_top_collector_not_at_capacity() {

View File

@@ -1,13 +1,13 @@
use super::Collector;
use collector::top_collector::TopCollector;
use DocAddress;
use DocId;
use fastfield::FastFieldReader;
use fastfield::FastValue;
use schema::Field;
use DocAddress;
use DocId;
use Result;
use Score;
use SegmentReader;
use super::Collector;
use schema::Field;
/// The Top Field Collector keeps track of the K documents
/// sorted by a fast field in the index
@@ -142,16 +142,16 @@ impl<T: FastValue + PartialOrd + Clone> Collector for TopFieldCollector<T> {
#[cfg(test)]
mod tests {
use Index;
use IndexWriter;
use TantivyError;
use super::*;
use query::Query;
use query::QueryParser;
use schema::{FAST, SchemaBuilder, TEXT};
use schema::Field;
use schema::IntOptions;
use schema::Schema;
use super::*;
use schema::{SchemaBuilder, FAST, TEXT};
use Index;
use IndexWriter;
use TantivyError;
const TITLE: &str = "title";
const SIZE: &str = "size";

View File

@@ -1,3 +1,4 @@
use super::Collector;
use collector::top_collector::TopCollector;
use DocAddress;
use DocId;
@@ -5,7 +6,6 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use super::Collector;
/// The Top Score Collector keeps track of the K documents
/// sorted by their score.
@@ -131,10 +131,10 @@ impl Collector for TopScoreCollector {
#[cfg(test)]
mod tests {
use super::*;
use collector::Collector;
use DocId;
use Score;
use super::*;
#[test]
fn test_top_collector_not_at_capacity() {

View File

@@ -72,7 +72,8 @@ impl<W: Write> CompositeWrite<W> {
let footer_offset = self.write.written_bytes();
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
let mut offset_fields: Vec<_> = self.offsets
let mut offset_fields: Vec<_> = self
.offsets
.iter()
.map(|(file_addr, offset)| (*offset, *file_addr))
.collect();

View File

@@ -10,8 +10,6 @@ pub struct VInt(pub u64);
const STOP_BIT: u8 = 128;
impl VInt {
pub fn val(&self) -> u64 {
self.0
}
@@ -20,14 +18,13 @@ impl VInt {
VInt::deserialize(reader).map(|vint| vint.0)
}
pub fn serialize_into_vec(&self, output: &mut Vec<u8>){
pub fn serialize_into_vec(&self, output: &mut Vec<u8>) {
let mut buffer = [0u8; 10];
let num_bytes = self.serialize_into(&mut buffer);
output.extend(&buffer[0..num_bytes]);
}
fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
let mut remaining = self.0;
for (i, b) in buffer.iter_mut().enumerate() {
let next_byte: u8 = (remaining % 128u64) as u8;
@@ -74,7 +71,6 @@ impl BinarySerializable for VInt {
}
}
#[cfg(test)]
mod tests {
@@ -89,10 +85,10 @@ mod tests {
}
assert!(num_bytes > 0);
if num_bytes < 10 {
assert!(1u64 << (7*num_bytes) > val);
assert!(1u64 << (7 * num_bytes) > val);
}
if num_bytes > 1 {
assert!(1u64 << (7*(num_bytes-1)) <= val);
assert!(1u64 << (7 * (num_bytes - 1)) <= val);
}
let serdeser_val = VInt::deserialize(&mut &v[..]).unwrap();
assert_eq!(val, serdeser_val.0);
@@ -105,11 +101,11 @@ mod tests {
aux_test_vint(5);
aux_test_vint(u64::max_value());
for i in 1..9 {
let power_of_128 = 1u64 << (7*i);
let power_of_128 = 1u64 << (7 * i);
aux_test_vint(power_of_128 - 1u64);
aux_test_vint(power_of_128 );
aux_test_vint(power_of_128);
aux_test_vint(power_of_128 + 1u64);
}
aux_test_vint(10);
}
}
}

View File

@@ -1,36 +1,36 @@
use core::SegmentId;
use error::TantivyError;
use schema::Schema;
use serde_json;
use std::borrow::BorrowMut;
use std::fmt;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use Result;
use indexer::LockType;
use super::pool::LeasedItem;
use super::pool::Pool;
use super::segment::create_segment;
use super::segment::Segment;
use core::searcher::Searcher;
use core::IndexMeta;
use core::SegmentId;
use core::SegmentMeta;
use core::SegmentReader;
use core::META_FILEPATH;
use directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use directory::{ManagedDirectory};
use error::TantivyError;
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
use indexer::segment_updater::save_new_metas;
use indexer::LockType;
use num_cpus;
use schema::Field;
use schema::FieldType;
use schema::Schema;
use serde_json;
use std::borrow::BorrowMut;
use std::fmt;
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use tokenizer::BoxedTokenizer;
use tokenizer::TokenizerManager;
use IndexWriter;
use schema::FieldType;
use schema::Field;
use tokenizer::BoxedTokenizer;
use Result;
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
@@ -115,31 +115,24 @@ impl Index {
&self.tokenizers
}
/// Helper to access the tokenizer associated to a specific field.
pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers();
let tokenizer_name_opt: Option<Box<BoxedTokenizer>> =
match field_type {
FieldType::Str(text_options) => {
text_options
.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
},
_ => {
None
}
};
let tokenizer_name_opt: Option<Box<BoxedTokenizer>> = match field_type {
FieldType::Str(text_options) => text_options
.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
_ => None,
};
match tokenizer_name_opt {
Some(tokenizer) => {
Ok(tokenizer)
}
None => {
Err(TantivyError:: SchemaError(format!("{:?} is not a text field.", field_entry.name())))
}
Some(tokenizer) => Ok(tokenizer),
None => Err(TantivyError::SchemaError(format!(
"{:?} is not a text field.",
field_entry.name()
))),
}
}
@@ -186,7 +179,6 @@ impl Index {
num_threads: usize,
overall_heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
open_index_writer(
@@ -225,7 +217,8 @@ impl Index {
/// Returns the list of segments that are searchable
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
Ok(self.searchable_segment_metas()?
Ok(self
.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect())
@@ -260,7 +253,8 @@ impl Index {
/// Returns the list of segment ids that are searchable.
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
Ok(self.searchable_segment_metas()?
Ok(self
.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect())
@@ -332,11 +326,10 @@ impl Clone for Index {
}
}
#[cfg(test)]
mod tests {
use schema::{SchemaBuilder, INT_INDEXED, TEXT};
use Index;
use schema::{SchemaBuilder, TEXT, INT_INDEXED};
#[test]
fn test_indexer_for_field() {
@@ -352,5 +345,4 @@ mod tests {
);
}
}
}

View File

@@ -1,13 +1,13 @@
use common::BinarySerializable;
use directory::ReadOnlySource;
use owned_read::OwnedRead;
use positions::PositionReader;
use postings::TermInfo;
use postings::{BlockSegmentPostings, SegmentPostings};
use schema::FieldType;
use schema::IndexRecordOption;
use schema::Term;
use termdict::TermDictionary;
use owned_read::OwnedRead;
use positions::PositionReader;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.
@@ -100,7 +100,6 @@ impl InvertedIndexReader {
block_postings.reset(term_info.doc_freq, postings_reader);
}
/// Returns a block postings given a `Term`.
/// This method is for an advanced usage only.
///
@@ -111,7 +110,7 @@ impl InvertedIndexReader {
option: IndexRecordOption,
) -> Option<BlockSegmentPostings> {
self.get_term_info(term)
.map(move|term_info| self.read_block_postings_from_terminfo(&term_info, option))
.map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
}
/// Returns a block postings given a `term_info`.
@@ -147,7 +146,8 @@ impl InvertedIndexReader {
if option.has_positions() {
let position_reader = self.positions_source.clone();
let skip_reader = self.positions_idx_source.clone();
let position_reader = PositionReader::new(position_reader, skip_reader, term_info.positions_idx);
let position_reader =
PositionReader::new(position_reader, skip_reader, term_info.positions_idx);
Some(position_reader)
} else {
None

View File

@@ -87,7 +87,8 @@ impl<T> Deref for LeasedItem<T> {
type Target = T;
fn deref(&self) -> &T {
&self.gen_item
&self
.gen_item
.as_ref()
.expect("Unwrapping a leased item should never fail")
.item // unwrap is safe here
@@ -96,7 +97,8 @@ impl<T> Deref for LeasedItem<T> {
impl<T> DerefMut for LeasedItem<T> {
fn deref_mut(&mut self) -> &mut T {
&mut self.gen_item
&mut self
.gen_item
.as_mut()
.expect("Unwrapping a mut leased item should never fail")
.item // unwrap is safe here

View File

@@ -9,8 +9,8 @@ use std::fmt;
use std::sync::Arc;
use termdict::TermMerger;
use DocAddress;
use Result;
use Index;
use Result;
/// Holds a list of `SegmentReader`s ready for search.
///
@@ -25,7 +25,11 @@ pub struct Searcher {
impl Searcher {
/// Creates a new `Searcher`
pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec<SegmentReader>) -> Searcher {
pub(crate) fn new(
schema: Schema,
index: Index,
segment_readers: Vec<SegmentReader>,
) -> Searcher {
Searcher {
schema,
index,
@@ -87,7 +91,8 @@ impl Searcher {
/// Return the field searcher associated to a `Field`.
pub fn field(&self, field: Field) -> FieldSearcher {
let inv_index_readers = self.segment_readers
let inv_index_readers = self
.segment_readers
.iter()
.map(|segment_reader| segment_reader.inverted_index(field))
.collect::<Vec<_>>();
@@ -107,7 +112,8 @@ impl FieldSearcher {
/// Returns a Stream over all of the sorted unique terms of
/// for the given field.
pub fn terms(&self) -> TermMerger {
let term_streamers: Vec<_> = self.inv_index_readers
let term_streamers: Vec<_> = self
.inv_index_readers
.iter()
.map(|inverted_index| inverted_index.terms().stream())
.collect();
@@ -117,7 +123,8 @@ impl FieldSearcher {
impl fmt::Debug for Searcher {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let segment_ids = self.segment_readers
let segment_ids = self
.segment_readers
.iter()
.map(|segment_reader| segment_reader.segment_id())
.collect::<Vec<_>>();

View File

@@ -157,11 +157,13 @@ impl SegmentReader {
&FieldType::Bytes => {}
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
}
let idx_reader = self.fast_fields_composite
let idx_reader = self
.fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let values = self.fast_fields_composite
let values = self
.fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
Ok(BytesFastFieldReader::open(idx_reader, values))
@@ -285,7 +287,8 @@ impl SegmentReader {
/// term dictionary associated to a specific field,
/// and opening the posting list associated to any term.
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
if let Some(inv_idx_reader) = self.inv_idx_reader_cache
if let Some(inv_idx_reader) = self
.inv_idx_reader_cache
.read()
.expect("Lock poisoned. This should never happen")
.get(&field)
@@ -314,15 +317,18 @@ impl SegmentReader {
let postings_source = postings_source_opt.unwrap();
let termdict_source = self.termdict_composite
let termdict_source = self
.termdict_composite
.open_read(field)
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
let positions_source = self.positions_composite
let positions_source = self
.positions_composite
.open_read(field)
.expect("Index corrupted. Failed to open field positions in composite file.");
let positions_idx_source = self.positions_idx_composite
let positions_idx_source = self
.positions_idx_composite
.open_read(field)
.expect("Index corrupted. Failed to open field positions in composite file.");
@@ -435,7 +441,6 @@ mod test {
use schema::{SchemaBuilder, Term, STORED, TEXT};
use DocId;
#[test]
fn test_alive_docs_iterator() {
let mut schema_builder = SchemaBuilder::new();

View File

@@ -77,15 +77,15 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// DirectoryClone
pub trait DirectoryClone {
/// Clones the directory and boxes the clone
fn box_clone(&self) -> Box<Directory>;
/// Clones the directory and boxes the clone
fn box_clone(&self) -> Box<Directory>;
}
impl<T> DirectoryClone for T
where
T: 'static + Directory + Clone,
T: 'static + Directory + Clone,
{
fn box_clone(&self) -> Box<Directory> {
Box::new(self.clone())
}
fn box_clone(&self) -> Box<Directory> {
Box::new(self.clone())
}
}

View File

@@ -2,6 +2,7 @@ use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use error::TantivyError;
use indexer::LockType;
use serde_json;
use std::collections::HashSet;
use std::io;
@@ -12,9 +13,6 @@ use std::sync::RwLockWriteGuard;
use std::sync::{Arc, RwLock};
use Directory;
use Result;
use indexer::LockType;
/// Returns true iff the file is "managed".
/// Non-managed file are not subject to garbage collection.
@@ -108,7 +106,8 @@ impl ManagedDirectory {
//
// releasing the lock as .delete() will use it too.
{
let meta_informations_rlock = self.meta_informations
let meta_informations_rlock = self
.meta_informations
.read()
.expect("Managed directory rlock poisoned in garbage collect.");
@@ -157,7 +156,8 @@ impl ManagedDirectory {
if !deleted_files.is_empty() {
// update the list of managed files by removing
// the file that were removed.
let mut meta_informations_wlock = self.meta_informations
let mut meta_informations_wlock = self
.meta_informations
.write()
.expect("Managed directory wlock poisoned (2).");
{
@@ -186,9 +186,10 @@ impl ManagedDirectory {
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
// Files starting by "." (e.g. lock files) are not managed.
if !is_managed(filepath) {
return Ok(());
return Ok(());
}
let mut meta_wlock = self.meta_informations
let mut meta_wlock = self
.meta_informations
.write()
.expect("Managed file lock poisoned");
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());

View File

@@ -32,7 +32,8 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
}
})?;
let meta_data = file.metadata()
let meta_data = file
.metadata()
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
@@ -309,7 +310,8 @@ impl Directory for MmapDirectory {
// when the last reference is gone.
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => self.sync_directory()
Ok(_) => self
.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {

View File

@@ -170,7 +170,8 @@ impl Directory for RAMDirectory {
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
let exists = self.fs
let exists = self
.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
// force the creation of the file to mimic the MMap directory.
@@ -195,9 +196,10 @@ impl Directory for RAMDirectory {
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
fail_point!("RAMDirectory::atomic_write", |msg| {
Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string())))
});
fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new(
io::ErrorKind::Other,
msg.unwrap_or("Undefined".to_string())
)));
let path_buf = PathBuf::from(path);
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
self.fs.write(path_buf, &Vec::new())?;

View File

@@ -5,7 +5,6 @@ use fst::raw::MmapReadOnly;
use stable_deref_trait::{CloneStableDeref, StableDeref};
use std::ops::Deref;
/// Read object that represents files in tantivy.
///
/// These read objects are only in charge to deliver

View File

@@ -4,9 +4,9 @@ use std::io;
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use fastfield::FastFieldNotAvailableError;
use indexer::LockType;
use query;
use schema;
use indexer::LockType;
use serde_json;
use std::path::PathBuf;
use std::sync::PoisonError;
@@ -21,7 +21,10 @@ pub enum TantivyError {
#[fail(display = "file already exists: '{:?}'", _0)]
FileAlreadyExists(PathBuf),
/// Failed to acquire file lock
#[fail(display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.", _0)]
#[fail(
display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
_0
)]
LockFailure(LockType),
/// IO Error.
#[fail(display = "an IO error occurred: '{}'", _0)]
@@ -95,14 +98,13 @@ impl From<schema::DocParsingError> for TantivyError {
}
}
impl From<OpenWriteError> for TantivyError {
fn from(error: OpenWriteError) -> TantivyError {
match error {
OpenWriteError::FileAlreadyExists(filepath) =>
TantivyError::FileAlreadyExists(filepath),
OpenWriteError::IOError(io_error) =>
TantivyError::IOError(io_error),
OpenWriteError::FileAlreadyExists(filepath) => {
TantivyError::FileAlreadyExists(filepath)
}
OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
}.into()
}
}

View File

@@ -41,7 +41,8 @@ pub struct DeleteBitSet {
impl DeleteBitSet {
/// Opens a delete bitset given its data source.
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
let num_deleted: usize = data.as_slice()
let num_deleted: usize = data
.as_slice()
.iter()
.map(|b| b.count_ones() as usize)
.sum();

View File

@@ -56,7 +56,8 @@ impl FacetReader {
/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
let found_term = self.term_dict
let found_term = self
.term_dict
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
assert!(found_term, "Term ordinal {} no found.", facet_ord);
}

View File

@@ -132,7 +132,8 @@ impl MultiValueIntFastFieldWriter {
);
let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
for (start, stop) in self.doc_index
for (start, stop) in self
.doc_index
.windows(2)
.map(|interval| (interval[0], interval[1]))
.chain(Some(last_interval).into_iter())
@@ -148,7 +149,6 @@ impl MultiValueIntFastFieldWriter {
value_serializer.add_val(val)?;
}
}
}
None => {
let val_min_max = self.vals.iter().cloned().minmax();

View File

@@ -1,8 +1,8 @@
use rand::thread_rng;
use std::collections::HashSet;
use rand::Rng;
use rand::distributions::Range;
use rand::Rng;
use schema::*;
use Index;
use Searcher;

View File

@@ -52,7 +52,8 @@ impl DeleteQueue {
//
// Past delete operations are not accessible.
pub fn cursor(&self) -> DeleteCursor {
let last_block = self.inner
let last_block = self
.inner
.read()
.expect("Read lock poisoned when opening delete queue cursor")
.last_block
@@ -92,7 +93,8 @@ impl DeleteQueue {
// be some unflushed operations.
//
fn flush(&self) -> Option<Arc<Block>> {
let mut self_wlock = self.inner
let mut self_wlock = self
.inner
.write()
.expect("Failed to acquire write lock on delete queue writer");
@@ -132,7 +134,8 @@ impl From<DeleteQueue> for NextBlock {
impl NextBlock {
fn next_block(&self) -> Option<Arc<Block>> {
{
let next_read_lock = self.0
let next_read_lock = self
.0
.read()
.expect("Failed to acquire write lock in delete queue");
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
@@ -141,7 +144,8 @@ impl NextBlock {
}
let next_block;
{
let mut next_write_lock = self.0
let mut next_write_lock = self
.0
.write()
.expect("Failed to acquire write lock in delete queue");
match *next_write_lock {

View File

@@ -1,10 +1,10 @@
use directory::error::OpenWriteError;
use Directory;
use TantivyError;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;
use std::io::Write;
use Directory;
use TantivyError;
#[derive(Debug, Clone, Copy)]
pub enum LockType {
@@ -29,10 +29,9 @@ pub enum LockType {
/// is very simplistic. We retry after `100ms` until we effectively
/// acquire the lock.
/// This lock should not have much contention in normal usage.
MetaLock
MetaLock,
}
/// Retry the logic of acquiring locks is pretty simple.
/// We just retry `n` times after a given `duratio`, both
/// depending on the type of lock.
@@ -49,7 +48,7 @@ impl RetryPolicy {
}
}
fn wait_and_retry(&mut self,) -> bool {
fn wait_and_retry(&mut self) -> bool {
if self.num_retries == 0 {
false
} else {
@@ -58,35 +57,26 @@ impl RetryPolicy {
thread::sleep(wait_duration);
true
}
}
}
impl LockType {
fn retry_policy(&self) -> RetryPolicy {
match *self {
LockType::IndexWriterLock =>
RetryPolicy::no_retry(),
LockType::MetaLock =>
RetryPolicy {
num_retries: 100,
wait_in_ms: 100,
}
LockType::IndexWriterLock => RetryPolicy::no_retry(),
LockType::MetaLock => RetryPolicy {
num_retries: 100,
wait_in_ms: 100,
},
}
}
fn try_acquire_lock(&self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
let path = self.filename();
let mut write = directory
.open_write(path)
.map_err(|e|
match e {
OpenWriteError::FileAlreadyExists(_) =>
TantivyError::LockFailure(*self),
OpenWriteError::IOError(io_error) =>
TantivyError::IOError(io_error),
})?;
let mut write = directory.open_write(path).map_err(|e| match e {
OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(*self),
OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
})?;
write.flush()?;
Ok(DirectoryLock {
directory: directory.box_clone(),
@@ -94,7 +84,6 @@ impl LockType {
})
}
/// Acquire a lock in the given directory.
pub fn acquire_lock(&self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
let mut box_directory = directory.box_clone();
@@ -110,25 +99,19 @@ impl LockType {
return Err(TantivyError::LockFailure(filepath.to_owned()));
}
}
Err(_) => {
}
Err(_) => {}
}
}
}
fn filename(&self) -> &Path {
match *self {
LockType::MetaLock => {
Path::new(".tantivy-meta.lock")
}
LockType::IndexWriterLock => {
Path::new(".tantivy-indexer.lock")
}
LockType::MetaLock => Path::new(".tantivy-meta.lock"),
LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"),
}
}
}
/// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](struct.LockType.html)
///

View File

@@ -347,7 +347,8 @@ impl IndexWriter {
}
drop(self.workers_join_handle);
let result = self.segment_updater
let result = self
.segment_updater
.wait_merging_thread()
.map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
@@ -494,7 +495,8 @@ impl IndexWriter {
let document_receiver = self.document_receiver.clone();
// take the directory lock to create a new index_writer.
let directory_lock = self._directory_lock
let directory_lock = self
._directory_lock
.take()
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
@@ -678,7 +680,7 @@ mod tests {
let err_msg = err.to_string();
assert!(err_msg.contains("Lockfile"));
assert!(err_msg.contains("Possible causes:"))
},
}
_ => panic!("Expected LockfileAlreadyExists error"),
}
}
@@ -864,8 +866,7 @@ mod tests {
assert_eq!(initial_table_size(1_000_000_000), 19);
}
#[cfg(not(feature="no_fail"))]
#[cfg(not(feature = "no_fail"))]
#[test]
fn test_write_commit_fails() {
use fail;
@@ -874,7 +875,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for _ in 0..100 {
for _ in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
}
index_writer.commit().unwrap();

View File

@@ -21,17 +21,17 @@ pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug {
/// MergePolicyClone
pub trait MergePolicyClone {
/// Returns a boxed clone of the MergePolicy.
fn box_clone(&self) -> Box<MergePolicy>;
/// Returns a boxed clone of the MergePolicy.
fn box_clone(&self) -> Box<MergePolicy>;
}
impl<T> MergePolicyClone for T
where
T: 'static + MergePolicy + Clone,
T: 'static + MergePolicy + Clone,
{
fn box_clone(&self) -> Box<MergePolicy> {
Box::new(self.clone())
}
fn box_clone(&self) -> Box<MergePolicy> {
Box::new(self.clone())
}
}
/// Never merge segments.

View File

@@ -440,7 +440,8 @@ impl IndexMerger {
) -> Result<Option<TermOrdinalMapping>> {
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
let mut delta_computer = DeltaComputer::new();
let field_readers = self.readers
let field_readers = self
.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
.collect::<Vec<_>>();

View File

@@ -51,7 +51,8 @@ impl SegmentRegister {
}
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
let mut segment_ids: Vec<SegmentMeta> = self
.segment_states
.values()
.map(|segment_entry| segment_entry.meta().clone())
.collect();

View File

@@ -143,6 +143,7 @@ extern crate fst;
extern crate fst_regex;
extern crate futures;
extern crate futures_cpupool;
extern crate htmlescape;
extern crate itertools;
extern crate levenshtein_automata;
extern crate num_cpus;
@@ -154,7 +155,6 @@ extern crate stable_deref_trait;
extern crate tempdir;
extern crate tempfile;
extern crate uuid;
extern crate htmlescape;
#[cfg(test)]
#[macro_use]
@@ -183,7 +183,7 @@ mod macros;
pub use error::TantivyError;
#[deprecated(since="0.7.0", note="please use `tantivy::TantivyError` instead")]
#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
pub use error::TantivyError as Error;
extern crate census;
@@ -951,4 +951,3 @@ mod tests {
}
}
}

View File

@@ -1,4 +1,3 @@
/// Positions are stored in three parts and over two files.
//
/// The `SegmentComponent::POSITIONS` file contains all of the bitpacked positions delta,
@@ -24,13 +23,12 @@
/// The long skip structure makes it possible to skip rapidly to the a checkpoint close to this
/// value, and then skip normally.
///
mod reader;
mod serializer;
pub use self::reader::PositionReader;
pub use self::serializer::PositionSerializer;
use bitpacking::{BitPacker4x, BitPacker};
use bitpacking::{BitPacker, BitPacker4x};
const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
@@ -43,10 +41,10 @@ lazy_static! {
#[cfg(test)]
pub mod tests {
use std::iter;
use super::{PositionSerializer, PositionReader};
use super::{PositionReader, PositionSerializer};
use directory::ReadOnlySource;
use positions::COMPRESSION_BLOCK_SIZE;
use std::iter;
fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) {
let mut skip_buffer = vec![];
@@ -59,7 +57,10 @@ pub mod tests {
}
serializer.close().unwrap();
}
(ReadOnlySource::from(stream_buffer), ReadOnlySource::from(skip_buffer))
(
ReadOnlySource::from(stream_buffer),
ReadOnlySource::from(skip_buffer),
)
}
#[test]
@@ -103,7 +104,7 @@ pub mod tests {
assert_eq!(skip.len(), 12);
assert_eq!(stream.len(), 1168);
let mut position_reader = PositionReader::new(stream,skip, 0u64);
let mut position_reader = PositionReader::new(stream, skip, 0u64);
let mut buf = [0u32; 7];
let mut c = 0;
for _ in 0..100 {
@@ -125,7 +126,7 @@ pub mod tests {
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 1_000_000);
let mut position_reader = PositionReader::new(stream,skip, 128 * 1024);
let mut position_reader = PositionReader::new(stream, skip, 128 * 1024);
let mut buf = [0u32; 1];
position_reader.read(&mut buf);
assert_eq!(buf[0], CONST_VAL);
@@ -137,12 +138,17 @@ pub mod tests {
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 4_987_872);
for &offset in &[10, 128 * 1024, 128 * 1024 - 1, 128 * 1024 + 7, 128 * 10 * 1024 + 10] {
let mut position_reader = PositionReader::new(stream.clone(),skip.clone(), offset);
for &offset in &[
10,
128 * 1024,
128 * 1024 - 1,
128 * 1024 + 7,
128 * 10 * 1024 + 10,
] {
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), offset);
let mut buf = [0u32; 1];
position_reader.read(&mut buf);
assert_eq!(buf[0], offset as u32);
}
}
}

View File

@@ -1,12 +1,12 @@
use bitpacking::{BitPacker4x, BitPacker};
use owned_read::OwnedRead;
use common::{BinarySerializable, FixedSize};
use postings::compression::compressed_block_size;
use directory::ReadOnlySource;
use positions::COMPRESSION_BLOCK_SIZE;
use positions::LONG_SKIP_IN_BLOCKS;
use positions::LONG_SKIP_INTERVAL;
use super::BIT_PACKER;
use bitpacking::{BitPacker, BitPacker4x};
use common::{BinarySerializable, FixedSize};
use directory::ReadOnlySource;
use owned_read::OwnedRead;
use positions::COMPRESSION_BLOCK_SIZE;
use positions::LONG_SKIP_INTERVAL;
use positions::LONG_SKIP_IN_BLOCKS;
use postings::compression::compressed_block_size;
pub struct PositionReader {
skip_read: OwnedRead,
@@ -18,7 +18,6 @@ pub struct PositionReader {
// of the block of the next int to read.
}
// `ahead` represents the offset of the block currently loaded
// compared to the cursor of the actual stream.
//
@@ -32,7 +31,8 @@ fn read_impl(
buffer: &mut [u32; 128],
mut inner_offset: usize,
num_bits: &[u8],
output: &mut [u32]) -> usize {
output: &mut [u32],
) -> usize {
let mut output_start = 0;
let mut output_len = output.len();
let mut ahead = 0;
@@ -47,8 +47,7 @@ fn read_impl(
output_start += available_len;
inner_offset = 0;
let num_bits = num_bits[ahead];
BitPacker4x::new()
.decompress(position, &mut buffer[..], num_bits);
BitPacker4x::new().decompress(position, &mut buffer[..], num_bits);
let block_len = compressed_block_size(num_bits);
position = &position[block_len..];
ahead += 1;
@@ -56,11 +55,12 @@ fn read_impl(
}
}
impl PositionReader {
pub fn new(position_source: ReadOnlySource,
skip_source: ReadOnlySource,
offset: u64) -> PositionReader {
pub fn new(
position_source: ReadOnlySource,
skip_source: ReadOnlySource,
offset: u64,
) -> PositionReader {
let skip_len = skip_source.len();
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
@@ -70,7 +70,8 @@ impl PositionReader {
let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize;
let offset_num_bytes: u64 = {
if long_skip_id > 0 {
let mut long_skip_blocks: &[u8] = &long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
let mut long_skip_blocks: &[u8] =
&long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16
} else {
0
@@ -79,13 +80,13 @@ impl PositionReader {
let mut position_read = OwnedRead::new(position_source);
position_read.advance(offset_num_bytes as usize);
let mut skip_read = OwnedRead::new(skip_body);
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
let mut position_reader = PositionReader {
skip_read,
position_read,
inner_offset: 0,
buffer: Box::new([0u32; 128]),
ahead: None
ahead: None,
};
position_reader.skip(small_skip);
position_reader
@@ -108,7 +109,8 @@ impl PositionReader {
self.buffer.as_mut(),
self.inner_offset,
&skip_data[1..],
output));
output,
));
}
/// Skip the next `skip_len` integer.
@@ -118,23 +120,20 @@ impl PositionReader {
///
/// May panic if the end of the stream is reached.
pub fn skip(&mut self, skip_len: usize) {
let skip_len_plus_inner_offset = skip_len + self.inner_offset;
let num_blocks_to_advance = skip_len_plus_inner_offset / COMPRESSION_BLOCK_SIZE;
self.inner_offset = skip_len_plus_inner_offset % COMPRESSION_BLOCK_SIZE;
self.ahead = self.ahead
.and_then(|num_blocks| {
if num_blocks >= num_blocks_to_advance {
Some(num_blocks_to_advance - num_blocks_to_advance)
} else {
None
}
});
self.ahead = self.ahead.and_then(|num_blocks| {
if num_blocks >= num_blocks_to_advance {
Some(num_blocks_to_advance - num_blocks_to_advance)
} else {
None
}
});
let skip_len = self.skip_read
.as_ref()[..num_blocks_to_advance]
let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance]
.iter()
.cloned()
.map(|num_bit| num_bit as usize)

View File

@@ -1,8 +1,8 @@
use std::io;
use bitpacking::BitPacker;
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
use common::BinarySerializable;
use super::BIT_PACKER;
use bitpacking::BitPacker;
use common::BinarySerializable;
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
use std::io;
pub struct PositionSerializer<W: io::Write> {
write_stream: W,
@@ -23,7 +23,7 @@ impl<W: io::Write> PositionSerializer<W> {
buffer: vec![0u8; 128 * 4],
num_ints: 0u64,
long_skips: Vec::new(),
cumulated_num_bits: 0u64
cumulated_num_bits: 0u64,
}
}
@@ -31,7 +31,6 @@ impl<W: io::Write> PositionSerializer<W> {
self.num_ints
}
fn remaining_block_len(&self) -> usize {
COMPRESSION_BLOCK_SIZE - self.block.len()
}

View File

@@ -28,14 +28,16 @@ impl BlockEncoder {
pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> (u8, &[u8]) {
let num_bits = self.bitpacker.num_bits_sorted(offset, block);
let written_size = self.bitpacker
.compress_sorted(offset, block, &mut self.output[..], num_bits);
let written_size =
self.bitpacker
.compress_sorted(offset, block, &mut self.output[..], num_bits);
(num_bits, &self.output[..written_size])
}
pub fn compress_block_unsorted(&mut self, block: &[u32]) -> (u8, &[u8]) {
let num_bits = self.bitpacker.num_bits(block);
let written_size = self.bitpacker
let written_size = self
.bitpacker
.compress(block, &mut self.output[..], num_bits);
(num_bits, &self.output[..written_size])
}
@@ -62,19 +64,21 @@ impl BlockDecoder {
}
}
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32, num_bits: u8) -> usize {
pub fn uncompress_block_sorted(
&mut self,
compressed_data: &[u8],
offset: u32,
num_bits: u8,
) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker.decompress_sorted(
offset,
&compressed_data,
&mut self.output,
num_bits,
)
self.bitpacker
.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
}
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker.decompress(&compressed_data, &mut self.output, num_bits)
self.bitpacker
.decompress(&compressed_data, &mut self.output, num_bits)
}
#[inline]
@@ -88,7 +92,6 @@ impl BlockDecoder {
}
}
pub trait VIntEncoder {
/// Compresses an array of `u32` integers,
/// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_ encoding)

View File

@@ -1,9 +1,5 @@
#[inline(always)]
pub fn compress_sorted<'a>(
input: &[u32],
output: &'a mut [u8],
mut offset: u32,
) -> &'a [u8] {
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v - offset;
@@ -46,11 +42,7 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
}
#[inline(always)]
pub fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32,
) -> usize {
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) -> usize {
let mut read_byte = 0;
let mut result = offset;
let num_els = output.len();

View File

@@ -2,6 +2,7 @@
Postings module (also called inverted index)
*/
pub(crate) mod compression;
/// Postings module
///
/// Postings, also called inverted lists, is the key datastructure
@@ -11,18 +12,17 @@ mod postings_writer;
mod recorder;
mod segment_postings;
mod serializer;
pub(crate) mod compression;
mod skip;
mod stacker;
mod term_info;
mod skip;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
use self::compression::COMPRESSION_BLOCK_SIZE;
pub use self::postings::Postings;
pub use self::term_info::TermInfo;
pub(crate) use self::skip::SkipReader;
use self::compression::{COMPRESSION_BLOCK_SIZE};
pub use self::term_info::TermInfo;
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
@@ -71,8 +71,7 @@ pub mod tests {
let mut segment = index.new_segment();
let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap();
{
let mut field_serializer = posting_serializer
.new_field(text_field, 120 * 4).unwrap();
let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4).unwrap();
field_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..120u32 {
let delta_positions = vec![1, 2, 3, 2];
@@ -512,13 +511,13 @@ pub mod tests {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
for _ in 0..posting_list_size {
let mut doc = Document::default();
if rng.gen_bool(1f64/ 15f64) {
if rng.gen_bool(1f64 / 15f64) {
doc.add_text(text_field, "a");
}
if rng.gen_bool(1f64/ 10f64) {
if rng.gen_bool(1f64 / 10f64) {
doc.add_text(text_field, "b");
}
if rng.gen_bool(1f64/ 5f64) {
if rng.gen_bool(1f64 / 5f64) {
doc.add_text(text_field, "c");
}
doc.add_text(text_field, "d");

View File

@@ -94,7 +94,8 @@ impl MultiFieldPostingsWriter {
&self,
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self.term_index
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
.term_index
.iter()
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
.collect();

View File

@@ -107,7 +107,8 @@ impl Recorder for TermFrequencyRecorder {
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self.stack
let mut doc_iter = self
.stack
.iter(heap)
.chain(Some(self.current_tf).into_iter());

View File

@@ -1,20 +1,20 @@
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use DocId;
use common::BitSet;
use common::HasLen;
use postings::compression::compressed_block_size;
use common::{BinarySerializable, VInt};
use docset::{DocSet, SkipResult};
use fst::Streamer;
use owned_read::OwnedRead;
use positions::PositionReader;
use postings::compression::compressed_block_size;
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use postings::serializer::PostingsSerializer;
use postings::FreqReadingOption;
use postings::Postings;
use owned_read::OwnedRead;
use common::{VInt, BinarySerializable};
use postings::USE_SKIP_INFO_LIMIT;
use postings::SkipReader;
use postings::USE_SKIP_INFO_LIMIT;
use schema::IndexRecordOption;
use positions::PositionReader;
use std::cmp::Ordering;
use DocId;
const EMPTY_ARR: [u8; 0] = [];
@@ -98,7 +98,7 @@ impl SegmentPostings {
docs.len() as u32,
OwnedRead::new(buffer),
IndexRecordOption::Basic,
IndexRecordOption::Basic
IndexRecordOption::Basic,
);
SegmentPostings::from_block_postings(block_segment_postings, None)
}
@@ -151,7 +151,11 @@ fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
/// The target is assumed smaller or equal to the last element.
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(target, block_docs);
start.wrapping_add(block_docs[start..end].binary_search(&target).unwrap_or_else(|e| e))
start.wrapping_add(
block_docs[start..end]
.binary_search(&target)
.unwrap_or_else(|e| e),
)
}
impl DocSet for SegmentPostings {
@@ -179,21 +183,20 @@ impl DocSet for SegmentPostings {
// check if we need to go to the next block
let need_positions = self.position_computer.is_some();
let mut sum_freqs_skipped: u32 = 0;
if !self.block_cursor
.docs()
.last()
.map(|doc| *doc >= target)
.unwrap_or(false) // there should always be at least a document in the block
// since advance returned.
if !self
.block_cursor
.docs()
.last()
.map(|doc| *doc >= target)
.unwrap_or(false)
// there should always be at least a document in the block
// since advance returned.
{
// we are not in the right block.
//
// First compute all of the freqs skipped from the current block.
if need_positions {
sum_freqs_skipped = self.block_cursor
.freqs()[self.cur..]
.iter()
.sum();
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
match self.block_cursor.skip_to(target) {
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
sum_freqs_skipped += block_skip_freqs;
@@ -215,9 +218,13 @@ impl DocSet for SegmentPostings {
let block_docs = self.block_cursor.docs();
debug_assert!(target >= self.doc());
let new_cur = self.cur.wrapping_add(search_within_block(&block_docs[self.cur..], target));
let new_cur = self
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur].iter().sum::<u32>();
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
.sum::<u32>();
self.position_computer
.as_mut()
.unwrap()
@@ -330,7 +337,10 @@ pub struct BlockSegmentPostings {
skip_reader: SkipReader,
}
fn split_into_skips_and_postings(doc_freq: u32, mut data: OwnedRead) -> (Option<OwnedRead>, OwnedRead) {
fn split_into_skips_and_postings(
doc_freq: u32,
mut data: OwnedRead,
) -> (Option<OwnedRead>, OwnedRead) {
if doc_freq >= USE_SKIP_INFO_LIMIT {
let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
let mut postings_data = data.clone();
@@ -345,7 +355,7 @@ fn split_into_skips_and_postings(doc_freq: u32, mut data: OwnedRead) -> (Option<
#[derive(Debug, Eq, PartialEq)]
pub enum BlockSegmentPostingsSkipResult {
Terminated,
Success(u32) //< number of term freqs to skip
Success(u32), //< number of term freqs to skip
}
impl BlockSegmentPostings {
@@ -353,7 +363,7 @@ impl BlockSegmentPostings {
doc_freq: u32,
data: OwnedRead,
record_option: IndexRecordOption,
requested_option: IndexRecordOption
requested_option: IndexRecordOption,
) -> BlockSegmentPostings {
let freq_reading_option = match (record_option, requested_option) {
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
@@ -362,11 +372,10 @@ impl BlockSegmentPostings {
};
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
let skip_reader =
match skip_data_opt {
Some(skip_data) => SkipReader::new(skip_data, record_option),
None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option)
};
let skip_reader = match skip_data_opt {
Some(skip_data) => SkipReader::new(skip_data, record_option),
None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option),
};
let doc_freq = doc_freq as usize;
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
BlockSegmentPostings {
@@ -450,7 +459,6 @@ impl BlockSegmentPostings {
self.doc_decoder.output_len
}
/// position on a block that may contains `doc_id`.
/// Always advance the current block.
///
@@ -461,9 +469,7 @@ impl BlockSegmentPostings {
/// Returns false iff all of the document remaining are smaller than
/// `doc_id`. In that case, all of these document are consumed.
///
pub fn skip_to(&mut self,
target_doc: DocId) -> BlockSegmentPostingsSkipResult {
pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
let mut skip_freqs = 0u32;
while self.skip_reader.advance() {
if self.skip_reader.doc() >= target_doc {
@@ -472,11 +478,11 @@ impl BlockSegmentPostings {
//
// We found our block!
let num_bits = self.skip_reader.doc_num_bits();
let num_consumed_bytes = self.doc_decoder
.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
num_bits);
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
num_bits,
);
self.remaining_data.advance(num_consumed_bytes);
let tf_num_bits = self.skip_reader.tf_num_bits();
match self.freq_reading_option {
@@ -486,9 +492,9 @@ impl BlockSegmentPostings {
self.remaining_data.advance(num_bytes_to_skip);
}
FreqReadingOption::ReadFreq => {
let num_consumed_bytes = self.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref(),
tf_num_bits);
let num_consumed_bytes = self
.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
self.remaining_data.advance(num_consumed_bytes);
}
}
@@ -518,7 +524,8 @@ impl BlockSegmentPostings {
}
}
self.num_vint_docs = 0;
return self.docs()
return self
.docs()
.last()
.map(|last_doc| {
if *last_doc >= target_doc {
@@ -538,11 +545,11 @@ impl BlockSegmentPostings {
pub fn advance(&mut self) -> bool {
if self.skip_reader.advance() {
let num_bits = self.skip_reader.doc_num_bits();
let num_consumed_bytes = self.doc_decoder
.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
num_bits);
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
num_bits,
);
self.remaining_data.advance(num_consumed_bytes);
let tf_num_bits = self.skip_reader.tf_num_bits();
match self.freq_reading_option {
@@ -552,9 +559,9 @@ impl BlockSegmentPostings {
self.remaining_data.advance(num_bytes_to_skip);
}
FreqReadingOption::ReadFreq => {
let num_consumed_bytes = self.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref(),
tf_num_bits);
let num_consumed_bytes = self
.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
self.remaining_data.advance(num_consumed_bytes);
}
}
@@ -594,7 +601,6 @@ impl BlockSegmentPostings {
doc_offset: 0,
doc_freq: 0,
remaining_data: OwnedRead::new(vec![]),
skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
}
@@ -616,7 +622,9 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
use super::SegmentPostings;
use common::HasLen;
use core::Index;
@@ -626,9 +634,7 @@ mod tests {
use schema::SchemaBuilder;
use schema::Term;
use schema::INT_INDEXED;
use super::BlockSegmentPostingsSkipResult;
use DocId;
use super::search_within_block;
#[test]
fn test_empty_segment_postings() {
@@ -645,7 +651,6 @@ mod tests {
assert_eq!(postings.doc_freq(), 0);
}
fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
block
.iter()
@@ -653,11 +658,15 @@ mod tests {
.enumerate()
.filter(|&(_, ref val)| *val >= target)
.next()
.unwrap().0
.unwrap()
.0
}
fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!(search_within_block(block, target), search_within_block_trivial_but_slow(block, target));
assert_eq!(
search_within_block(block, target),
search_within_block_trivial_but_slow(block, target)
);
}
fn util_test_search_within_block_all(block: &[u32]) {
@@ -677,7 +686,7 @@ mod tests {
#[test]
fn test_search_within_block() {
for len in 1u32..128u32 {
let v: Vec<u32> = (0..len).map(|i| i*2).collect();
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
util_test_search_within_block_all(&v[..]);
}
}
@@ -726,14 +735,22 @@ mod tests {
fn test_block_segment_postings_skip() {
for i in 0..4 {
let mut block_postings = build_block_postings(vec![3]);
assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Success(0u32));
assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Terminated);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Terminated
);
}
let mut block_postings = build_block_postings(vec![3]);
assert_eq!(block_postings.skip_to(4u32), BlockSegmentPostingsSkipResult::Terminated);
assert_eq!(
block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated
);
}
#[test]
fn test_block_segment_postings_skip2() {
let mut docs = vec![0];
@@ -741,14 +758,23 @@ mod tests {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(docs.clone());
for i in vec![0, 424, 10000] {
assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Success(0u32));
for i in vec![0, 424, 10000] {
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
);
let docs = block_postings.docs();
assert!(docs[0] <= i);
assert!(docs.last().cloned().unwrap_or(0u32) >= i);
}
assert_eq!(block_postings.skip_to(100_000), BlockSegmentPostingsSkipResult::Terminated);
assert_eq!(block_postings.skip_to(101_000), BlockSegmentPostingsSkipResult::Terminated);
assert_eq!(
block_postings.skip_to(100_000),
BlockSegmentPostingsSkipResult::Terminated
);
assert_eq!(
block_postings.skip_to(101_000),
BlockSegmentPostingsSkipResult::Terminated
);
}
#[test]

View File

@@ -1,18 +1,18 @@
use super::TermInfo;
use common::{VInt, BinarySerializable};
use common::{BinarySerializable, VInt};
use common::{CompositeWrite, CountingWriter};
use postings::compression::{VIntEncoder, BlockEncoder, COMPRESSION_BLOCK_SIZE};
use core::Segment;
use directory::WritePtr;
use positions::PositionSerializer;
use postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
use postings::skip::SkipSerializer;
use postings::USE_SKIP_INFO_LIMIT;
use schema::Schema;
use schema::{Field, FieldEntry, FieldType};
use std::io::{self, Write};
use termdict::{TermDictionaryBuilder, TermOrdinal};
use DocId;
use Result;
use postings::USE_SKIP_INFO_LIMIT;
use postings::skip::SkipSerializer;
use positions::PositionSerializer;
/// `PostingsSerializer` is in charge of serializing
/// postings on disk, in the
@@ -104,7 +104,7 @@ impl InvertedIndexSerializer {
term_dictionary_write,
postings_write,
positions_write,
positionsidx_write
positionsidx_write,
)
}
@@ -135,7 +135,7 @@ impl<'a> FieldSerializer<'a> {
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>,
positionsidx_write: &'a mut CountingWriter<WritePtr>
positionsidx_write: &'a mut CountingWriter<WritePtr>,
) -> io::Result<FieldSerializer<'a>> {
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
FieldType::Str(ref text_options) => {
@@ -153,7 +153,8 @@ impl<'a> FieldSerializer<'a> {
};
let term_dictionary_builder =
TermDictionaryBuilder::new(term_dictionary_write, field_type)?;
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
let postings_serializer =
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
let positions_serializer_opt = if position_enabled {
Some(PositionSerializer::new(positions_write, positionsidx_write))
} else {
@@ -171,14 +172,15 @@ impl<'a> FieldSerializer<'a> {
}
fn current_term_info(&self) -> TermInfo {
let positions_idx = self.positions_serializer_opt
let positions_idx = self
.positions_serializer_opt
.as_ref()
.map(|positions_serializer| positions_serializer.positions_idx())
.unwrap_or(0u64);
TermInfo {
doc_freq: 0,
postings_offset: self.postings_serializer.addr(),
positions_idx
positions_idx,
}
}
@@ -253,7 +255,7 @@ impl<'a> FieldSerializer<'a> {
struct Block {
doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
len: usize
len: usize,
}
impl Block {
@@ -261,7 +263,7 @@ impl Block {
Block {
doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
len: 0
len: 0,
}
}
@@ -312,9 +314,12 @@ pub struct PostingsSerializer<W: Write> {
termfreq_sum_enabled: bool,
}
impl<W: Write> PostingsSerializer<W> {
pub fn new(write: W, termfreq_enabled: bool, termfreq_sum_enabled: bool) -> PostingsSerializer<W> {
pub fn new(
write: W,
termfreq_enabled: bool,
termfreq_sum_enabled: bool,
) -> PostingsSerializer<W> {
PostingsSerializer {
output_write: CountingWriter::wrap(write),
@@ -337,14 +342,16 @@ impl<W: Write> PostingsSerializer<W> {
.block_encoder
.compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
self.last_doc_id_encoded = self.block.last_doc();
self.skip_write.write_doc(self.last_doc_id_encoded, num_bits);
self.skip_write
.write_doc(self.last_doc_id_encoded, num_bits);
// last el block 0, offset block 1,
self.postings_write.extend(block_encoded);
}
if self.termfreq_enabled {
// encode the term_freqs
let (num_bits, block_encoded): (u8, &[u8]) =
self.block_encoder.compress_block_unsorted(&self.block.term_freqs());
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_unsorted(&self.block.term_freqs());
self.postings_write.extend(block_encoded);
self.skip_write.write_term_freq(num_bits);
if self.termfreq_sum_enabled {
@@ -375,13 +382,15 @@ impl<W: Write> PostingsSerializer<W> {
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self.block_encoder
let block_encoded = self
.block_encoder
.compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
}
// ... Idem for term frequencies
if self.termfreq_enabled {
let block_encoded = self.block_encoder
let block_encoded = self
.block_encoder
.compress_vint_unsorted(self.block.term_freqs());
self.postings_write.write_all(block_encoded)?;
}
@@ -392,7 +401,6 @@ impl<W: Write> PostingsSerializer<W> {
VInt(skip_data.len() as u64).serialize(&mut self.output_write)?;
self.output_write.write_all(skip_data)?;
self.output_write.write_all(&self.postings_write[..])?;
} else {
self.output_write.write_all(&self.postings_write[..])?;
}

View File

@@ -1,8 +1,8 @@
use DocId;
use common::BinarySerializable;
use owned_read::OwnedRead;
use postings::compression::COMPRESSION_BLOCK_SIZE;
use schema::IndexRecordOption;
use DocId;
pub struct SkipSerializer {
buffer: Vec<u8>,
@@ -18,8 +18,11 @@ impl SkipSerializer {
}
pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) {
assert!(last_doc > self.prev_doc, "write_doc(...) called with non-increasing doc ids. \
Did you forget to call clear maybe?");
assert!(
last_doc > self.prev_doc,
"write_doc(...) called with non-increasing doc ids. \
Did you forget to call clear maybe?"
);
let delta_doc = last_doc - self.prev_doc;
self.prev_doc = last_doc;
delta_doc.serialize(&mut self.buffer).unwrap();
@@ -30,9 +33,10 @@ impl SkipSerializer {
self.buffer.push(tf_num_bits);
}
pub fn write_total_term_freq(&mut self, tf_sum: u32) {
tf_sum.serialize(&mut self.buffer).expect("Should never fail");
tf_sum
.serialize(&mut self.buffer)
.expect("Should never fail");
}
pub fn data(&self) -> &[u8] {
@@ -103,33 +107,32 @@ impl SkipReader {
} else {
let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted");
self.doc += doc_delta as DocId;
self.doc_num_bits = self.owned_read.get(0);
self.doc_num_bits = self.owned_read.get(0);
match self.skip_info {
IndexRecordOption::Basic => {
self.owned_read.advance(1);
}
IndexRecordOption::WithFreqs=> {
IndexRecordOption::WithFreqs => {
self.tf_num_bits = self.owned_read.get(1);
self.owned_read.advance(2);
}
IndexRecordOption::WithFreqsAndPositions => {
self.tf_num_bits = self.owned_read.get(1);
self.owned_read.advance(2);
self.tf_sum = u32::deserialize(&mut self.owned_read)
.expect("Failed reading tf_sum");
self.tf_sum =
u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum");
}
}
true
}
}
}
#[cfg(test)]
mod tests {
use super::{SkipReader, SkipSerializer};
use super::IndexRecordOption;
use super::{SkipReader, SkipSerializer};
use owned_read::OwnedRead;
#[test]
@@ -171,4 +174,4 @@ mod tests {
assert_eq!(skip_reader.doc_num_bits(), 5u8);
assert!(!skip_reader.advance());
}
}
}

View File

@@ -5,8 +5,8 @@ use query::TermQuery;
use query::Weight;
use schema::IndexRecordOption;
use schema::Term;
use Result;
use std::collections::BTreeSet;
use Result;
use Searcher;
/// The boolean query combines a set of queries
@@ -41,9 +41,9 @@ impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
}
impl Query for BooleanQuery {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
let sub_weights = self.subqueries
let sub_weights = self
.subqueries
.iter()
.map(|&(ref occur, ref subquery)| {
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))

View File

@@ -1,11 +1,11 @@
use super::Scorer;
use DocSet;
use Score;
use DocId;
use query::Query;
use Result;
use Searcher;
use query::Weight;
use DocId;
use DocSet;
use Result;
use Score;
use Searcher;
use SegmentReader;
/// `EmptyQuery` is a dummy `Query` in which no document matches.

View File

@@ -3,11 +3,11 @@ Query
*/
mod all_query;
mod empty_query;
mod automaton_weight;
mod bitset;
mod bm25;
mod boolean_query;
mod empty_query;
mod exclude;
mod fuzzy_query;
mod intersection;
@@ -34,10 +34,10 @@ pub use self::union::Union;
pub use self::vec_docset::VecDocSet;
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
pub use self::empty_query::{EmptyQuery, EmptyWeight, EmptyScorer};
pub use self::automaton_weight::AutomatonWeight;
pub use self::bitset::BitSetDocSet;
pub use self::boolean_query::BooleanQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;

View File

@@ -46,4 +46,4 @@ pub fn compose_occur(left: Occur, right: Occur) -> Occur {
}
}
}
}
}

View File

@@ -5,8 +5,8 @@ use query::bm25::BM25Weight;
use query::Query;
use query::Weight;
use schema::{Field, Term};
use Result;
use std::collections::BTreeSet;
use Result;
/// `PhraseQuery` matches a specific sequence of words.
///

View File

@@ -124,7 +124,8 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
fieldnorm_reader: FieldNormReader,
score_needed: bool,
) -> PhraseScorer<TPostings> {
let max_offset = term_postings.iter()
let max_offset = term_postings
.iter()
.map(|&(offset, _)| offset)
.max()
.unwrap_or(0);

View File

@@ -30,7 +30,6 @@ impl PhraseWeight {
}
impl Weight for PhraseWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
let similarity_weight = self.similarity_weight.clone();
let field = self.phrase_terms[0].1.field();

View File

@@ -2,10 +2,10 @@ use super::Weight;
use collector::Collector;
use core::searcher::Searcher;
use downcast;
use std::collections::BTreeSet;
use std::fmt;
use Result;
use SegmentLocalId;
use std::collections::BTreeSet;
use Term;
/// The `Query` trait defines a set of documents and a scoring method

View File

@@ -1,8 +1,8 @@
use super::user_input_ast::*;
use combine::char::*;
use combine::*;
use combine::stream::StreamErrorFor;
use combine::error::StreamError;
use combine::stream::StreamErrorFor;
use combine::*;
use query::occur::Occur;
use query::query_parser::user_input_ast::UserInputBound;
@@ -123,7 +123,8 @@ parser! {
}
enum BinaryOperand {
Or, And
Or,
And,
}
parser! {
@@ -138,19 +139,16 @@ parser! {
}
}
enum Element {
SingleEl(UserInputAST),
NormalDisjunctive(Vec<Vec<UserInputAST>>)
NormalDisjunctive(Vec<Vec<UserInputAST>>),
}
impl Element {
pub fn into_dnf(self) -> Vec<Vec<UserInputAST>> {
match self {
Element::NormalDisjunctive(conjunctions) =>
conjunctions,
Element::SingleEl(el) =>
vec!(vec!(el)),
Element::NormalDisjunctive(conjunctions) => conjunctions,
Element::SingleEl(el) => vec![vec![el]],
}
}
}
@@ -227,10 +225,12 @@ mod test {
assert!(parse_to_ast().parse(query).is_err());
}
#[test]
fn test_parse_query_to_ast_not_op() {
assert_eq!(format!("{:?}", parse_to_ast().parse("NOT")), "Err(UnexpectedParse)");
assert_eq!(
format!("{:?}", parse_to_ast().parse("NOT")),
"Err(UnexpectedParse)"
);
test_parse_query_to_ast_helper("NOTa", "\"NOTa\"");
test_parse_query_to_ast_helper("NOT a", "-(\"a\")");
}
@@ -241,10 +241,22 @@ mod test {
test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))");
test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))");
test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))");
assert_eq!(format!("{:?}", parse_to_ast().parse("a OR b aaa")), "Err(UnexpectedParse)");
assert_eq!(format!("{:?}", parse_to_ast().parse("a AND b aaa")), "Err(UnexpectedParse)");
assert_eq!(format!("{:?}", parse_to_ast().parse("aaa a OR b ")), "Err(UnexpectedParse)");
assert_eq!(format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")), "Err(UnexpectedParse)");
assert_eq!(
format!("{:?}", parse_to_ast().parse("a OR b aaa")),
"Err(UnexpectedParse)"
);
assert_eq!(
format!("{:?}", parse_to_ast().parse("a AND b aaa")),
"Err(UnexpectedParse)"
);
assert_eq!(
format!("{:?}", parse_to_ast().parse("aaa a OR b ")),
"Err(UnexpectedParse)"
);
assert_eq!(
format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")),
"Err(UnexpectedParse)"
);
}
#[test]

View File

@@ -1,11 +1,14 @@
use super::logical_ast::*;
use super::query_grammar::parse_to_ast;
use super::user_input_ast::*;
use combine::Parser;
use core::Index;
use query::occur::compose_occur;
use query::query_parser::logical_ast::LogicalAST;
use query::AllQuery;
use query::BooleanQuery;
use query::EmptyQuery;
use query::Occur;
use query::occur::compose_occur;
use query::PhraseQuery;
use query::Query;
use query::RangeQuery;
@@ -18,10 +21,6 @@ use std::num::ParseIntError;
use std::ops::Bound;
use std::str::FromStr;
use tokenizer::TokenizerManager;
use combine::Parser;
use query::EmptyQuery;
use query::query_parser::logical_ast::LogicalAST;
/// Possible error that may happen when parsing a query.
#[derive(Debug, PartialEq, Eq)]
@@ -59,23 +58,24 @@ impl From<ParseIntError> for QueryParserError {
}
}
/// Recursively remove empty clause from the AST
///
/// Returns `None` iff the `logical_ast` ended up being empty.
fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
match logical_ast {
LogicalAST::Clause(children) => {
let trimmed_children = children.into_iter()
.flat_map(|(occur, child)|
trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) )
let trimmed_children = children
.into_iter()
.flat_map(|(occur, child)| {
trim_ast(child).map(|trimmed_child| (occur, trimmed_child))
})
.collect::<Vec<_>>();
if trimmed_children.is_empty() {
None
} else {
Some(LogicalAST::Clause(trimmed_children))
}
},
}
_ => Some(logical_ast),
}
}
@@ -188,8 +188,9 @@ impl QueryParser {
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
let (user_input_ast, _remaining) =
parse_to_ast().parse(query).map_err(|_| QueryParserError::SyntaxError)?;
let (user_input_ast, _remaining) = parse_to_ast()
.parse(query)
.map_err(|_| QueryParserError::SyntaxError)?;
self.compute_logical_ast(user_input_ast)
}
@@ -291,12 +292,9 @@ impl QueryParser {
) -> Result<Option<LogicalLiteral>, QueryParserError> {
let terms = self.compute_terms_for_string(field, phrase)?;
match &terms[..] {
[] =>
Ok(None),
[(_, term)] =>
Ok(Some(LogicalLiteral::Term(term.clone()))),
_ =>
Ok(Some(LogicalLiteral::Phrase(terms.clone()))),
[] => Ok(None),
[(_, term)] => Ok(Some(LogicalLiteral::Term(term.clone()))),
_ => Ok(Some(LogicalLiteral::Phrase(terms.clone()))),
}
}
@@ -308,7 +306,11 @@ impl QueryParser {
}
}
fn resolve_bound(&self, field: Field, bound: &UserInputBound) -> Result<Bound<Term>, QueryParserError> {
fn resolve_bound(
&self,
field: Field,
bound: &UserInputBound,
) -> Result<Bound<Term>, QueryParserError> {
if bound.term_str() == "*" {
return Ok(Bound::Unbounded);
}
@@ -355,18 +357,21 @@ impl QueryParser {
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
}
UserInputAST::Unary(left_occur, subquery) => {
let (right_occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
let (right_occur, logical_sub_queries) =
self.compute_logical_ast_with_occur(*subquery)?;
Ok((compose_occur(left_occur, right_occur), logical_sub_queries))
}
UserInputAST::Leaf(leaf) => {
UserInputAST::Leaf(leaf) => {
let result_ast = self.compute_logical_ast_from_leaf(*leaf)?;
Ok((Occur::Should, result_ast))
}
}
}
fn compute_logical_ast_from_leaf(&self, leaf: UserInputLeaf) -> Result<LogicalAST, QueryParserError> {
fn compute_logical_ast_from_leaf(
&self,
leaf: UserInputLeaf,
) -> Result<LogicalAST, QueryParserError> {
match leaf {
UserInputLeaf::Literal(literal) => {
let term_phrases: Vec<(Field, String)> = match literal.field_name {
@@ -391,21 +396,19 @@ impl QueryParser {
asts.push(LogicalAST::Leaf(Box::new(ast)));
}
}
let result_ast: LogicalAST =
if asts.len() == 1 {
asts.into_iter().next().unwrap()
} else {
LogicalAST::Clause(
asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
let result_ast: LogicalAST = if asts.len() == 1 {
asts.into_iter().next().unwrap()
} else {
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
};
Ok(result_ast)
}
UserInputLeaf::All => {
Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All)))
}
UserInputLeaf::Range { field, lower, upper } => {
UserInputLeaf::All => Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All))),
UserInputLeaf::Range {
field,
lower,
upper,
} => {
let fields = self.resolved_fields(&field)?;
let mut clauses = fields
.iter()
@@ -433,14 +436,15 @@ impl QueryParser {
Ok(result_ast)
}
}
}
}
fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
match logical_literal {
LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
LogicalLiteral::Phrase(term_with_offsets) => Box::new(PhraseQuery::new_with_offset(term_with_offsets)),
LogicalLiteral::Phrase(term_with_offsets) => {
Box::new(PhraseQuery::new_with_offset(term_with_offsets))
}
LogicalLiteral::Range {
field,
value_type,
@@ -458,11 +462,16 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming");
assert!(
!occur_subqueries.is_empty(),
"Should not be empty after trimming"
);
Box::new(BooleanQuery::from(occur_subqueries))
},
Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal),
None => Box::new(EmptyQuery)
}
Some(LogicalAST::Leaf(trimmed_logical_literal)) => {
convert_literal_to_query(*trimmed_logical_literal)
}
None => Box::new(EmptyQuery),
}
}
@@ -475,7 +484,7 @@ mod test {
use schema::Field;
use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager};
use tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, Tokenizer, TokenizerManager};
use Index;
fn make_query_parser() -> QueryParser {
@@ -498,9 +507,11 @@ mod test {
let schema = schema_builder.build();
let default_fields = vec![title, text];
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register("en_with_stop_words", SimpleTokenizer
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec!["the".to_string()]))
tokenizer_manager.register(
"en_with_stop_words",
SimpleTokenizer
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec!["the".to_string()])),
);
QueryParser::new(schema, default_fields, tokenizer_manager)
}
@@ -571,16 +582,8 @@ mod test {
#[test]
pub fn test_parse_query_empty() {
test_parse_query_to_logical_ast_helper(
"",
"<emptyclause>",
false,
);
test_parse_query_to_logical_ast_helper(
" ",
"<emptyclause>",
false,
);
test_parse_query_to_logical_ast_helper("", "<emptyclause>", false);
test_parse_query_to_logical_ast_helper(" ", "<emptyclause>", false);
let query_parser = make_query_parser();
let query_result = query_parser.parse_query("");
let query = query_result.unwrap();
@@ -693,11 +696,7 @@ mod test {
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO Unbounded)",
false,
);
test_parse_query_to_logical_ast_helper(
"*",
"*",
false,
);
test_parse_query_to_logical_ast_helper("*", "*", false);
}
#[test]

View File

@@ -16,9 +16,7 @@ pub enum UserInputLeaf {
impl Debug for UserInputLeaf {
fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> {
match self {
UserInputLeaf::Literal(literal) => {
literal.fmt(formatter)
}
UserInputLeaf::Literal(literal) => literal.fmt(formatter),
UserInputLeaf::Range {
ref field,
ref lower,
@@ -82,13 +80,12 @@ impl UserInputBound {
pub enum UserInputAST {
Clause(Vec<UserInputAST>),
Unary(Occur, Box<UserInputAST>),
// Not(Box<UserInputAST>),
// Should(Box<UserInputAST>),
// Must(Box<UserInputAST>),
// Not(Box<UserInputAST>),
// Should(Box<UserInputAST>),
// Must(Box<UserInputAST>),
Leaf(Box<UserInputLeaf>),
}
impl UserInputAST {
pub fn unary(self, occur: Occur) -> UserInputAST {
UserInputAST::Unary(occur, Box::new(self))
@@ -100,12 +97,10 @@ impl UserInputAST {
if asts.len() == 1 {
asts.into_iter().next().unwrap() //< safe
} else {
UserInputAST::Clause(asts
.into_iter()
.map(|ast: UserInputAST|
ast.unary(occur)
)
.collect::<Vec<_>>()
UserInputAST::Clause(
asts.into_iter()
.map(|ast: UserInputAST| ast.unary(occur))
.collect::<Vec<_>>(),
)
}
}
@@ -117,11 +112,8 @@ impl UserInputAST {
pub fn or(asts: Vec<UserInputAST>) -> UserInputAST {
UserInputAST::compose(Occur::Should, asts)
}
}
/*
impl UserInputAST {

View File

@@ -274,7 +274,6 @@ impl RangeWeight {
}
impl Weight for RangeWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
@@ -370,9 +369,7 @@ mod tests {
let searcher = index.searcher();
let count_multiples = |range_query: RangeQuery| {
let mut count_collector = CountCollector::default();
range_query
.search(&searcher, &mut count_collector)
.unwrap();
range_query.search(&searcher, &mut count_collector).unwrap();
count_collector.count()
};

View File

@@ -50,7 +50,6 @@ impl Scorer for Box<Scorer> {
}
}
/// Wraps a `DocSet` and simply returns a constant `Scorer`.
/// The `ConstScorer` is useful if you have a `DocSet` where
/// you needed a scorer.

View File

@@ -3,10 +3,10 @@ use query::bm25::BM25Weight;
use query::Query;
use query::Weight;
use schema::IndexRecordOption;
use std::collections::BTreeSet;
use Result;
use Searcher;
use Term;
use std::collections::BTreeSet;
/// A Term query matches all of the documents
/// containing a specific term.

View File

@@ -444,7 +444,10 @@ mod tests {
)
.unwrap();
assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
assert_eq!(doc.get_first(author_field).unwrap().text(), Some("fulmicoton"));
assert_eq!(
doc.get_first(author_field).unwrap().text(),
Some("fulmicoton")
);
assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
}

View File

@@ -1,14 +1,14 @@
use htmlescape::encode_minimal;
use std::collections::BTreeMap;
use tokenizer::{Token, TokenStream};
use Result;
use query::Query;
use Searcher;
use schema::Field;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use tokenizer::BoxedTokenizer;
use tokenizer::{Token, TokenStream};
use Document;
use std::cmp::Ordering;
use Result;
use Searcher;
const DEFAULT_MAX_NUM_CHARS: usize = 150;
@@ -75,11 +75,10 @@ const HIGHLIGHTEN_PREFIX: &str = "<b>";
const HIGHLIGHTEN_POSTFIX: &str = "</b>";
impl Snippet {
pub fn empty() -> Snippet {
Snippet {
fragments: String::new(),
highlighted: Vec::new()
highlighted: Vec::new(),
}
}
@@ -157,16 +156,17 @@ fn select_best_fragment_combination<'a>(
fragments: Vec<FragmentCandidate>,
text: &'a str,
) -> Snippet {
let best_fragment_opt = fragments
.iter()
.max_by(|left, right| {
let cmp_score = left.score.partial_cmp(&right.score).unwrap_or(Ordering::Equal);
if cmp_score == Ordering::Equal {
(right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset))
} else {
cmp_score
}
});
let best_fragment_opt = fragments.iter().max_by(|left, right| {
let cmp_score = left
.score
.partial_cmp(&right.score)
.unwrap_or(Ordering::Equal);
if cmp_score == Ordering::Equal {
(right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset))
} else {
cmp_score
}
});
if let Some(fragment) = best_fragment_opt {
let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
let highlighted = fragment
@@ -177,7 +177,8 @@ fn select_best_fragment_combination<'a>(
item.start - fragment.start_offset,
item.stop - fragment.start_offset,
)
}).collect();
})
.collect();
Snippet {
fragments: fragment_text.to_string(),
highlighted: highlighted,
@@ -239,17 +240,16 @@ pub struct SnippetGenerator {
terms_text: BTreeMap<String, f32>,
tokenizer: Box<BoxedTokenizer>,
field: Field,
max_num_chars: usize
max_num_chars: usize,
}
impl SnippetGenerator {
/// Creates a new snippet generator
pub fn new(searcher: &Searcher,
query: &Query,
field: Field) -> Result<SnippetGenerator> {
pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
let mut terms = BTreeSet::new();
query.query_terms(&mut terms);
let terms_text: BTreeMap<String, f32> = terms.into_iter()
let terms_text: BTreeMap<String, f32> = terms
.into_iter()
.filter(|term| term.field() == field)
.map(|term| (term.text().to_string(), 1f32))
.collect();
@@ -258,7 +258,7 @@ impl SnippetGenerator {
terms_text,
tokenizer,
field,
max_num_chars: DEFAULT_MAX_NUM_CHARS
max_num_chars: DEFAULT_MAX_NUM_CHARS,
})
}
@@ -272,7 +272,8 @@ impl SnippetGenerator {
/// This method extract the text associated to the `SnippetGenerator`'s field
/// and computes a snippet.
pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
let text: String = doc.get_all(self.field)
let text: String = doc
.get_all(self.field)
.into_iter()
.flat_map(|val| val.text())
.collect::<Vec<&str>>()
@@ -282,10 +283,12 @@ impl SnippetGenerator {
/// Generates a snippet for the given text.
pub fn snippet(&self, text: &str) -> Snippet {
let fragment_candidates = search_fragments(&*self.tokenizer,
&text,
&self.terms_text,
self.max_num_chars);
let fragment_candidates = search_fragments(
&*self.tokenizer,
&text,
&self.terms_text,
self.max_num_chars,
);
select_best_fragment_combination(fragment_candidates, &text)
}
}
@@ -293,16 +296,16 @@ impl SnippetGenerator {
#[cfg(test)]
mod tests {
use super::{search_fragments, select_best_fragment_combination};
use query::QueryParser;
use schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions};
use std::collections::BTreeMap;
use std::iter::Iterator;
use tokenizer::{box_tokenizer, SimpleTokenizer};
use Index;
use schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing};
use SnippetGenerator;
use query::QueryParser;
const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by Mozilla which
const TEST_TEXT: &'static str =
r#"Rust is a systems programming language sponsored by Mozilla which
describes it as a "safe, concurrent, practical language", supporting functional and
imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],
but its designers intend it to provide better memory safety while still maintaining
@@ -431,7 +434,7 @@ Survey in 2016, 2017, and 2018."#;
let text = "a b c d";
let terms = BTreeMap::new();
let terms = BTreeMap::new();
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 0);
@@ -442,12 +445,12 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_generator() {
let mut schema_builder = SchemaBuilder::default ();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default()
let mut schema_builder = SchemaBuilder::default();
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("en_stem")
.set_index_option(IndexRecordOption::Basic)
);
.set_index_option(IndexRecordOption::Basic),
);
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -474,6 +477,5 @@ Survey in 2016, 2017, and 2018."#;
let snippet = snippet_generator.snippet(TEST_TEXT);
assert_eq!(snippet.to_html(), "<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to");
}
}
}

View File

@@ -109,7 +109,13 @@ pub mod tests {
let store = StoreReader::from_source(store_source);
for i in 0..1_000 {
assert_eq!(
*store.get(i).unwrap().get_first(field_title).unwrap().text().unwrap(),
*store
.get(i)
.unwrap()
.get_first(field_title)
.unwrap()
.text()
.unwrap(),
format!("Doc {}", i)
);
}

View File

@@ -72,7 +72,8 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
let mut skip_pointer = self.data_layer.insert(key, dest)?;
loop {
skip_pointer = match skip_pointer {
Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id)
Some((skip_doc_id, skip_offset)) => self
.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset)?,
None => {
return Ok(());

View File

@@ -59,7 +59,6 @@ impl TermInfoBlockMeta {
}
fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo {
let num_bits = self.num_bits() as usize;
let mut cursor = num_bits * inner_offset;

View File

@@ -164,7 +164,8 @@ impl TermDictionary {
let fst = self.fst_index.as_fst();
let mut node = fst.root();
while ord != 0 || !node.is_final() {
if let Some(transition) = node.transitions()
if let Some(transition) = node
.transitions()
.take_while(|transition| transition.out.value() <= ord)
.last()
{

View File

@@ -31,7 +31,6 @@ fn to_lowercase_unicode(text: &mut String, output: &mut String) {
}
}
impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
where
TailTokenStream: TokenStream,
@@ -50,7 +49,7 @@ where
// fast track for ascii.
self.token_mut().text.make_ascii_lowercase();
} else {
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
@@ -68,41 +67,43 @@ where
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
LowerCaserTokenStream {
tail,
buffer: String::with_capacity(100)
buffer: String::with_capacity(100),
}
}
}
#[cfg(test)]
mod tests {
use tokenizer::Tokenizer;
use tokenizer::LowerCaser;
use tokenizer::TokenStream;
use tokenizer::SimpleTokenizer;
use tokenizer::TokenStream;
use tokenizer::Tokenizer;
#[test]
fn test_to_lower_case() {
assert_eq!(lowercase_helper("Русский текст"),
vec!["русский".to_string(), "текст".to_string()]);
assert_eq!(
lowercase_helper("Русский текст"),
vec!["русский".to_string(), "текст".to_string()]
);
}
fn lowercase_helper(text: &str) -> Vec<String> {
let mut tokens = vec![];
let mut token_stream = SimpleTokenizer
.filter(LowerCaser)
.token_stream(text);
let mut token_stream = SimpleTokenizer.filter(LowerCaser).token_stream(text);
while token_stream.advance() {
let token_text = token_stream.token().text.clone();
tokens.push(token_text);
}
tokens
}
}
#[test]
fn test_lowercaser() {
assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
assert_eq!(
lowercase_helper("Русский"),
vec!["русский".to_string()]
);
}
}
}

View File

@@ -151,8 +151,8 @@ pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::Stemmer;
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub use self::tokenizer::BoxedTokenizer;
pub(crate) use self::tokenizer::box_tokenizer;
pub use self::tokenizer::BoxedTokenizer;
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
pub use self::tokenizer_manager::TokenizerManager;

View File

@@ -18,7 +18,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
offset_to: text.len(),
position: 0,
text: text.to_string(),
position_length: 1
position_length: 1,
};
RawTokenStream {
token,

View File

@@ -71,13 +71,16 @@ where
#[cfg(test)]
mod tests {
use super::POSITION_GAP;
use super::super::{SimpleTokenizer, TokenStream, Tokenizer};
use super::TokenStreamChain;
use super::super::{Tokenizer, TokenStream, SimpleTokenizer};
use super::POSITION_GAP;
#[test]
fn test_chain_first_emits_no_tokens() {
let token_streams = vec![SimpleTokenizer.token_stream(""), SimpleTokenizer.token_stream("hello world")];
let token_streams = vec![
SimpleTokenizer.token_stream(""),
SimpleTokenizer.token_stream("hello world"),
];
let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams);
assert!(token_chain.advance());
@@ -91,8 +94,8 @@ mod tests {
assert_eq!(token_chain.token().offset_from, 6);
assert_eq!(token_chain.token().offset_to, 11);
assert_eq!(token_chain.token().position, POSITION_GAP);
assert!(!token_chain.advance());
}
}
}

View File

@@ -276,7 +276,7 @@ mod test {
offset_from: 2,
offset_to: 3,
text: "abc".to_string(),
position_length: 1
position_length: 1,
};
let t2 = t1.clone();