mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-03 09:00:42 +00:00
Cargo Format (#420)
This commit is contained in:
@@ -10,7 +10,6 @@
|
||||
// - search for the best document matchings "sea whale"
|
||||
// - retrieve the best document original content.
|
||||
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
@@ -235,9 +234,7 @@ fn main() -> tantivy::Result<()> {
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
use tempdir::TempDir;
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
// In this example, we'll see how to define a tokenizer pipeline
|
||||
// by aligning a bunch of `TokenFilter`.
|
||||
|
||||
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopCollector;
|
||||
@@ -12,7 +11,6 @@ use tantivy::schema::*;
|
||||
use tantivy::tokenizer::NgramTokenizer;
|
||||
use tantivy::Index;
|
||||
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
//
|
||||
|
||||
@@ -11,10 +11,9 @@
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopCollector;
|
||||
use tantivy::query::TermQuery;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::query::TermQuery;
|
||||
|
||||
|
||||
// A simple helper function to fetch a single document
|
||||
// given its id from our index.
|
||||
@@ -31,7 +30,7 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Op
|
||||
let mut top_collector = TopCollector::with_limit(1);
|
||||
searcher.search(&term_query, &mut top_collector)?;
|
||||
|
||||
if let Some(doc_address) = top_collector.docs().first() {
|
||||
if let Some(doc_address) = top_collector.docs().first() {
|
||||
let doc = searcher.doc(doc_address)?;
|
||||
Ok(Some(doc))
|
||||
} else {
|
||||
@@ -41,7 +40,6 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Op
|
||||
}
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
|
||||
// # Defining the schema
|
||||
//
|
||||
// Check out the *basic_search* example if this makes
|
||||
@@ -126,7 +124,6 @@ fn main() -> tantivy::Result<()> {
|
||||
isbn => "978-9176370711",
|
||||
));
|
||||
|
||||
|
||||
// You are guaranteed that your clients will only observe your index in
|
||||
// the state it was in after a commit.
|
||||
// In this example, your search engine will at no point be missing the *Frankenstein* document.
|
||||
@@ -143,4 +140,4 @@ fn main() -> tantivy::Result<()> {
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,60 +22,60 @@ use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
let index_path = TempDir::new("tantivy_facet_example_dir")?;
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
let index_path = TempDir::new("tantivy_facet_example_dir")?;
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
schema_builder.add_text_field("name", TEXT | STORED);
|
||||
schema_builder.add_text_field("name", TEXT | STORED);
|
||||
|
||||
// this is our faceted field
|
||||
schema_builder.add_facet_field("tags");
|
||||
// this is our faceted field
|
||||
schema_builder.add_facet_field("tags");
|
||||
|
||||
let schema = schema_builder.build();
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_dir(&index_path, schema.clone())?;
|
||||
let index = Index::create_in_dir(&index_path, schema.clone())?;
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
let name = schema.get_field("name").unwrap();
|
||||
let tags = schema.get_field("tags").unwrap();
|
||||
let name = schema.get_field("name").unwrap();
|
||||
let tags = schema.get_field("tags").unwrap();
|
||||
|
||||
// For convenience, tantivy also comes with a macro to
|
||||
// reduce the boilerplate above.
|
||||
index_writer.add_document(doc!(
|
||||
// For convenience, tantivy also comes with a macro to
|
||||
// reduce the boilerplate above.
|
||||
index_writer.add_document(doc!(
|
||||
name => "the ditch",
|
||||
tags => Facet::from("/pools/north")
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
index_writer.add_document(doc!(
|
||||
name => "little stacey",
|
||||
tags => Facet::from("/pools/south")
|
||||
));
|
||||
|
||||
index_writer.commit()?;
|
||||
index_writer.commit()?;
|
||||
|
||||
index.load_searchers()?;
|
||||
index.load_searchers()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.searcher();
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field(tags);
|
||||
facet_collector.add_facet("/pools");
|
||||
let mut facet_collector = FacetCollector::for_field(tags);
|
||||
facet_collector.add_facet("/pools");
|
||||
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
|
||||
let counts = facet_collector.harvest();
|
||||
// This lists all of the facet counts
|
||||
let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect();
|
||||
assert_eq!(
|
||||
facets,
|
||||
vec![
|
||||
(&Facet::from("/pools/north"), 1),
|
||||
(&Facet::from("/pools/south"), 1)
|
||||
]
|
||||
);
|
||||
let counts = facet_collector.harvest();
|
||||
// This lists all of the facet counts
|
||||
let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect();
|
||||
assert_eq!(
|
||||
facets,
|
||||
vec![
|
||||
(&Facet::from("/pools/north"), 1),
|
||||
(&Facet::from("/pools/south"), 1),
|
||||
]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
Ok(())
|
||||
}
|
||||
|
||||
use tempdir::TempDir;
|
||||
|
||||
@@ -7,18 +7,15 @@
|
||||
// the list of documents containing a term, getting
|
||||
// its term frequency, and accessing its positions.
|
||||
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::{DocSet, DocId, Postings};
|
||||
use tantivy::{DocId, DocSet, Postings};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
|
||||
|
||||
// We first create a schema for the sake of the
|
||||
// example. Check the `basic_search` example for more information.
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
@@ -47,7 +44,6 @@ fn main() -> tantivy::Result<()> {
|
||||
// there is actually only one segment here, but let's iterate through the list
|
||||
// anyway)
|
||||
for segment_reader in searcher.segment_readers() {
|
||||
|
||||
// A segment contains different data structure.
|
||||
// Inverted index stands for the combination of
|
||||
// - the term dictionary
|
||||
@@ -58,19 +54,18 @@ fn main() -> tantivy::Result<()> {
|
||||
// Let's go through all docs containing the term `title:the` and access their position
|
||||
let term_the = Term::from_field_text(title, "the");
|
||||
|
||||
|
||||
// This segment posting object is like a cursor over the documents matching the term.
|
||||
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies
|
||||
// and positions.
|
||||
//
|
||||
// If you don't need all this information, you may get better performance by decompressing less
|
||||
// information.
|
||||
if let Some(mut segment_postings) = inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions) {
|
||||
|
||||
if let Some(mut segment_postings) =
|
||||
inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)
|
||||
{
|
||||
// this buffer will be used to request for positions
|
||||
let mut positions: Vec<u32> = Vec::with_capacity(100);
|
||||
while segment_postings.advance() {
|
||||
|
||||
// the number of time the term appears in the document.
|
||||
let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once.
|
||||
|
||||
@@ -98,7 +93,6 @@ fn main() -> tantivy::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// A `Term` is a text token associated with a field.
|
||||
// Let's go through all docs containing the term `title:the` and access their position
|
||||
let term_the = Term::from_field_text(title, "the");
|
||||
@@ -111,7 +105,6 @@ fn main() -> tantivy::Result<()> {
|
||||
// Also, for some VERY specific high performance use case like an OLAP analysis of logs,
|
||||
// you can get better performance by accessing directly the blocks of doc ids.
|
||||
for segment_reader in searcher.segment_readers() {
|
||||
|
||||
// A segment contains different data structure.
|
||||
// Inverted index stands for the combination of
|
||||
// - the term dictionary
|
||||
@@ -124,7 +117,9 @@ fn main() -> tantivy::Result<()> {
|
||||
//
|
||||
// If you don't need all this information, you may get better performance by decompressing less
|
||||
// information.
|
||||
if let Some(mut block_segment_postings) = inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic) {
|
||||
if let Some(mut block_segment_postings) =
|
||||
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
|
||||
{
|
||||
while block_segment_postings.advance() {
|
||||
// Once again these docs MAY contains deleted documents as well.
|
||||
let docs = block_segment_postings.docs();
|
||||
@@ -136,4 +131,3 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -66,6 +66,6 @@ fn main() -> tantivy::Result<()> {
|
||||
println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
|
||||
println!("snippet: {}", snippet.to_html());
|
||||
}
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -22,59 +22,59 @@ use tantivy::tokenizer::*;
|
||||
use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// this example assumes you understand the content in `basic_search`
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
// this example assumes you understand the content in `basic_search`
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
// This configures your custom options for how tantivy will
|
||||
// store and process your content in the index; The key
|
||||
// to note is that we are setting the tokenizer to `stoppy`
|
||||
// which will be defined and registered below.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
// This configures your custom options for how tantivy will
|
||||
// store and process your content in the index; The key
|
||||
// to note is that we are setting the tokenizer to `stoppy`
|
||||
// which will be defined and registered below.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
|
||||
// Our first field is title.
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
// Our first field is title.
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
|
||||
// Our second field is body.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("body", text_options);
|
||||
// Our second field is body.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("body", text_options);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
// This tokenizer lowers all of the text (to help with stop word matching)
|
||||
// then removes all instances of `the` and `and` from the corpus
|
||||
let tokenizer = SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
"the".to_string(),
|
||||
"and".to_string(),
|
||||
]));
|
||||
// This tokenizer lowers all of the text (to help with stop word matching)
|
||||
// then removes all instances of `the` and `and` from the corpus
|
||||
let tokenizer = SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
"the".to_string(),
|
||||
"and".to_string(),
|
||||
]));
|
||||
|
||||
index.tokenizers().register("stoppy", tokenizer);
|
||||
index.tokenizers().register("stoppy", tokenizer);
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
index_writer.add_document(doc!(
|
||||
title => "The Old Man and the Sea",
|
||||
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish."
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
@@ -86,7 +86,7 @@ fn main() -> tantivy::Result<()> {
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
@@ -94,28 +94,28 @@ fn main() -> tantivy::Result<()> {
|
||||
increasing confidence in the success of my undertaking."
|
||||
));
|
||||
|
||||
index_writer.commit()?;
|
||||
index_writer.commit()?;
|
||||
|
||||
index.load_searchers()?;
|
||||
index.load_searchers()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.searcher();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
// stop words are applied on the query as well.
|
||||
// The following will be equivalent to `title:frankenstein`
|
||||
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
|
||||
// stop words are applied on the query as well.
|
||||
// The following will be equivalent to `title:frankenstein`
|
||||
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
|
||||
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
let doc_addresses = top_collector.docs();
|
||||
let doc_addresses = top_collector.docs();
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -342,16 +342,19 @@ impl FacetCollector {
|
||||
pub fn harvest(mut self) -> FacetCounts {
|
||||
self.finalize_segment();
|
||||
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self
|
||||
.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_ords[..])
|
||||
.collect();
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self
|
||||
.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_counts[..])
|
||||
.collect();
|
||||
|
||||
let facet_streams = self.segment_counters
|
||||
let facet_streams = self
|
||||
.segment_counters
|
||||
.iter()
|
||||
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
|
||||
.collect::<Vec<_>>();
|
||||
@@ -402,7 +405,8 @@ impl Collector for FacetCollector {
|
||||
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let facet_reader: &mut FacetReader = unsafe {
|
||||
&mut *self.ff_reader
|
||||
&mut *self
|
||||
.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.get()
|
||||
@@ -476,9 +480,8 @@ impl FacetCounts {
|
||||
heap.push(Hit { count, facet });
|
||||
}
|
||||
|
||||
let mut lowest_count: u64 = heap.peek().map(|hit| hit.count)
|
||||
.unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value
|
||||
// is never used in that case.
|
||||
let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value
|
||||
// is never used in that case.
|
||||
|
||||
for (facet, count) in it {
|
||||
if count > lowest_count {
|
||||
@@ -619,7 +622,13 @@ mod tests {
|
||||
let doc = doc!(facet_field => facet);
|
||||
iter::repeat(doc).take(count)
|
||||
})
|
||||
.map(|mut doc| { doc.add_facet(facet_field, &format!("/facet/{}", thread_rng().sample(&uniform) )); doc})
|
||||
.map(|mut doc| {
|
||||
doc.add_facet(
|
||||
facet_field,
|
||||
&format!("/facet/{}", thread_rng().sample(&uniform)),
|
||||
);
|
||||
doc
|
||||
})
|
||||
.collect();
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use SegmentLocalId;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
|
||||
/// Contains a feature (field, score, etc.) of a document along with the document address.
|
||||
///
|
||||
@@ -139,9 +139,9 @@ impl<T: PartialOrd + Clone> TopCollector<T> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use DocId;
|
||||
use Score;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use super::Collector;
|
||||
use collector::top_collector::TopCollector;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::FastValue;
|
||||
use schema::Field;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentReader;
|
||||
use super::Collector;
|
||||
use schema::Field;
|
||||
|
||||
/// The Top Field Collector keeps track of the K documents
|
||||
/// sorted by a fast field in the index
|
||||
@@ -142,16 +142,16 @@ impl<T: FastValue + PartialOrd + Clone> Collector for TopFieldCollector<T> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use Index;
|
||||
use IndexWriter;
|
||||
use TantivyError;
|
||||
use super::*;
|
||||
use query::Query;
|
||||
use query::QueryParser;
|
||||
use schema::{FAST, SchemaBuilder, TEXT};
|
||||
use schema::Field;
|
||||
use schema::IntOptions;
|
||||
use schema::Schema;
|
||||
use super::*;
|
||||
use schema::{SchemaBuilder, FAST, TEXT};
|
||||
use Index;
|
||||
use IndexWriter;
|
||||
use TantivyError;
|
||||
|
||||
const TITLE: &str = "title";
|
||||
const SIZE: &str = "size";
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use super::Collector;
|
||||
use collector::top_collector::TopCollector;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
@@ -5,7 +6,6 @@ use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use super::Collector;
|
||||
|
||||
/// The Top Score Collector keeps track of the K documents
|
||||
/// sorted by their score.
|
||||
@@ -131,10 +131,10 @@ impl Collector for TopScoreCollector {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use collector::Collector;
|
||||
use DocId;
|
||||
use Score;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
|
||||
@@ -72,7 +72,8 @@ impl<W: Write> CompositeWrite<W> {
|
||||
let footer_offset = self.write.written_bytes();
|
||||
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
|
||||
|
||||
let mut offset_fields: Vec<_> = self.offsets
|
||||
let mut offset_fields: Vec<_> = self
|
||||
.offsets
|
||||
.iter()
|
||||
.map(|(file_addr, offset)| (*offset, *file_addr))
|
||||
.collect();
|
||||
|
||||
@@ -10,8 +10,6 @@ pub struct VInt(pub u64);
|
||||
const STOP_BIT: u8 = 128;
|
||||
|
||||
impl VInt {
|
||||
|
||||
|
||||
pub fn val(&self) -> u64 {
|
||||
self.0
|
||||
}
|
||||
@@ -20,14 +18,13 @@ impl VInt {
|
||||
VInt::deserialize(reader).map(|vint| vint.0)
|
||||
}
|
||||
|
||||
pub fn serialize_into_vec(&self, output: &mut Vec<u8>){
|
||||
pub fn serialize_into_vec(&self, output: &mut Vec<u8>) {
|
||||
let mut buffer = [0u8; 10];
|
||||
let num_bytes = self.serialize_into(&mut buffer);
|
||||
output.extend(&buffer[0..num_bytes]);
|
||||
}
|
||||
|
||||
fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
|
||||
|
||||
let mut remaining = self.0;
|
||||
for (i, b) in buffer.iter_mut().enumerate() {
|
||||
let next_byte: u8 = (remaining % 128u64) as u8;
|
||||
@@ -74,7 +71,6 @@ impl BinarySerializable for VInt {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -89,10 +85,10 @@ mod tests {
|
||||
}
|
||||
assert!(num_bytes > 0);
|
||||
if num_bytes < 10 {
|
||||
assert!(1u64 << (7*num_bytes) > val);
|
||||
assert!(1u64 << (7 * num_bytes) > val);
|
||||
}
|
||||
if num_bytes > 1 {
|
||||
assert!(1u64 << (7*(num_bytes-1)) <= val);
|
||||
assert!(1u64 << (7 * (num_bytes - 1)) <= val);
|
||||
}
|
||||
let serdeser_val = VInt::deserialize(&mut &v[..]).unwrap();
|
||||
assert_eq!(val, serdeser_val.0);
|
||||
@@ -105,11 +101,11 @@ mod tests {
|
||||
aux_test_vint(5);
|
||||
aux_test_vint(u64::max_value());
|
||||
for i in 1..9 {
|
||||
let power_of_128 = 1u64 << (7*i);
|
||||
let power_of_128 = 1u64 << (7 * i);
|
||||
aux_test_vint(power_of_128 - 1u64);
|
||||
aux_test_vint(power_of_128 );
|
||||
aux_test_vint(power_of_128);
|
||||
aux_test_vint(power_of_128 + 1u64);
|
||||
}
|
||||
aux_test_vint(10);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,36 +1,36 @@
|
||||
use core::SegmentId;
|
||||
use error::TantivyError;
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use Result;
|
||||
use indexer::LockType;
|
||||
use super::pool::LeasedItem;
|
||||
use super::pool::Pool;
|
||||
use super::segment::create_segment;
|
||||
use super::segment::Segment;
|
||||
use core::searcher::Searcher;
|
||||
use core::IndexMeta;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use core::META_FILEPATH;
|
||||
use directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use directory::{ManagedDirectory};
|
||||
use error::TantivyError;
|
||||
use indexer::index_writer::open_index_writer;
|
||||
use indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use indexer::LockType;
|
||||
use num_cpus;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::TokenizerManager;
|
||||
use IndexWriter;
|
||||
use schema::FieldType;
|
||||
use schema::Field;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use Result;
|
||||
|
||||
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
|
||||
let meta_data = directory.atomic_read(&META_FILEPATH)?;
|
||||
@@ -115,31 +115,24 @@ impl Index {
|
||||
&self.tokenizers
|
||||
}
|
||||
|
||||
|
||||
/// Helper to access the tokenizer associated to a specific field.
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let tokenizer_manager: &TokenizerManager = self.tokenizers();
|
||||
let tokenizer_name_opt: Option<Box<BoxedTokenizer>> =
|
||||
match field_type {
|
||||
FieldType::Str(text_options) => {
|
||||
text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
|
||||
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
|
||||
},
|
||||
_ => {
|
||||
None
|
||||
}
|
||||
};
|
||||
let tokenizer_name_opt: Option<Box<BoxedTokenizer>> = match field_type {
|
||||
FieldType::Str(text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
|
||||
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
|
||||
_ => None,
|
||||
};
|
||||
match tokenizer_name_opt {
|
||||
Some(tokenizer) => {
|
||||
Ok(tokenizer)
|
||||
}
|
||||
None => {
|
||||
Err(TantivyError:: SchemaError(format!("{:?} is not a text field.", field_entry.name())))
|
||||
}
|
||||
Some(tokenizer) => Ok(tokenizer),
|
||||
None => Err(TantivyError::SchemaError(format!(
|
||||
"{:?} is not a text field.",
|
||||
field_entry.name()
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -186,7 +179,6 @@ impl Index {
|
||||
num_threads: usize,
|
||||
overall_heap_size_in_bytes: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
|
||||
let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
open_index_writer(
|
||||
@@ -225,7 +217,8 @@ impl Index {
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
@@ -260,7 +253,8 @@ impl Index {
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
@@ -332,11 +326,10 @@ impl Clone for Index {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use schema::{SchemaBuilder, INT_INDEXED, TEXT};
|
||||
use Index;
|
||||
use schema::{SchemaBuilder, TEXT, INT_INDEXED};
|
||||
|
||||
#[test]
|
||||
fn test_indexer_for_field() {
|
||||
@@ -352,5 +345,4 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use common::BinarySerializable;
|
||||
use directory::ReadOnlySource;
|
||||
use owned_read::OwnedRead;
|
||||
use positions::PositionReader;
|
||||
use postings::TermInfo;
|
||||
use postings::{BlockSegmentPostings, SegmentPostings};
|
||||
use schema::FieldType;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use termdict::TermDictionary;
|
||||
use owned_read::OwnedRead;
|
||||
use positions::PositionReader;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
@@ -100,7 +100,6 @@ impl InvertedIndexReader {
|
||||
block_postings.reset(term_info.doc_freq, postings_reader);
|
||||
}
|
||||
|
||||
|
||||
/// Returns a block postings given a `Term`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
@@ -111,7 +110,7 @@ impl InvertedIndexReader {
|
||||
option: IndexRecordOption,
|
||||
) -> Option<BlockSegmentPostings> {
|
||||
self.get_term_info(term)
|
||||
.map(move|term_info| self.read_block_postings_from_terminfo(&term_info, option))
|
||||
.map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
/// Returns a block postings given a `term_info`.
|
||||
@@ -147,7 +146,8 @@ impl InvertedIndexReader {
|
||||
if option.has_positions() {
|
||||
let position_reader = self.positions_source.clone();
|
||||
let skip_reader = self.positions_idx_source.clone();
|
||||
let position_reader = PositionReader::new(position_reader, skip_reader, term_info.positions_idx);
|
||||
let position_reader =
|
||||
PositionReader::new(position_reader, skip_reader, term_info.positions_idx);
|
||||
Some(position_reader)
|
||||
} else {
|
||||
None
|
||||
|
||||
@@ -87,7 +87,8 @@ impl<T> Deref for LeasedItem<T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
&self.gen_item
|
||||
&self
|
||||
.gen_item
|
||||
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
@@ -96,7 +97,8 @@ impl<T> Deref for LeasedItem<T> {
|
||||
|
||||
impl<T> DerefMut for LeasedItem<T> {
|
||||
fn deref_mut(&mut self) -> &mut T {
|
||||
&mut self.gen_item
|
||||
&mut self
|
||||
.gen_item
|
||||
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
|
||||
@@ -9,8 +9,8 @@ use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use termdict::TermMerger;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
use Index;
|
||||
use Result;
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
@@ -25,7 +25,11 @@ pub struct Searcher {
|
||||
|
||||
impl Searcher {
|
||||
/// Creates a new `Searcher`
|
||||
pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
pub(crate) fn new(
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
) -> Searcher {
|
||||
Searcher {
|
||||
schema,
|
||||
index,
|
||||
@@ -87,7 +91,8 @@ impl Searcher {
|
||||
|
||||
/// Return the field searcher associated to a `Field`.
|
||||
pub fn field(&self, field: Field) -> FieldSearcher {
|
||||
let inv_index_readers = self.segment_readers
|
||||
let inv_index_readers = self
|
||||
.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.inverted_index(field))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -107,7 +112,8 @@ impl FieldSearcher {
|
||||
/// Returns a Stream over all of the sorted unique terms of
|
||||
/// for the given field.
|
||||
pub fn terms(&self) -> TermMerger {
|
||||
let term_streamers: Vec<_> = self.inv_index_readers
|
||||
let term_streamers: Vec<_> = self
|
||||
.inv_index_readers
|
||||
.iter()
|
||||
.map(|inverted_index| inverted_index.terms().stream())
|
||||
.collect();
|
||||
@@ -117,7 +123,8 @@ impl FieldSearcher {
|
||||
|
||||
impl fmt::Debug for Searcher {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let segment_ids = self.segment_readers
|
||||
let segment_ids = self
|
||||
.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.segment_id())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
@@ -157,11 +157,13 @@ impl SegmentReader {
|
||||
&FieldType::Bytes => {}
|
||||
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
|
||||
}
|
||||
let idx_reader = self.fast_fields_composite
|
||||
let idx_reader = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let values = self.fast_fields_composite
|
||||
let values = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
Ok(BytesFastFieldReader::open(idx_reader, values))
|
||||
@@ -285,7 +287,8 @@ impl SegmentReader {
|
||||
/// term dictionary associated to a specific field,
|
||||
/// and opening the posting list associated to any term.
|
||||
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
|
||||
if let Some(inv_idx_reader) = self.inv_idx_reader_cache
|
||||
if let Some(inv_idx_reader) = self
|
||||
.inv_idx_reader_cache
|
||||
.read()
|
||||
.expect("Lock poisoned. This should never happen")
|
||||
.get(&field)
|
||||
@@ -314,15 +317,18 @@ impl SegmentReader {
|
||||
|
||||
let postings_source = postings_source_opt.unwrap();
|
||||
|
||||
let termdict_source = self.termdict_composite
|
||||
let termdict_source = self
|
||||
.termdict_composite
|
||||
.open_read(field)
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
|
||||
let positions_source = self.positions_composite
|
||||
let positions_source = self
|
||||
.positions_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
let positions_idx_source = self.positions_idx_composite
|
||||
let positions_idx_source = self
|
||||
.positions_idx_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
@@ -435,7 +441,6 @@ mod test {
|
||||
use schema::{SchemaBuilder, Term, STORED, TEXT};
|
||||
use DocId;
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_alive_docs_iterator() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
|
||||
@@ -77,15 +77,15 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
|
||||
/// DirectoryClone
|
||||
pub trait DirectoryClone {
|
||||
/// Clones the directory and boxes the clone
|
||||
fn box_clone(&self) -> Box<Directory>;
|
||||
/// Clones the directory and boxes the clone
|
||||
fn box_clone(&self) -> Box<Directory>;
|
||||
}
|
||||
|
||||
impl<T> DirectoryClone for T
|
||||
where
|
||||
T: 'static + Directory + Clone,
|
||||
T: 'static + Directory + Clone,
|
||||
{
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ use core::MANAGED_FILEPATH;
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use error::TantivyError;
|
||||
use indexer::LockType;
|
||||
use serde_json;
|
||||
use std::collections::HashSet;
|
||||
use std::io;
|
||||
@@ -12,9 +13,6 @@ use std::sync::RwLockWriteGuard;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use Directory;
|
||||
use Result;
|
||||
use indexer::LockType;
|
||||
|
||||
|
||||
|
||||
/// Returns true iff the file is "managed".
|
||||
/// Non-managed file are not subject to garbage collection.
|
||||
@@ -108,7 +106,8 @@ impl ManagedDirectory {
|
||||
//
|
||||
// releasing the lock as .delete() will use it too.
|
||||
{
|
||||
let meta_informations_rlock = self.meta_informations
|
||||
let meta_informations_rlock = self
|
||||
.meta_informations
|
||||
.read()
|
||||
.expect("Managed directory rlock poisoned in garbage collect.");
|
||||
|
||||
@@ -157,7 +156,8 @@ impl ManagedDirectory {
|
||||
if !deleted_files.is_empty() {
|
||||
// update the list of managed files by removing
|
||||
// the file that were removed.
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
let mut meta_informations_wlock = self
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed directory wlock poisoned (2).");
|
||||
{
|
||||
@@ -186,9 +186,10 @@ impl ManagedDirectory {
|
||||
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
|
||||
// Files starting by "." (e.g. lock files) are not managed.
|
||||
if !is_managed(filepath) {
|
||||
return Ok(());
|
||||
return Ok(());
|
||||
}
|
||||
let mut meta_wlock = self.meta_informations
|
||||
let mut meta_wlock = self
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
|
||||
|
||||
@@ -32,7 +32,8 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
|
||||
}
|
||||
})?;
|
||||
|
||||
let meta_data = file.metadata()
|
||||
let meta_data = file
|
||||
.metadata()
|
||||
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
|
||||
if meta_data.len() == 0 {
|
||||
// if the file size is 0, it will not be possible
|
||||
@@ -309,7 +310,8 @@ impl Directory for MmapDirectory {
|
||||
// when the last reference is gone.
|
||||
mmap_cache.cache.remove(&full_path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self.sync_directory()
|
||||
Ok(_) => self
|
||||
.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
|
||||
@@ -170,7 +170,8 @@ impl Directory for RAMDirectory {
|
||||
let path_buf = PathBuf::from(path);
|
||||
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
|
||||
let exists = self.fs
|
||||
let exists = self
|
||||
.fs
|
||||
.write(path_buf.clone(), &Vec::new())
|
||||
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
|
||||
// force the creation of the file to mimic the MMap directory.
|
||||
@@ -195,9 +196,10 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
fail_point!("RAMDirectory::atomic_write", |msg| {
|
||||
Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string())))
|
||||
});
|
||||
fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
msg.unwrap_or("Undefined".to_string())
|
||||
)));
|
||||
let path_buf = PathBuf::from(path);
|
||||
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
self.fs.write(path_buf, &Vec::new())?;
|
||||
|
||||
@@ -5,7 +5,6 @@ use fst::raw::MmapReadOnly;
|
||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
use std::ops::Deref;
|
||||
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
/// These read objects are only in charge to deliver
|
||||
|
||||
16
src/error.rs
16
src/error.rs
@@ -4,9 +4,9 @@ use std::io;
|
||||
|
||||
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use fastfield::FastFieldNotAvailableError;
|
||||
use indexer::LockType;
|
||||
use query;
|
||||
use schema;
|
||||
use indexer::LockType;
|
||||
use serde_json;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
@@ -21,7 +21,10 @@ pub enum TantivyError {
|
||||
#[fail(display = "file already exists: '{:?}'", _0)]
|
||||
FileAlreadyExists(PathBuf),
|
||||
/// Failed to acquire file lock
|
||||
#[fail(display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.", _0)]
|
||||
#[fail(
|
||||
display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
|
||||
_0
|
||||
)]
|
||||
LockFailure(LockType),
|
||||
/// IO Error.
|
||||
#[fail(display = "an IO error occurred: '{}'", _0)]
|
||||
@@ -95,14 +98,13 @@ impl From<schema::DocParsingError> for TantivyError {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl From<OpenWriteError> for TantivyError {
|
||||
fn from(error: OpenWriteError) -> TantivyError {
|
||||
match error {
|
||||
OpenWriteError::FileAlreadyExists(filepath) =>
|
||||
TantivyError::FileAlreadyExists(filepath),
|
||||
OpenWriteError::IOError(io_error) =>
|
||||
TantivyError::IOError(io_error),
|
||||
OpenWriteError::FileAlreadyExists(filepath) => {
|
||||
TantivyError::FileAlreadyExists(filepath)
|
||||
}
|
||||
OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
|
||||
}.into()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,7 +41,8 @@ pub struct DeleteBitSet {
|
||||
impl DeleteBitSet {
|
||||
/// Opens a delete bitset given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
|
||||
let num_deleted: usize = data.as_slice()
|
||||
let num_deleted: usize = data
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|b| b.count_ones() as usize)
|
||||
.sum();
|
||||
|
||||
@@ -56,7 +56,8 @@ impl FacetReader {
|
||||
|
||||
/// Given a term ordinal returns the term associated to it.
|
||||
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
|
||||
let found_term = self.term_dict
|
||||
let found_term = self
|
||||
.term_dict
|
||||
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
|
||||
assert!(found_term, "Term ordinal {} no found.", facet_ord);
|
||||
}
|
||||
|
||||
@@ -132,7 +132,8 @@ impl MultiValueIntFastFieldWriter {
|
||||
);
|
||||
|
||||
let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
|
||||
for (start, stop) in self.doc_index
|
||||
for (start, stop) in self
|
||||
.doc_index
|
||||
.windows(2)
|
||||
.map(|interval| (interval[0], interval[1]))
|
||||
.chain(Some(last_interval).into_iter())
|
||||
@@ -148,7 +149,6 @@ impl MultiValueIntFastFieldWriter {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
None => {
|
||||
let val_min_max = self.vals.iter().cloned().minmax();
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use rand::thread_rng;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use rand::Rng;
|
||||
use rand::distributions::Range;
|
||||
use rand::Rng;
|
||||
use schema::*;
|
||||
use Index;
|
||||
use Searcher;
|
||||
|
||||
@@ -52,7 +52,8 @@ impl DeleteQueue {
|
||||
//
|
||||
// Past delete operations are not accessible.
|
||||
pub fn cursor(&self) -> DeleteCursor {
|
||||
let last_block = self.inner
|
||||
let last_block = self
|
||||
.inner
|
||||
.read()
|
||||
.expect("Read lock poisoned when opening delete queue cursor")
|
||||
.last_block
|
||||
@@ -92,7 +93,8 @@ impl DeleteQueue {
|
||||
// be some unflushed operations.
|
||||
//
|
||||
fn flush(&self) -> Option<Arc<Block>> {
|
||||
let mut self_wlock = self.inner
|
||||
let mut self_wlock = self
|
||||
.inner
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on delete queue writer");
|
||||
|
||||
@@ -132,7 +134,8 @@ impl From<DeleteQueue> for NextBlock {
|
||||
impl NextBlock {
|
||||
fn next_block(&self) -> Option<Arc<Block>> {
|
||||
{
|
||||
let next_read_lock = self.0
|
||||
let next_read_lock = self
|
||||
.0
|
||||
.read()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
|
||||
@@ -141,7 +144,8 @@ impl NextBlock {
|
||||
}
|
||||
let next_block;
|
||||
{
|
||||
let mut next_write_lock = self.0
|
||||
let mut next_write_lock = self
|
||||
.0
|
||||
.write()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
match *next_write_lock {
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use directory::error::OpenWriteError;
|
||||
use Directory;
|
||||
use TantivyError;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::io::Write;
|
||||
use Directory;
|
||||
use TantivyError;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum LockType {
|
||||
@@ -29,10 +29,9 @@ pub enum LockType {
|
||||
/// is very simplistic. We retry after `100ms` until we effectively
|
||||
/// acquire the lock.
|
||||
/// This lock should not have much contention in normal usage.
|
||||
MetaLock
|
||||
MetaLock,
|
||||
}
|
||||
|
||||
|
||||
/// Retry the logic of acquiring locks is pretty simple.
|
||||
/// We just retry `n` times after a given `duratio`, both
|
||||
/// depending on the type of lock.
|
||||
@@ -49,7 +48,7 @@ impl RetryPolicy {
|
||||
}
|
||||
}
|
||||
|
||||
fn wait_and_retry(&mut self,) -> bool {
|
||||
fn wait_and_retry(&mut self) -> bool {
|
||||
if self.num_retries == 0 {
|
||||
false
|
||||
} else {
|
||||
@@ -58,35 +57,26 @@ impl RetryPolicy {
|
||||
thread::sleep(wait_duration);
|
||||
true
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
impl LockType {
|
||||
|
||||
fn retry_policy(&self) -> RetryPolicy {
|
||||
match *self {
|
||||
LockType::IndexWriterLock =>
|
||||
RetryPolicy::no_retry(),
|
||||
LockType::MetaLock =>
|
||||
RetryPolicy {
|
||||
num_retries: 100,
|
||||
wait_in_ms: 100,
|
||||
}
|
||||
LockType::IndexWriterLock => RetryPolicy::no_retry(),
|
||||
LockType::MetaLock => RetryPolicy {
|
||||
num_retries: 100,
|
||||
wait_in_ms: 100,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn try_acquire_lock(&self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
|
||||
let path = self.filename();
|
||||
let mut write = directory
|
||||
.open_write(path)
|
||||
.map_err(|e|
|
||||
match e {
|
||||
OpenWriteError::FileAlreadyExists(_) =>
|
||||
TantivyError::LockFailure(*self),
|
||||
OpenWriteError::IOError(io_error) =>
|
||||
TantivyError::IOError(io_error),
|
||||
})?;
|
||||
let mut write = directory.open_write(path).map_err(|e| match e {
|
||||
OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(*self),
|
||||
OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
|
||||
})?;
|
||||
write.flush()?;
|
||||
Ok(DirectoryLock {
|
||||
directory: directory.box_clone(),
|
||||
@@ -94,7 +84,6 @@ impl LockType {
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Acquire a lock in the given directory.
|
||||
pub fn acquire_lock(&self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
|
||||
let mut box_directory = directory.box_clone();
|
||||
@@ -110,25 +99,19 @@ impl LockType {
|
||||
return Err(TantivyError::LockFailure(filepath.to_owned()));
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
}
|
||||
Err(_) => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn filename(&self) -> &Path {
|
||||
match *self {
|
||||
LockType::MetaLock => {
|
||||
Path::new(".tantivy-meta.lock")
|
||||
}
|
||||
LockType::IndexWriterLock => {
|
||||
Path::new(".tantivy-indexer.lock")
|
||||
}
|
||||
LockType::MetaLock => Path::new(".tantivy-meta.lock"),
|
||||
LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// The `DirectoryLock` is an object that represents a file lock.
|
||||
/// See [`LockType`](struct.LockType.html)
|
||||
///
|
||||
|
||||
@@ -347,7 +347,8 @@ impl IndexWriter {
|
||||
}
|
||||
drop(self.workers_join_handle);
|
||||
|
||||
let result = self.segment_updater
|
||||
let result = self
|
||||
.segment_updater
|
||||
.wait_merging_thread()
|
||||
.map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
|
||||
|
||||
@@ -494,7 +495,8 @@ impl IndexWriter {
|
||||
let document_receiver = self.document_receiver.clone();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
let directory_lock = self._directory_lock
|
||||
let directory_lock = self
|
||||
._directory_lock
|
||||
.take()
|
||||
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
|
||||
|
||||
@@ -678,7 +680,7 @@ mod tests {
|
||||
let err_msg = err.to_string();
|
||||
assert!(err_msg.contains("Lockfile"));
|
||||
assert!(err_msg.contains("Possible causes:"))
|
||||
},
|
||||
}
|
||||
_ => panic!("Expected LockfileAlreadyExists error"),
|
||||
}
|
||||
}
|
||||
@@ -864,8 +866,7 @@ mod tests {
|
||||
assert_eq!(initial_table_size(1_000_000_000), 19);
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(feature="no_fail"))]
|
||||
#[cfg(not(feature = "no_fail"))]
|
||||
#[test]
|
||||
fn test_write_commit_fails() {
|
||||
use fail;
|
||||
@@ -874,7 +875,7 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for _ in 0..100 {
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
@@ -21,17 +21,17 @@ pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug {
|
||||
|
||||
/// MergePolicyClone
|
||||
pub trait MergePolicyClone {
|
||||
/// Returns a boxed clone of the MergePolicy.
|
||||
fn box_clone(&self) -> Box<MergePolicy>;
|
||||
/// Returns a boxed clone of the MergePolicy.
|
||||
fn box_clone(&self) -> Box<MergePolicy>;
|
||||
}
|
||||
|
||||
impl<T> MergePolicyClone for T
|
||||
where
|
||||
T: 'static + MergePolicy + Clone,
|
||||
T: 'static + MergePolicy + Clone,
|
||||
{
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// Never merge segments.
|
||||
|
||||
@@ -440,7 +440,8 @@ impl IndexMerger {
|
||||
) -> Result<Option<TermOrdinalMapping>> {
|
||||
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
|
||||
let mut delta_computer = DeltaComputer::new();
|
||||
let field_readers = self.readers
|
||||
let field_readers = self
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| reader.inverted_index(indexed_field))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
@@ -51,7 +51,8 @@ impl SegmentRegister {
|
||||
}
|
||||
|
||||
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
|
||||
let mut segment_ids: Vec<SegmentMeta> = self
|
||||
.segment_states
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect();
|
||||
|
||||
@@ -143,6 +143,7 @@ extern crate fst;
|
||||
extern crate fst_regex;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate htmlescape;
|
||||
extern crate itertools;
|
||||
extern crate levenshtein_automata;
|
||||
extern crate num_cpus;
|
||||
@@ -154,7 +155,6 @@ extern crate stable_deref_trait;
|
||||
extern crate tempdir;
|
||||
extern crate tempfile;
|
||||
extern crate uuid;
|
||||
extern crate htmlescape;
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
@@ -183,7 +183,7 @@ mod macros;
|
||||
|
||||
pub use error::TantivyError;
|
||||
|
||||
#[deprecated(since="0.7.0", note="please use `tantivy::TantivyError` instead")]
|
||||
#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
|
||||
pub use error::TantivyError as Error;
|
||||
|
||||
extern crate census;
|
||||
@@ -951,4 +951,3 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
/// Positions are stored in three parts and over two files.
|
||||
//
|
||||
/// The `SegmentComponent::POSITIONS` file contains all of the bitpacked positions delta,
|
||||
@@ -24,13 +23,12 @@
|
||||
/// The long skip structure makes it possible to skip rapidly to the a checkpoint close to this
|
||||
/// value, and then skip normally.
|
||||
///
|
||||
|
||||
mod reader;
|
||||
mod serializer;
|
||||
|
||||
pub use self::reader::PositionReader;
|
||||
pub use self::serializer::PositionSerializer;
|
||||
use bitpacking::{BitPacker4x, BitPacker};
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
|
||||
const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
|
||||
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
|
||||
@@ -43,10 +41,10 @@ lazy_static! {
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use std::iter;
|
||||
use super::{PositionSerializer, PositionReader};
|
||||
use super::{PositionReader, PositionSerializer};
|
||||
use directory::ReadOnlySource;
|
||||
use positions::COMPRESSION_BLOCK_SIZE;
|
||||
use std::iter;
|
||||
|
||||
fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) {
|
||||
let mut skip_buffer = vec![];
|
||||
@@ -59,7 +57,10 @@ pub mod tests {
|
||||
}
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
(ReadOnlySource::from(stream_buffer), ReadOnlySource::from(skip_buffer))
|
||||
(
|
||||
ReadOnlySource::from(stream_buffer),
|
||||
ReadOnlySource::from(skip_buffer),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -103,7 +104,7 @@ pub mod tests {
|
||||
assert_eq!(skip.len(), 12);
|
||||
assert_eq!(stream.len(), 1168);
|
||||
|
||||
let mut position_reader = PositionReader::new(stream,skip, 0u64);
|
||||
let mut position_reader = PositionReader::new(stream, skip, 0u64);
|
||||
let mut buf = [0u32; 7];
|
||||
let mut c = 0;
|
||||
for _ in 0..100 {
|
||||
@@ -125,7 +126,7 @@ pub mod tests {
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 15_749);
|
||||
assert_eq!(stream.len(), 1_000_000);
|
||||
let mut position_reader = PositionReader::new(stream,skip, 128 * 1024);
|
||||
let mut position_reader = PositionReader::new(stream, skip, 128 * 1024);
|
||||
let mut buf = [0u32; 1];
|
||||
position_reader.read(&mut buf);
|
||||
assert_eq!(buf[0], CONST_VAL);
|
||||
@@ -137,12 +138,17 @@ pub mod tests {
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 15_749);
|
||||
assert_eq!(stream.len(), 4_987_872);
|
||||
for &offset in &[10, 128 * 1024, 128 * 1024 - 1, 128 * 1024 + 7, 128 * 10 * 1024 + 10] {
|
||||
let mut position_reader = PositionReader::new(stream.clone(),skip.clone(), offset);
|
||||
for &offset in &[
|
||||
10,
|
||||
128 * 1024,
|
||||
128 * 1024 - 1,
|
||||
128 * 1024 + 7,
|
||||
128 * 10 * 1024 + 10,
|
||||
] {
|
||||
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), offset);
|
||||
let mut buf = [0u32; 1];
|
||||
position_reader.read(&mut buf);
|
||||
assert_eq!(buf[0], offset as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use bitpacking::{BitPacker4x, BitPacker};
|
||||
use owned_read::OwnedRead;
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use postings::compression::compressed_block_size;
|
||||
use directory::ReadOnlySource;
|
||||
use positions::COMPRESSION_BLOCK_SIZE;
|
||||
use positions::LONG_SKIP_IN_BLOCKS;
|
||||
use positions::LONG_SKIP_INTERVAL;
|
||||
use super::BIT_PACKER;
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use directory::ReadOnlySource;
|
||||
use owned_read::OwnedRead;
|
||||
use positions::COMPRESSION_BLOCK_SIZE;
|
||||
use positions::LONG_SKIP_INTERVAL;
|
||||
use positions::LONG_SKIP_IN_BLOCKS;
|
||||
use postings::compression::compressed_block_size;
|
||||
|
||||
pub struct PositionReader {
|
||||
skip_read: OwnedRead,
|
||||
@@ -18,7 +18,6 @@ pub struct PositionReader {
|
||||
// of the block of the next int to read.
|
||||
}
|
||||
|
||||
|
||||
// `ahead` represents the offset of the block currently loaded
|
||||
// compared to the cursor of the actual stream.
|
||||
//
|
||||
@@ -32,7 +31,8 @@ fn read_impl(
|
||||
buffer: &mut [u32; 128],
|
||||
mut inner_offset: usize,
|
||||
num_bits: &[u8],
|
||||
output: &mut [u32]) -> usize {
|
||||
output: &mut [u32],
|
||||
) -> usize {
|
||||
let mut output_start = 0;
|
||||
let mut output_len = output.len();
|
||||
let mut ahead = 0;
|
||||
@@ -47,8 +47,7 @@ fn read_impl(
|
||||
output_start += available_len;
|
||||
inner_offset = 0;
|
||||
let num_bits = num_bits[ahead];
|
||||
BitPacker4x::new()
|
||||
.decompress(position, &mut buffer[..], num_bits);
|
||||
BitPacker4x::new().decompress(position, &mut buffer[..], num_bits);
|
||||
let block_len = compressed_block_size(num_bits);
|
||||
position = &position[block_len..];
|
||||
ahead += 1;
|
||||
@@ -56,11 +55,12 @@ fn read_impl(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl PositionReader {
|
||||
pub fn new(position_source: ReadOnlySource,
|
||||
skip_source: ReadOnlySource,
|
||||
offset: u64) -> PositionReader {
|
||||
pub fn new(
|
||||
position_source: ReadOnlySource,
|
||||
skip_source: ReadOnlySource,
|
||||
offset: u64,
|
||||
) -> PositionReader {
|
||||
let skip_len = skip_source.len();
|
||||
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
|
||||
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
|
||||
@@ -70,7 +70,8 @@ impl PositionReader {
|
||||
let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize;
|
||||
let offset_num_bytes: u64 = {
|
||||
if long_skip_id > 0 {
|
||||
let mut long_skip_blocks: &[u8] = &long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
|
||||
let mut long_skip_blocks: &[u8] =
|
||||
&long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
|
||||
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16
|
||||
} else {
|
||||
0
|
||||
@@ -79,13 +80,13 @@ impl PositionReader {
|
||||
let mut position_read = OwnedRead::new(position_source);
|
||||
position_read.advance(offset_num_bytes as usize);
|
||||
let mut skip_read = OwnedRead::new(skip_body);
|
||||
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
|
||||
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
|
||||
let mut position_reader = PositionReader {
|
||||
skip_read,
|
||||
position_read,
|
||||
inner_offset: 0,
|
||||
buffer: Box::new([0u32; 128]),
|
||||
ahead: None
|
||||
ahead: None,
|
||||
};
|
||||
position_reader.skip(small_skip);
|
||||
position_reader
|
||||
@@ -108,7 +109,8 @@ impl PositionReader {
|
||||
self.buffer.as_mut(),
|
||||
self.inner_offset,
|
||||
&skip_data[1..],
|
||||
output));
|
||||
output,
|
||||
));
|
||||
}
|
||||
|
||||
/// Skip the next `skip_len` integer.
|
||||
@@ -118,23 +120,20 @@ impl PositionReader {
|
||||
///
|
||||
/// May panic if the end of the stream is reached.
|
||||
pub fn skip(&mut self, skip_len: usize) {
|
||||
|
||||
let skip_len_plus_inner_offset = skip_len + self.inner_offset;
|
||||
|
||||
let num_blocks_to_advance = skip_len_plus_inner_offset / COMPRESSION_BLOCK_SIZE;
|
||||
self.inner_offset = skip_len_plus_inner_offset % COMPRESSION_BLOCK_SIZE;
|
||||
|
||||
self.ahead = self.ahead
|
||||
.and_then(|num_blocks| {
|
||||
if num_blocks >= num_blocks_to_advance {
|
||||
Some(num_blocks_to_advance - num_blocks_to_advance)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
self.ahead = self.ahead.and_then(|num_blocks| {
|
||||
if num_blocks >= num_blocks_to_advance {
|
||||
Some(num_blocks_to_advance - num_blocks_to_advance)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
let skip_len = self.skip_read
|
||||
.as_ref()[..num_blocks_to_advance]
|
||||
let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance]
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|num_bit| num_bit as usize)
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::io;
|
||||
use bitpacking::BitPacker;
|
||||
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
|
||||
use common::BinarySerializable;
|
||||
use super::BIT_PACKER;
|
||||
use bitpacking::BitPacker;
|
||||
use common::BinarySerializable;
|
||||
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
|
||||
use std::io;
|
||||
|
||||
pub struct PositionSerializer<W: io::Write> {
|
||||
write_stream: W,
|
||||
@@ -23,7 +23,7 @@ impl<W: io::Write> PositionSerializer<W> {
|
||||
buffer: vec![0u8; 128 * 4],
|
||||
num_ints: 0u64,
|
||||
long_skips: Vec::new(),
|
||||
cumulated_num_bits: 0u64
|
||||
cumulated_num_bits: 0u64,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,7 +31,6 @@ impl<W: io::Write> PositionSerializer<W> {
|
||||
self.num_ints
|
||||
}
|
||||
|
||||
|
||||
fn remaining_block_len(&self) -> usize {
|
||||
COMPRESSION_BLOCK_SIZE - self.block.len()
|
||||
}
|
||||
|
||||
@@ -28,14 +28,16 @@ impl BlockEncoder {
|
||||
|
||||
pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> (u8, &[u8]) {
|
||||
let num_bits = self.bitpacker.num_bits_sorted(offset, block);
|
||||
let written_size = self.bitpacker
|
||||
.compress_sorted(offset, block, &mut self.output[..], num_bits);
|
||||
let written_size =
|
||||
self.bitpacker
|
||||
.compress_sorted(offset, block, &mut self.output[..], num_bits);
|
||||
(num_bits, &self.output[..written_size])
|
||||
}
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, block: &[u32]) -> (u8, &[u8]) {
|
||||
let num_bits = self.bitpacker.num_bits(block);
|
||||
let written_size = self.bitpacker
|
||||
let written_size = self
|
||||
.bitpacker
|
||||
.compress(block, &mut self.output[..], num_bits);
|
||||
(num_bits, &self.output[..written_size])
|
||||
}
|
||||
@@ -62,19 +64,21 @@ impl BlockDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32, num_bits: u8) -> usize {
|
||||
pub fn uncompress_block_sorted(
|
||||
&mut self,
|
||||
compressed_data: &[u8],
|
||||
offset: u32,
|
||||
num_bits: u8,
|
||||
) -> usize {
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
self.bitpacker.decompress_sorted(
|
||||
offset,
|
||||
&compressed_data,
|
||||
&mut self.output,
|
||||
num_bits,
|
||||
)
|
||||
self.bitpacker
|
||||
.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
self.bitpacker.decompress(&compressed_data, &mut self.output, num_bits)
|
||||
self.bitpacker
|
||||
.decompress(&compressed_data, &mut self.output, num_bits)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -88,7 +92,6 @@ impl BlockDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub trait VIntEncoder {
|
||||
/// Compresses an array of `u32` integers,
|
||||
/// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_ encoding)
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
#[inline(always)]
|
||||
pub fn compress_sorted<'a>(
|
||||
input: &[u32],
|
||||
output: &'a mut [u8],
|
||||
mut offset: u32,
|
||||
) -> &'a [u8] {
|
||||
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v - offset;
|
||||
@@ -46,11 +42,7 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32,
|
||||
) -> usize {
|
||||
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) -> usize {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
let num_els = output.len();
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Postings module (also called inverted index)
|
||||
*/
|
||||
|
||||
pub(crate) mod compression;
|
||||
/// Postings module
|
||||
///
|
||||
/// Postings, also called inverted lists, is the key datastructure
|
||||
@@ -11,18 +12,17 @@ mod postings_writer;
|
||||
mod recorder;
|
||||
mod segment_postings;
|
||||
mod serializer;
|
||||
pub(crate) mod compression;
|
||||
mod skip;
|
||||
mod stacker;
|
||||
mod term_info;
|
||||
mod skip;
|
||||
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
|
||||
use self::compression::COMPRESSION_BLOCK_SIZE;
|
||||
pub use self::postings::Postings;
|
||||
pub use self::term_info::TermInfo;
|
||||
pub(crate) use self::skip::SkipReader;
|
||||
use self::compression::{COMPRESSION_BLOCK_SIZE};
|
||||
pub use self::term_info::TermInfo;
|
||||
|
||||
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
|
||||
|
||||
@@ -71,8 +71,7 @@ pub mod tests {
|
||||
let mut segment = index.new_segment();
|
||||
let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap();
|
||||
{
|
||||
let mut field_serializer = posting_serializer
|
||||
.new_field(text_field, 120 * 4).unwrap();
|
||||
let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4).unwrap();
|
||||
field_serializer.new_term("abc".as_bytes()).unwrap();
|
||||
for doc_id in 0u32..120u32 {
|
||||
let delta_positions = vec![1, 2, 3, 2];
|
||||
@@ -512,13 +511,13 @@ pub mod tests {
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
for _ in 0..posting_list_size {
|
||||
let mut doc = Document::default();
|
||||
if rng.gen_bool(1f64/ 15f64) {
|
||||
if rng.gen_bool(1f64 / 15f64) {
|
||||
doc.add_text(text_field, "a");
|
||||
}
|
||||
if rng.gen_bool(1f64/ 10f64) {
|
||||
if rng.gen_bool(1f64 / 10f64) {
|
||||
doc.add_text(text_field, "b");
|
||||
}
|
||||
if rng.gen_bool(1f64/ 5f64) {
|
||||
if rng.gen_bool(1f64 / 5f64) {
|
||||
doc.add_text(text_field, "c");
|
||||
}
|
||||
doc.add_text(text_field, "d");
|
||||
|
||||
@@ -94,7 +94,8 @@ impl MultiFieldPostingsWriter {
|
||||
&self,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self.term_index
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
|
||||
.term_index
|
||||
.iter()
|
||||
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
|
||||
.collect();
|
||||
|
||||
@@ -107,7 +107,8 @@ impl Recorder for TermFrequencyRecorder {
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
// the last document has not been closed...
|
||||
// its term freq is self.current_tf.
|
||||
let mut doc_iter = self.stack
|
||||
let mut doc_iter = self
|
||||
.stack
|
||||
.iter(heap)
|
||||
.chain(Some(self.current_tf).into_iter());
|
||||
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||
use DocId;
|
||||
use common::BitSet;
|
||||
use common::HasLen;
|
||||
use postings::compression::compressed_block_size;
|
||||
use common::{BinarySerializable, VInt};
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fst::Streamer;
|
||||
use owned_read::OwnedRead;
|
||||
use positions::PositionReader;
|
||||
use postings::compression::compressed_block_size;
|
||||
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||
use postings::serializer::PostingsSerializer;
|
||||
use postings::FreqReadingOption;
|
||||
use postings::Postings;
|
||||
use owned_read::OwnedRead;
|
||||
use common::{VInt, BinarySerializable};
|
||||
use postings::USE_SKIP_INFO_LIMIT;
|
||||
use postings::SkipReader;
|
||||
use postings::USE_SKIP_INFO_LIMIT;
|
||||
use schema::IndexRecordOption;
|
||||
use positions::PositionReader;
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
|
||||
const EMPTY_ARR: [u8; 0] = [];
|
||||
|
||||
@@ -98,7 +98,7 @@ impl SegmentPostings {
|
||||
docs.len() as u32,
|
||||
OwnedRead::new(buffer),
|
||||
IndexRecordOption::Basic,
|
||||
IndexRecordOption::Basic
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
SegmentPostings::from_block_postings(block_segment_postings, None)
|
||||
}
|
||||
@@ -151,7 +151,11 @@ fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
|
||||
/// The target is assumed smaller or equal to the last element.
|
||||
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
|
||||
let (start, end) = exponential_search(target, block_docs);
|
||||
start.wrapping_add(block_docs[start..end].binary_search(&target).unwrap_or_else(|e| e))
|
||||
start.wrapping_add(
|
||||
block_docs[start..end]
|
||||
.binary_search(&target)
|
||||
.unwrap_or_else(|e| e),
|
||||
)
|
||||
}
|
||||
|
||||
impl DocSet for SegmentPostings {
|
||||
@@ -179,21 +183,20 @@ impl DocSet for SegmentPostings {
|
||||
// check if we need to go to the next block
|
||||
let need_positions = self.position_computer.is_some();
|
||||
let mut sum_freqs_skipped: u32 = 0;
|
||||
if !self.block_cursor
|
||||
.docs()
|
||||
.last()
|
||||
.map(|doc| *doc >= target)
|
||||
.unwrap_or(false) // there should always be at least a document in the block
|
||||
// since advance returned.
|
||||
if !self
|
||||
.block_cursor
|
||||
.docs()
|
||||
.last()
|
||||
.map(|doc| *doc >= target)
|
||||
.unwrap_or(false)
|
||||
// there should always be at least a document in the block
|
||||
// since advance returned.
|
||||
{
|
||||
// we are not in the right block.
|
||||
//
|
||||
// First compute all of the freqs skipped from the current block.
|
||||
if need_positions {
|
||||
sum_freqs_skipped = self.block_cursor
|
||||
.freqs()[self.cur..]
|
||||
.iter()
|
||||
.sum();
|
||||
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
|
||||
match self.block_cursor.skip_to(target) {
|
||||
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
|
||||
sum_freqs_skipped += block_skip_freqs;
|
||||
@@ -215,9 +218,13 @@ impl DocSet for SegmentPostings {
|
||||
let block_docs = self.block_cursor.docs();
|
||||
|
||||
debug_assert!(target >= self.doc());
|
||||
let new_cur = self.cur.wrapping_add(search_within_block(&block_docs[self.cur..], target));
|
||||
let new_cur = self
|
||||
.cur
|
||||
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
|
||||
if need_positions {
|
||||
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur].iter().sum::<u32>();
|
||||
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
|
||||
.iter()
|
||||
.sum::<u32>();
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
@@ -330,7 +337,10 @@ pub struct BlockSegmentPostings {
|
||||
skip_reader: SkipReader,
|
||||
}
|
||||
|
||||
fn split_into_skips_and_postings(doc_freq: u32, mut data: OwnedRead) -> (Option<OwnedRead>, OwnedRead) {
|
||||
fn split_into_skips_and_postings(
|
||||
doc_freq: u32,
|
||||
mut data: OwnedRead,
|
||||
) -> (Option<OwnedRead>, OwnedRead) {
|
||||
if doc_freq >= USE_SKIP_INFO_LIMIT {
|
||||
let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
|
||||
let mut postings_data = data.clone();
|
||||
@@ -345,7 +355,7 @@ fn split_into_skips_and_postings(doc_freq: u32, mut data: OwnedRead) -> (Option<
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub enum BlockSegmentPostingsSkipResult {
|
||||
Terminated,
|
||||
Success(u32) //< number of term freqs to skip
|
||||
Success(u32), //< number of term freqs to skip
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
@@ -353,7 +363,7 @@ impl BlockSegmentPostings {
|
||||
doc_freq: u32,
|
||||
data: OwnedRead,
|
||||
record_option: IndexRecordOption,
|
||||
requested_option: IndexRecordOption
|
||||
requested_option: IndexRecordOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let freq_reading_option = match (record_option, requested_option) {
|
||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||
@@ -362,11 +372,10 @@ impl BlockSegmentPostings {
|
||||
};
|
||||
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
|
||||
let skip_reader =
|
||||
match skip_data_opt {
|
||||
Some(skip_data) => SkipReader::new(skip_data, record_option),
|
||||
None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option)
|
||||
};
|
||||
let skip_reader = match skip_data_opt {
|
||||
Some(skip_data) => SkipReader::new(skip_data, record_option),
|
||||
None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option),
|
||||
};
|
||||
let doc_freq = doc_freq as usize;
|
||||
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
|
||||
BlockSegmentPostings {
|
||||
@@ -450,7 +459,6 @@ impl BlockSegmentPostings {
|
||||
self.doc_decoder.output_len
|
||||
}
|
||||
|
||||
|
||||
/// position on a block that may contains `doc_id`.
|
||||
/// Always advance the current block.
|
||||
///
|
||||
@@ -461,9 +469,7 @@ impl BlockSegmentPostings {
|
||||
/// Returns false iff all of the document remaining are smaller than
|
||||
/// `doc_id`. In that case, all of these document are consumed.
|
||||
///
|
||||
pub fn skip_to(&mut self,
|
||||
target_doc: DocId) -> BlockSegmentPostingsSkipResult {
|
||||
|
||||
pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
|
||||
let mut skip_freqs = 0u32;
|
||||
while self.skip_reader.advance() {
|
||||
if self.skip_reader.doc() >= target_doc {
|
||||
@@ -472,11 +478,11 @@ impl BlockSegmentPostings {
|
||||
//
|
||||
// We found our block!
|
||||
let num_bits = self.skip_reader.doc_num_bits();
|
||||
let num_consumed_bytes = self.doc_decoder
|
||||
.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits);
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||
match self.freq_reading_option {
|
||||
@@ -486,9 +492,9 @@ impl BlockSegmentPostings {
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(),
|
||||
tf_num_bits);
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
}
|
||||
@@ -518,7 +524,8 @@ impl BlockSegmentPostings {
|
||||
}
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
return self.docs()
|
||||
return self
|
||||
.docs()
|
||||
.last()
|
||||
.map(|last_doc| {
|
||||
if *last_doc >= target_doc {
|
||||
@@ -538,11 +545,11 @@ impl BlockSegmentPostings {
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.skip_reader.advance() {
|
||||
let num_bits = self.skip_reader.doc_num_bits();
|
||||
let num_consumed_bytes = self.doc_decoder
|
||||
.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits);
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||
match self.freq_reading_option {
|
||||
@@ -552,9 +559,9 @@ impl BlockSegmentPostings {
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(),
|
||||
tf_num_bits);
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
}
|
||||
@@ -594,7 +601,6 @@ impl BlockSegmentPostings {
|
||||
doc_offset: 0,
|
||||
doc_freq: 0,
|
||||
|
||||
|
||||
remaining_data: OwnedRead::new(vec![]),
|
||||
skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
|
||||
}
|
||||
@@ -616,7 +622,9 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::search_within_block;
|
||||
use super::BlockSegmentPostings;
|
||||
use super::BlockSegmentPostingsSkipResult;
|
||||
use super::SegmentPostings;
|
||||
use common::HasLen;
|
||||
use core::Index;
|
||||
@@ -626,9 +634,7 @@ mod tests {
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Term;
|
||||
use schema::INT_INDEXED;
|
||||
use super::BlockSegmentPostingsSkipResult;
|
||||
use DocId;
|
||||
use super::search_within_block;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
@@ -645,7 +651,6 @@ mod tests {
|
||||
assert_eq!(postings.doc_freq(), 0);
|
||||
}
|
||||
|
||||
|
||||
fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
|
||||
block
|
||||
.iter()
|
||||
@@ -653,11 +658,15 @@ mod tests {
|
||||
.enumerate()
|
||||
.filter(|&(_, ref val)| *val >= target)
|
||||
.next()
|
||||
.unwrap().0
|
||||
.unwrap()
|
||||
.0
|
||||
}
|
||||
|
||||
fn util_test_search_within_block(block: &[u32], target: u32) {
|
||||
assert_eq!(search_within_block(block, target), search_within_block_trivial_but_slow(block, target));
|
||||
assert_eq!(
|
||||
search_within_block(block, target),
|
||||
search_within_block_trivial_but_slow(block, target)
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_within_block_all(block: &[u32]) {
|
||||
@@ -677,7 +686,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_search_within_block() {
|
||||
for len in 1u32..128u32 {
|
||||
let v: Vec<u32> = (0..len).map(|i| i*2).collect();
|
||||
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
|
||||
util_test_search_within_block_all(&v[..]);
|
||||
}
|
||||
}
|
||||
@@ -726,14 +735,22 @@ mod tests {
|
||||
fn test_block_segment_postings_skip() {
|
||||
for i in 0..4 {
|
||||
let mut block_postings = build_block_postings(vec![3]);
|
||||
assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Success(0u32));
|
||||
assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Terminated);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(i),
|
||||
BlockSegmentPostingsSkipResult::Success(0u32)
|
||||
);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(i),
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
);
|
||||
}
|
||||
let mut block_postings = build_block_postings(vec![3]);
|
||||
assert_eq!(block_postings.skip_to(4u32), BlockSegmentPostingsSkipResult::Terminated);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(4u32),
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings_skip2() {
|
||||
let mut docs = vec![0];
|
||||
@@ -741,14 +758,23 @@ mod tests {
|
||||
docs.push((i * i / 100) + i);
|
||||
}
|
||||
let mut block_postings = build_block_postings(docs.clone());
|
||||
for i in vec![0, 424, 10000] {
|
||||
assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Success(0u32));
|
||||
for i in vec![0, 424, 10000] {
|
||||
assert_eq!(
|
||||
block_postings.skip_to(i),
|
||||
BlockSegmentPostingsSkipResult::Success(0u32)
|
||||
);
|
||||
let docs = block_postings.docs();
|
||||
assert!(docs[0] <= i);
|
||||
assert!(docs.last().cloned().unwrap_or(0u32) >= i);
|
||||
}
|
||||
assert_eq!(block_postings.skip_to(100_000), BlockSegmentPostingsSkipResult::Terminated);
|
||||
assert_eq!(block_postings.skip_to(101_000), BlockSegmentPostingsSkipResult::Terminated);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(100_000),
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(101_000),
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
use super::TermInfo;
|
||||
use common::{VInt, BinarySerializable};
|
||||
use common::{BinarySerializable, VInt};
|
||||
use common::{CompositeWrite, CountingWriter};
|
||||
use postings::compression::{VIntEncoder, BlockEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use core::Segment;
|
||||
use directory::WritePtr;
|
||||
use positions::PositionSerializer;
|
||||
use postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use postings::skip::SkipSerializer;
|
||||
use postings::USE_SKIP_INFO_LIMIT;
|
||||
use schema::Schema;
|
||||
use schema::{Field, FieldEntry, FieldType};
|
||||
use std::io::{self, Write};
|
||||
use termdict::{TermDictionaryBuilder, TermOrdinal};
|
||||
use DocId;
|
||||
use Result;
|
||||
use postings::USE_SKIP_INFO_LIMIT;
|
||||
use postings::skip::SkipSerializer;
|
||||
use positions::PositionSerializer;
|
||||
|
||||
/// `PostingsSerializer` is in charge of serializing
|
||||
/// postings on disk, in the
|
||||
@@ -104,7 +104,7 @@ impl InvertedIndexSerializer {
|
||||
term_dictionary_write,
|
||||
postings_write,
|
||||
positions_write,
|
||||
positionsidx_write
|
||||
positionsidx_write,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -135,7 +135,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
|
||||
postings_write: &'a mut CountingWriter<WritePtr>,
|
||||
positions_write: &'a mut CountingWriter<WritePtr>,
|
||||
positionsidx_write: &'a mut CountingWriter<WritePtr>
|
||||
positionsidx_write: &'a mut CountingWriter<WritePtr>,
|
||||
) -> io::Result<FieldSerializer<'a>> {
|
||||
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
@@ -153,7 +153,8 @@ impl<'a> FieldSerializer<'a> {
|
||||
};
|
||||
let term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(term_dictionary_write, field_type)?;
|
||||
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
|
||||
let postings_serializer =
|
||||
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
|
||||
let positions_serializer_opt = if position_enabled {
|
||||
Some(PositionSerializer::new(positions_write, positionsidx_write))
|
||||
} else {
|
||||
@@ -171,14 +172,15 @@ impl<'a> FieldSerializer<'a> {
|
||||
}
|
||||
|
||||
fn current_term_info(&self) -> TermInfo {
|
||||
let positions_idx = self.positions_serializer_opt
|
||||
let positions_idx = self
|
||||
.positions_serializer_opt
|
||||
.as_ref()
|
||||
.map(|positions_serializer| positions_serializer.positions_idx())
|
||||
.unwrap_or(0u64);
|
||||
TermInfo {
|
||||
doc_freq: 0,
|
||||
postings_offset: self.postings_serializer.addr(),
|
||||
positions_idx
|
||||
positions_idx,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -253,7 +255,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
struct Block {
|
||||
doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
|
||||
term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
|
||||
len: usize
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl Block {
|
||||
@@ -261,7 +263,7 @@ impl Block {
|
||||
Block {
|
||||
doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
|
||||
term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
|
||||
len: 0
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -312,9 +314,12 @@ pub struct PostingsSerializer<W: Write> {
|
||||
termfreq_sum_enabled: bool,
|
||||
}
|
||||
|
||||
|
||||
impl<W: Write> PostingsSerializer<W> {
|
||||
pub fn new(write: W, termfreq_enabled: bool, termfreq_sum_enabled: bool) -> PostingsSerializer<W> {
|
||||
pub fn new(
|
||||
write: W,
|
||||
termfreq_enabled: bool,
|
||||
termfreq_sum_enabled: bool,
|
||||
) -> PostingsSerializer<W> {
|
||||
PostingsSerializer {
|
||||
output_write: CountingWriter::wrap(write),
|
||||
|
||||
@@ -337,14 +342,16 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
.block_encoder
|
||||
.compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
|
||||
self.last_doc_id_encoded = self.block.last_doc();
|
||||
self.skip_write.write_doc(self.last_doc_id_encoded, num_bits);
|
||||
self.skip_write
|
||||
.write_doc(self.last_doc_id_encoded, num_bits);
|
||||
// last el block 0, offset block 1,
|
||||
self.postings_write.extend(block_encoded);
|
||||
}
|
||||
if self.termfreq_enabled {
|
||||
// encode the term_freqs
|
||||
let (num_bits, block_encoded): (u8, &[u8]) =
|
||||
self.block_encoder.compress_block_unsorted(&self.block.term_freqs());
|
||||
let (num_bits, block_encoded): (u8, &[u8]) = self
|
||||
.block_encoder
|
||||
.compress_block_unsorted(&self.block.term_freqs());
|
||||
self.postings_write.extend(block_encoded);
|
||||
self.skip_write.write_term_freq(num_bits);
|
||||
if self.termfreq_sum_enabled {
|
||||
@@ -375,13 +382,15 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
// In that case, the remaining part is encoded
|
||||
// using variable int encoding.
|
||||
{
|
||||
let block_encoded = self.block_encoder
|
||||
let block_encoded = self
|
||||
.block_encoder
|
||||
.compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
}
|
||||
// ... Idem for term frequencies
|
||||
if self.termfreq_enabled {
|
||||
let block_encoded = self.block_encoder
|
||||
let block_encoded = self
|
||||
.block_encoder
|
||||
.compress_vint_unsorted(self.block.term_freqs());
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
}
|
||||
@@ -392,7 +401,6 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
VInt(skip_data.len() as u64).serialize(&mut self.output_write)?;
|
||||
self.output_write.write_all(skip_data)?;
|
||||
self.output_write.write_all(&self.postings_write[..])?;
|
||||
|
||||
} else {
|
||||
self.output_write.write_all(&self.postings_write[..])?;
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use DocId;
|
||||
use common::BinarySerializable;
|
||||
use owned_read::OwnedRead;
|
||||
use postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use schema::IndexRecordOption;
|
||||
use DocId;
|
||||
|
||||
pub struct SkipSerializer {
|
||||
buffer: Vec<u8>,
|
||||
@@ -18,8 +18,11 @@ impl SkipSerializer {
|
||||
}
|
||||
|
||||
pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) {
|
||||
assert!(last_doc > self.prev_doc, "write_doc(...) called with non-increasing doc ids. \
|
||||
Did you forget to call clear maybe?");
|
||||
assert!(
|
||||
last_doc > self.prev_doc,
|
||||
"write_doc(...) called with non-increasing doc ids. \
|
||||
Did you forget to call clear maybe?"
|
||||
);
|
||||
let delta_doc = last_doc - self.prev_doc;
|
||||
self.prev_doc = last_doc;
|
||||
delta_doc.serialize(&mut self.buffer).unwrap();
|
||||
@@ -30,9 +33,10 @@ impl SkipSerializer {
|
||||
self.buffer.push(tf_num_bits);
|
||||
}
|
||||
|
||||
|
||||
pub fn write_total_term_freq(&mut self, tf_sum: u32) {
|
||||
tf_sum.serialize(&mut self.buffer).expect("Should never fail");
|
||||
tf_sum
|
||||
.serialize(&mut self.buffer)
|
||||
.expect("Should never fail");
|
||||
}
|
||||
|
||||
pub fn data(&self) -> &[u8] {
|
||||
@@ -103,33 +107,32 @@ impl SkipReader {
|
||||
} else {
|
||||
let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted");
|
||||
self.doc += doc_delta as DocId;
|
||||
self.doc_num_bits = self.owned_read.get(0);
|
||||
self.doc_num_bits = self.owned_read.get(0);
|
||||
match self.skip_info {
|
||||
IndexRecordOption::Basic => {
|
||||
self.owned_read.advance(1);
|
||||
}
|
||||
IndexRecordOption::WithFreqs=> {
|
||||
IndexRecordOption::WithFreqs => {
|
||||
self.tf_num_bits = self.owned_read.get(1);
|
||||
self.owned_read.advance(2);
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
self.tf_num_bits = self.owned_read.get(1);
|
||||
self.owned_read.advance(2);
|
||||
self.tf_sum = u32::deserialize(&mut self.owned_read)
|
||||
.expect("Failed reading tf_sum");
|
||||
self.tf_sum =
|
||||
u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum");
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::{SkipReader, SkipSerializer};
|
||||
use super::IndexRecordOption;
|
||||
use super::{SkipReader, SkipSerializer};
|
||||
use owned_read::OwnedRead;
|
||||
|
||||
#[test]
|
||||
@@ -171,4 +174,4 @@ mod tests {
|
||||
assert_eq!(skip_reader.doc_num_bits(), 5u8);
|
||||
assert!(!skip_reader.advance());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,8 +5,8 @@ use query::TermQuery;
|
||||
use query::Weight;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use Result;
|
||||
use std::collections::BTreeSet;
|
||||
use Result;
|
||||
use Searcher;
|
||||
|
||||
/// The boolean query combines a set of queries
|
||||
@@ -41,9 +41,9 @@ impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
|
||||
}
|
||||
|
||||
impl Query for BooleanQuery {
|
||||
|
||||
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
|
||||
let sub_weights = self.subqueries
|
||||
let sub_weights = self
|
||||
.subqueries
|
||||
.iter()
|
||||
.map(|&(ref occur, ref subquery)| {
|
||||
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use super::Scorer;
|
||||
use DocSet;
|
||||
use Score;
|
||||
use DocId;
|
||||
use query::Query;
|
||||
use Result;
|
||||
use Searcher;
|
||||
use query::Weight;
|
||||
use DocId;
|
||||
use DocSet;
|
||||
use Result;
|
||||
use Score;
|
||||
use Searcher;
|
||||
use SegmentReader;
|
||||
|
||||
/// `EmptyQuery` is a dummy `Query` in which no document matches.
|
||||
|
||||
@@ -3,11 +3,11 @@ Query
|
||||
*/
|
||||
|
||||
mod all_query;
|
||||
mod empty_query;
|
||||
mod automaton_weight;
|
||||
mod bitset;
|
||||
mod bm25;
|
||||
mod boolean_query;
|
||||
mod empty_query;
|
||||
mod exclude;
|
||||
mod fuzzy_query;
|
||||
mod intersection;
|
||||
@@ -34,10 +34,10 @@ pub use self::union::Union;
|
||||
pub use self::vec_docset::VecDocSet;
|
||||
|
||||
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
|
||||
pub use self::empty_query::{EmptyQuery, EmptyWeight, EmptyScorer};
|
||||
pub use self::automaton_weight::AutomatonWeight;
|
||||
pub use self::bitset::BitSetDocSet;
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
|
||||
pub use self::exclude::Exclude;
|
||||
pub use self::fuzzy_query::FuzzyTermQuery;
|
||||
pub use self::intersection::intersect_scorers;
|
||||
|
||||
@@ -46,4 +46,4 @@ pub fn compose_occur(left: Occur, right: Occur) -> Occur {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,8 +5,8 @@ use query::bm25::BM25Weight;
|
||||
use query::Query;
|
||||
use query::Weight;
|
||||
use schema::{Field, Term};
|
||||
use Result;
|
||||
use std::collections::BTreeSet;
|
||||
use Result;
|
||||
|
||||
/// `PhraseQuery` matches a specific sequence of words.
|
||||
///
|
||||
|
||||
@@ -124,7 +124,8 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
score_needed: bool,
|
||||
) -> PhraseScorer<TPostings> {
|
||||
let max_offset = term_postings.iter()
|
||||
let max_offset = term_postings
|
||||
.iter()
|
||||
.map(|&(offset, _)| offset)
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
|
||||
@@ -30,7 +30,6 @@ impl PhraseWeight {
|
||||
}
|
||||
|
||||
impl Weight for PhraseWeight {
|
||||
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
let similarity_weight = self.similarity_weight.clone();
|
||||
let field = self.phrase_terms[0].1.field();
|
||||
|
||||
@@ -2,10 +2,10 @@ use super::Weight;
|
||||
use collector::Collector;
|
||||
use core::searcher::Searcher;
|
||||
use downcast;
|
||||
use std::collections::BTreeSet;
|
||||
use std::fmt;
|
||||
use Result;
|
||||
use SegmentLocalId;
|
||||
use std::collections::BTreeSet;
|
||||
use Term;
|
||||
|
||||
/// The `Query` trait defines a set of documents and a scoring method
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use super::user_input_ast::*;
|
||||
use combine::char::*;
|
||||
use combine::*;
|
||||
use combine::stream::StreamErrorFor;
|
||||
use combine::error::StreamError;
|
||||
use combine::stream::StreamErrorFor;
|
||||
use combine::*;
|
||||
use query::occur::Occur;
|
||||
use query::query_parser::user_input_ast::UserInputBound;
|
||||
|
||||
@@ -123,7 +123,8 @@ parser! {
|
||||
}
|
||||
|
||||
enum BinaryOperand {
|
||||
Or, And
|
||||
Or,
|
||||
And,
|
||||
}
|
||||
|
||||
parser! {
|
||||
@@ -138,19 +139,16 @@ parser! {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
enum Element {
|
||||
SingleEl(UserInputAST),
|
||||
NormalDisjunctive(Vec<Vec<UserInputAST>>)
|
||||
NormalDisjunctive(Vec<Vec<UserInputAST>>),
|
||||
}
|
||||
|
||||
impl Element {
|
||||
pub fn into_dnf(self) -> Vec<Vec<UserInputAST>> {
|
||||
match self {
|
||||
Element::NormalDisjunctive(conjunctions) =>
|
||||
conjunctions,
|
||||
Element::SingleEl(el) =>
|
||||
vec!(vec!(el)),
|
||||
Element::NormalDisjunctive(conjunctions) => conjunctions,
|
||||
Element::SingleEl(el) => vec![vec![el]],
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -227,10 +225,12 @@ mod test {
|
||||
assert!(parse_to_ast().parse(query).is_err());
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast_not_op() {
|
||||
assert_eq!(format!("{:?}", parse_to_ast().parse("NOT")), "Err(UnexpectedParse)");
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("NOT")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
test_parse_query_to_ast_helper("NOTa", "\"NOTa\"");
|
||||
test_parse_query_to_ast_helper("NOT a", "-(\"a\")");
|
||||
}
|
||||
@@ -241,10 +241,22 @@ mod test {
|
||||
test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))");
|
||||
test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))");
|
||||
test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))");
|
||||
assert_eq!(format!("{:?}", parse_to_ast().parse("a OR b aaa")), "Err(UnexpectedParse)");
|
||||
assert_eq!(format!("{:?}", parse_to_ast().parse("a AND b aaa")), "Err(UnexpectedParse)");
|
||||
assert_eq!(format!("{:?}", parse_to_ast().parse("aaa a OR b ")), "Err(UnexpectedParse)");
|
||||
assert_eq!(format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")), "Err(UnexpectedParse)");
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("a OR b aaa")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("a AND b aaa")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("aaa a OR b ")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
use super::logical_ast::*;
|
||||
use super::query_grammar::parse_to_ast;
|
||||
use super::user_input_ast::*;
|
||||
use combine::Parser;
|
||||
use core::Index;
|
||||
use query::occur::compose_occur;
|
||||
use query::query_parser::logical_ast::LogicalAST;
|
||||
use query::AllQuery;
|
||||
use query::BooleanQuery;
|
||||
use query::EmptyQuery;
|
||||
use query::Occur;
|
||||
use query::occur::compose_occur;
|
||||
use query::PhraseQuery;
|
||||
use query::Query;
|
||||
use query::RangeQuery;
|
||||
@@ -18,10 +21,6 @@ use std::num::ParseIntError;
|
||||
use std::ops::Bound;
|
||||
use std::str::FromStr;
|
||||
use tokenizer::TokenizerManager;
|
||||
use combine::Parser;
|
||||
use query::EmptyQuery;
|
||||
use query::query_parser::logical_ast::LogicalAST;
|
||||
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
@@ -59,23 +58,24 @@ impl From<ParseIntError> for QueryParserError {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Recursively remove empty clause from the AST
|
||||
///
|
||||
/// Returns `None` iff the `logical_ast` ended up being empty.
|
||||
fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
|
||||
match logical_ast {
|
||||
LogicalAST::Clause(children) => {
|
||||
let trimmed_children = children.into_iter()
|
||||
.flat_map(|(occur, child)|
|
||||
trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) )
|
||||
let trimmed_children = children
|
||||
.into_iter()
|
||||
.flat_map(|(occur, child)| {
|
||||
trim_ast(child).map(|trimmed_child| (occur, trimmed_child))
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
if trimmed_children.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(LogicalAST::Clause(trimmed_children))
|
||||
}
|
||||
},
|
||||
}
|
||||
_ => Some(logical_ast),
|
||||
}
|
||||
}
|
||||
@@ -188,8 +188,9 @@ impl QueryParser {
|
||||
|
||||
/// Parse the user query into an AST.
|
||||
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
|
||||
let (user_input_ast, _remaining) =
|
||||
parse_to_ast().parse(query).map_err(|_| QueryParserError::SyntaxError)?;
|
||||
let (user_input_ast, _remaining) = parse_to_ast()
|
||||
.parse(query)
|
||||
.map_err(|_| QueryParserError::SyntaxError)?;
|
||||
self.compute_logical_ast(user_input_ast)
|
||||
}
|
||||
|
||||
@@ -291,12 +292,9 @@ impl QueryParser {
|
||||
) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
let terms = self.compute_terms_for_string(field, phrase)?;
|
||||
match &terms[..] {
|
||||
[] =>
|
||||
Ok(None),
|
||||
[(_, term)] =>
|
||||
Ok(Some(LogicalLiteral::Term(term.clone()))),
|
||||
_ =>
|
||||
Ok(Some(LogicalLiteral::Phrase(terms.clone()))),
|
||||
[] => Ok(None),
|
||||
[(_, term)] => Ok(Some(LogicalLiteral::Term(term.clone()))),
|
||||
_ => Ok(Some(LogicalLiteral::Phrase(terms.clone()))),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -308,7 +306,11 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_bound(&self, field: Field, bound: &UserInputBound) -> Result<Bound<Term>, QueryParserError> {
|
||||
fn resolve_bound(
|
||||
&self,
|
||||
field: Field,
|
||||
bound: &UserInputBound,
|
||||
) -> Result<Bound<Term>, QueryParserError> {
|
||||
if bound.term_str() == "*" {
|
||||
return Ok(Bound::Unbounded);
|
||||
}
|
||||
@@ -355,18 +357,21 @@ impl QueryParser {
|
||||
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
|
||||
}
|
||||
UserInputAST::Unary(left_occur, subquery) => {
|
||||
let (right_occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
|
||||
let (right_occur, logical_sub_queries) =
|
||||
self.compute_logical_ast_with_occur(*subquery)?;
|
||||
Ok((compose_occur(left_occur, right_occur), logical_sub_queries))
|
||||
}
|
||||
UserInputAST::Leaf(leaf) => {
|
||||
UserInputAST::Leaf(leaf) => {
|
||||
let result_ast = self.compute_logical_ast_from_leaf(*leaf)?;
|
||||
Ok((Occur::Should, result_ast))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn compute_logical_ast_from_leaf(&self, leaf: UserInputLeaf) -> Result<LogicalAST, QueryParserError> {
|
||||
fn compute_logical_ast_from_leaf(
|
||||
&self,
|
||||
leaf: UserInputLeaf,
|
||||
) -> Result<LogicalAST, QueryParserError> {
|
||||
match leaf {
|
||||
UserInputLeaf::Literal(literal) => {
|
||||
let term_phrases: Vec<(Field, String)> = match literal.field_name {
|
||||
@@ -391,21 +396,19 @@ impl QueryParser {
|
||||
asts.push(LogicalAST::Leaf(Box::new(ast)));
|
||||
}
|
||||
}
|
||||
let result_ast: LogicalAST =
|
||||
if asts.len() == 1 {
|
||||
asts.into_iter().next().unwrap()
|
||||
} else {
|
||||
LogicalAST::Clause(
|
||||
asts.into_iter()
|
||||
.map(|ast| (Occur::Should, ast))
|
||||
.collect())
|
||||
};
|
||||
let result_ast: LogicalAST = if asts.len() == 1 {
|
||||
asts.into_iter().next().unwrap()
|
||||
} else {
|
||||
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
|
||||
};
|
||||
Ok(result_ast)
|
||||
}
|
||||
UserInputLeaf::All => {
|
||||
Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All)))
|
||||
}
|
||||
UserInputLeaf::Range { field, lower, upper } => {
|
||||
UserInputLeaf::All => Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All))),
|
||||
UserInputLeaf::Range {
|
||||
field,
|
||||
lower,
|
||||
upper,
|
||||
} => {
|
||||
let fields = self.resolved_fields(&field)?;
|
||||
let mut clauses = fields
|
||||
.iter()
|
||||
@@ -433,14 +436,15 @@ impl QueryParser {
|
||||
Ok(result_ast)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
|
||||
match logical_literal {
|
||||
LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
|
||||
LogicalLiteral::Phrase(term_with_offsets) => Box::new(PhraseQuery::new_with_offset(term_with_offsets)),
|
||||
LogicalLiteral::Phrase(term_with_offsets) => {
|
||||
Box::new(PhraseQuery::new_with_offset(term_with_offsets))
|
||||
}
|
||||
LogicalLiteral::Range {
|
||||
field,
|
||||
value_type,
|
||||
@@ -458,11 +462,16 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
|
||||
.into_iter()
|
||||
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
|
||||
.collect::<Vec<_>>();
|
||||
assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming");
|
||||
assert!(
|
||||
!occur_subqueries.is_empty(),
|
||||
"Should not be empty after trimming"
|
||||
);
|
||||
Box::new(BooleanQuery::from(occur_subqueries))
|
||||
},
|
||||
Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal),
|
||||
None => Box::new(EmptyQuery)
|
||||
}
|
||||
Some(LogicalAST::Leaf(trimmed_logical_literal)) => {
|
||||
convert_literal_to_query(*trimmed_logical_literal)
|
||||
}
|
||||
None => Box::new(EmptyQuery),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -475,7 +484,7 @@ mod test {
|
||||
use schema::Field;
|
||||
use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
|
||||
use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
|
||||
use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager};
|
||||
use tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, Tokenizer, TokenizerManager};
|
||||
use Index;
|
||||
|
||||
fn make_query_parser() -> QueryParser {
|
||||
@@ -498,9 +507,11 @@ mod test {
|
||||
let schema = schema_builder.build();
|
||||
let default_fields = vec![title, text];
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register("en_with_stop_words", SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec!["the".to_string()]))
|
||||
tokenizer_manager.register(
|
||||
"en_with_stop_words",
|
||||
SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec!["the".to_string()])),
|
||||
);
|
||||
QueryParser::new(schema, default_fields, tokenizer_manager)
|
||||
}
|
||||
@@ -571,16 +582,8 @@ mod test {
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_empty() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"",
|
||||
"<emptyclause>",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
" ",
|
||||
"<emptyclause>",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper("", "<emptyclause>", false);
|
||||
test_parse_query_to_logical_ast_helper(" ", "<emptyclause>", false);
|
||||
let query_parser = make_query_parser();
|
||||
let query_result = query_parser.parse_query("");
|
||||
let query = query_result.unwrap();
|
||||
@@ -693,11 +696,7 @@ mod test {
|
||||
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO Unbounded)",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"*",
|
||||
"*",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper("*", "*", false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -16,9 +16,7 @@ pub enum UserInputLeaf {
|
||||
impl Debug for UserInputLeaf {
|
||||
fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
match self {
|
||||
UserInputLeaf::Literal(literal) => {
|
||||
literal.fmt(formatter)
|
||||
}
|
||||
UserInputLeaf::Literal(literal) => literal.fmt(formatter),
|
||||
UserInputLeaf::Range {
|
||||
ref field,
|
||||
ref lower,
|
||||
@@ -82,13 +80,12 @@ impl UserInputBound {
|
||||
pub enum UserInputAST {
|
||||
Clause(Vec<UserInputAST>),
|
||||
Unary(Occur, Box<UserInputAST>),
|
||||
// Not(Box<UserInputAST>),
|
||||
// Should(Box<UserInputAST>),
|
||||
// Must(Box<UserInputAST>),
|
||||
// Not(Box<UserInputAST>),
|
||||
// Should(Box<UserInputAST>),
|
||||
// Must(Box<UserInputAST>),
|
||||
Leaf(Box<UserInputLeaf>),
|
||||
}
|
||||
|
||||
|
||||
impl UserInputAST {
|
||||
pub fn unary(self, occur: Occur) -> UserInputAST {
|
||||
UserInputAST::Unary(occur, Box::new(self))
|
||||
@@ -100,12 +97,10 @@ impl UserInputAST {
|
||||
if asts.len() == 1 {
|
||||
asts.into_iter().next().unwrap() //< safe
|
||||
} else {
|
||||
UserInputAST::Clause(asts
|
||||
.into_iter()
|
||||
.map(|ast: UserInputAST|
|
||||
ast.unary(occur)
|
||||
)
|
||||
.collect::<Vec<_>>()
|
||||
UserInputAST::Clause(
|
||||
asts.into_iter()
|
||||
.map(|ast: UserInputAST| ast.unary(occur))
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -117,11 +112,8 @@ impl UserInputAST {
|
||||
pub fn or(asts: Vec<UserInputAST>) -> UserInputAST {
|
||||
UserInputAST::compose(Occur::Should, asts)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
impl UserInputAST {
|
||||
|
||||
|
||||
@@ -274,7 +274,6 @@ impl RangeWeight {
|
||||
}
|
||||
|
||||
impl Weight for RangeWeight {
|
||||
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
let max_doc = reader.max_doc();
|
||||
let mut doc_bitset = BitSet::with_max_value(max_doc);
|
||||
@@ -370,9 +369,7 @@ mod tests {
|
||||
let searcher = index.searcher();
|
||||
let count_multiples = |range_query: RangeQuery| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
range_query
|
||||
.search(&searcher, &mut count_collector)
|
||||
.unwrap();
|
||||
range_query.search(&searcher, &mut count_collector).unwrap();
|
||||
count_collector.count()
|
||||
};
|
||||
|
||||
|
||||
@@ -50,7 +50,6 @@ impl Scorer for Box<Scorer> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Wraps a `DocSet` and simply returns a constant `Scorer`.
|
||||
/// The `ConstScorer` is useful if you have a `DocSet` where
|
||||
/// you needed a scorer.
|
||||
|
||||
@@ -3,10 +3,10 @@ use query::bm25::BM25Weight;
|
||||
use query::Query;
|
||||
use query::Weight;
|
||||
use schema::IndexRecordOption;
|
||||
use std::collections::BTreeSet;
|
||||
use Result;
|
||||
use Searcher;
|
||||
use Term;
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
/// A Term query matches all of the documents
|
||||
/// containing a specific term.
|
||||
|
||||
@@ -444,7 +444,10 @@ mod tests {
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
|
||||
assert_eq!(doc.get_first(author_field).unwrap().text(), Some("fulmicoton"));
|
||||
assert_eq!(
|
||||
doc.get_first(author_field).unwrap().text(),
|
||||
Some("fulmicoton")
|
||||
);
|
||||
assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
|
||||
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
|
||||
}
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use htmlescape::encode_minimal;
|
||||
use std::collections::BTreeMap;
|
||||
use tokenizer::{Token, TokenStream};
|
||||
use Result;
|
||||
use query::Query;
|
||||
use Searcher;
|
||||
use schema::Field;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::{Token, TokenStream};
|
||||
use Document;
|
||||
use std::cmp::Ordering;
|
||||
use Result;
|
||||
use Searcher;
|
||||
|
||||
const DEFAULT_MAX_NUM_CHARS: usize = 150;
|
||||
|
||||
@@ -75,11 +75,10 @@ const HIGHLIGHTEN_PREFIX: &str = "<b>";
|
||||
const HIGHLIGHTEN_POSTFIX: &str = "</b>";
|
||||
|
||||
impl Snippet {
|
||||
|
||||
pub fn empty() -> Snippet {
|
||||
Snippet {
|
||||
fragments: String::new(),
|
||||
highlighted: Vec::new()
|
||||
highlighted: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,16 +156,17 @@ fn select_best_fragment_combination<'a>(
|
||||
fragments: Vec<FragmentCandidate>,
|
||||
text: &'a str,
|
||||
) -> Snippet {
|
||||
let best_fragment_opt = fragments
|
||||
.iter()
|
||||
.max_by(|left, right| {
|
||||
let cmp_score = left.score.partial_cmp(&right.score).unwrap_or(Ordering::Equal);
|
||||
if cmp_score == Ordering::Equal {
|
||||
(right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset))
|
||||
} else {
|
||||
cmp_score
|
||||
}
|
||||
});
|
||||
let best_fragment_opt = fragments.iter().max_by(|left, right| {
|
||||
let cmp_score = left
|
||||
.score
|
||||
.partial_cmp(&right.score)
|
||||
.unwrap_or(Ordering::Equal);
|
||||
if cmp_score == Ordering::Equal {
|
||||
(right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset))
|
||||
} else {
|
||||
cmp_score
|
||||
}
|
||||
});
|
||||
if let Some(fragment) = best_fragment_opt {
|
||||
let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
|
||||
let highlighted = fragment
|
||||
@@ -177,7 +177,8 @@ fn select_best_fragment_combination<'a>(
|
||||
item.start - fragment.start_offset,
|
||||
item.stop - fragment.start_offset,
|
||||
)
|
||||
}).collect();
|
||||
})
|
||||
.collect();
|
||||
Snippet {
|
||||
fragments: fragment_text.to_string(),
|
||||
highlighted: highlighted,
|
||||
@@ -239,17 +240,16 @@ pub struct SnippetGenerator {
|
||||
terms_text: BTreeMap<String, f32>,
|
||||
tokenizer: Box<BoxedTokenizer>,
|
||||
field: Field,
|
||||
max_num_chars: usize
|
||||
max_num_chars: usize,
|
||||
}
|
||||
|
||||
impl SnippetGenerator {
|
||||
/// Creates a new snippet generator
|
||||
pub fn new(searcher: &Searcher,
|
||||
query: &Query,
|
||||
field: Field) -> Result<SnippetGenerator> {
|
||||
pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
|
||||
let mut terms = BTreeSet::new();
|
||||
query.query_terms(&mut terms);
|
||||
let terms_text: BTreeMap<String, f32> = terms.into_iter()
|
||||
let terms_text: BTreeMap<String, f32> = terms
|
||||
.into_iter()
|
||||
.filter(|term| term.field() == field)
|
||||
.map(|term| (term.text().to_string(), 1f32))
|
||||
.collect();
|
||||
@@ -258,7 +258,7 @@ impl SnippetGenerator {
|
||||
terms_text,
|
||||
tokenizer,
|
||||
field,
|
||||
max_num_chars: DEFAULT_MAX_NUM_CHARS
|
||||
max_num_chars: DEFAULT_MAX_NUM_CHARS,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -272,7 +272,8 @@ impl SnippetGenerator {
|
||||
/// This method extract the text associated to the `SnippetGenerator`'s field
|
||||
/// and computes a snippet.
|
||||
pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
|
||||
let text: String = doc.get_all(self.field)
|
||||
let text: String = doc
|
||||
.get_all(self.field)
|
||||
.into_iter()
|
||||
.flat_map(|val| val.text())
|
||||
.collect::<Vec<&str>>()
|
||||
@@ -282,10 +283,12 @@ impl SnippetGenerator {
|
||||
|
||||
/// Generates a snippet for the given text.
|
||||
pub fn snippet(&self, text: &str) -> Snippet {
|
||||
let fragment_candidates = search_fragments(&*self.tokenizer,
|
||||
&text,
|
||||
&self.terms_text,
|
||||
self.max_num_chars);
|
||||
let fragment_candidates = search_fragments(
|
||||
&*self.tokenizer,
|
||||
&text,
|
||||
&self.terms_text,
|
||||
self.max_num_chars,
|
||||
);
|
||||
select_best_fragment_combination(fragment_candidates, &text)
|
||||
}
|
||||
}
|
||||
@@ -293,16 +296,16 @@ impl SnippetGenerator {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{search_fragments, select_best_fragment_combination};
|
||||
use query::QueryParser;
|
||||
use schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions};
|
||||
use std::collections::BTreeMap;
|
||||
use std::iter::Iterator;
|
||||
use tokenizer::{box_tokenizer, SimpleTokenizer};
|
||||
use Index;
|
||||
use schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing};
|
||||
use SnippetGenerator;
|
||||
use query::QueryParser;
|
||||
|
||||
|
||||
const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by Mozilla which
|
||||
const TEST_TEXT: &'static str =
|
||||
r#"Rust is a systems programming language sponsored by Mozilla which
|
||||
describes it as a "safe, concurrent, practical language", supporting functional and
|
||||
imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],
|
||||
but its designers intend it to provide better memory safety while still maintaining
|
||||
@@ -431,7 +434,7 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
let text = "a b c d";
|
||||
|
||||
let terms = BTreeMap::new();
|
||||
let terms = BTreeMap::new();
|
||||
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
@@ -442,12 +445,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_generator() {
|
||||
let mut schema_builder = SchemaBuilder::default ();
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(TextFieldIndexing::default()
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_options = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_tokenizer("en_stem")
|
||||
.set_index_option(IndexRecordOption::Basic)
|
||||
);
|
||||
.set_index_option(IndexRecordOption::Basic),
|
||||
);
|
||||
let text_field = schema_builder.add_text_field("text", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -474,6 +477,5 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let snippet = snippet_generator.snippet(TEST_TEXT);
|
||||
assert_eq!(snippet.to_html(), "<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,7 +109,13 @@ pub mod tests {
|
||||
let store = StoreReader::from_source(store_source);
|
||||
for i in 0..1_000 {
|
||||
assert_eq!(
|
||||
*store.get(i).unwrap().get_first(field_title).unwrap().text().unwrap(),
|
||||
*store
|
||||
.get(i)
|
||||
.unwrap()
|
||||
.get_first(field_title)
|
||||
.unwrap()
|
||||
.text()
|
||||
.unwrap(),
|
||||
format!("Doc {}", i)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -72,7 +72,8 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
let mut skip_pointer = self.data_layer.insert(key, dest)?;
|
||||
loop {
|
||||
skip_pointer = match skip_pointer {
|
||||
Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id)
|
||||
Some((skip_doc_id, skip_offset)) => self
|
||||
.get_skip_layer(layer_id)
|
||||
.insert(skip_doc_id, &skip_offset)?,
|
||||
None => {
|
||||
return Ok(());
|
||||
|
||||
@@ -59,7 +59,6 @@ impl TermInfoBlockMeta {
|
||||
}
|
||||
|
||||
fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo {
|
||||
|
||||
let num_bits = self.num_bits() as usize;
|
||||
let mut cursor = num_bits * inner_offset;
|
||||
|
||||
|
||||
@@ -164,7 +164,8 @@ impl TermDictionary {
|
||||
let fst = self.fst_index.as_fst();
|
||||
let mut node = fst.root();
|
||||
while ord != 0 || !node.is_final() {
|
||||
if let Some(transition) = node.transitions()
|
||||
if let Some(transition) = node
|
||||
.transitions()
|
||||
.take_while(|transition| transition.out.value() <= ord)
|
||||
.last()
|
||||
{
|
||||
|
||||
@@ -31,7 +31,6 @@ fn to_lowercase_unicode(text: &mut String, output: &mut String) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
@@ -50,7 +49,7 @@ where
|
||||
// fast track for ascii.
|
||||
self.token_mut().text.make_ascii_lowercase();
|
||||
} else {
|
||||
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
|
||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
}
|
||||
@@ -68,41 +67,43 @@ where
|
||||
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
|
||||
LowerCaserTokenStream {
|
||||
tail,
|
||||
buffer: String::with_capacity(100)
|
||||
buffer: String::with_capacity(100),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tokenizer::Tokenizer;
|
||||
use tokenizer::LowerCaser;
|
||||
use tokenizer::TokenStream;
|
||||
use tokenizer::SimpleTokenizer;
|
||||
use tokenizer::TokenStream;
|
||||
use tokenizer::Tokenizer;
|
||||
|
||||
#[test]
|
||||
fn test_to_lower_case() {
|
||||
assert_eq!(lowercase_helper("Русский текст"),
|
||||
vec!["русский".to_string(), "текст".to_string()]);
|
||||
assert_eq!(
|
||||
lowercase_helper("Русский текст"),
|
||||
vec!["русский".to_string(), "текст".to_string()]
|
||||
);
|
||||
}
|
||||
|
||||
fn lowercase_helper(text: &str) -> Vec<String> {
|
||||
let mut tokens = vec![];
|
||||
let mut token_stream = SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.token_stream(text);
|
||||
let mut token_stream = SimpleTokenizer.filter(LowerCaser).token_stream(text);
|
||||
while token_stream.advance() {
|
||||
let token_text = token_stream.token().text.clone();
|
||||
tokens.push(token_text);
|
||||
}
|
||||
tokens
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lowercaser() {
|
||||
assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
|
||||
assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
|
||||
assert_eq!(
|
||||
lowercase_helper("Русский"),
|
||||
vec!["русский".to_string()]
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,8 +151,8 @@ pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::stemmer::Stemmer;
|
||||
pub use self::stop_word_filter::StopWordFilter;
|
||||
pub(crate) use self::token_stream_chain::TokenStreamChain;
|
||||
pub use self::tokenizer::BoxedTokenizer;
|
||||
pub(crate) use self::tokenizer::box_tokenizer;
|
||||
pub use self::tokenizer::BoxedTokenizer;
|
||||
|
||||
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
pub use self::tokenizer_manager::TokenizerManager;
|
||||
|
||||
@@ -18,7 +18,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
|
||||
offset_to: text.len(),
|
||||
position: 0,
|
||||
text: text.to_string(),
|
||||
position_length: 1
|
||||
position_length: 1,
|
||||
};
|
||||
RawTokenStream {
|
||||
token,
|
||||
|
||||
@@ -71,13 +71,16 @@ where
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::POSITION_GAP;
|
||||
use super::super::{SimpleTokenizer, TokenStream, Tokenizer};
|
||||
use super::TokenStreamChain;
|
||||
use super::super::{Tokenizer, TokenStream, SimpleTokenizer};
|
||||
use super::POSITION_GAP;
|
||||
|
||||
#[test]
|
||||
fn test_chain_first_emits_no_tokens() {
|
||||
let token_streams = vec![SimpleTokenizer.token_stream(""), SimpleTokenizer.token_stream("hello world")];
|
||||
let token_streams = vec![
|
||||
SimpleTokenizer.token_stream(""),
|
||||
SimpleTokenizer.token_stream("hello world"),
|
||||
];
|
||||
let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams);
|
||||
|
||||
assert!(token_chain.advance());
|
||||
@@ -91,8 +94,8 @@ mod tests {
|
||||
assert_eq!(token_chain.token().offset_from, 6);
|
||||
assert_eq!(token_chain.token().offset_to, 11);
|
||||
assert_eq!(token_chain.token().position, POSITION_GAP);
|
||||
|
||||
|
||||
assert!(!token_chain.advance());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -276,7 +276,7 @@ mod test {
|
||||
offset_from: 2,
|
||||
offset_to: 3,
|
||||
text: "abc".to_string(),
|
||||
position_length: 1
|
||||
position_length: 1,
|
||||
};
|
||||
let t2 = t1.clone();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user