From 07d87e154bdbe6dc4548ea9aef773846a8b1322e Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 30 Nov 2018 22:46:59 +0900 Subject: [PATCH] Collector refactoring and multithreaded search (#437) * Split Collector into an overall Collector and a per-segment SegmentCollector. Precursor to cross-segment parallelism, and as a side benefit cleans up any per-segment fields from being Option to just T. * Attempt to add MultiCollector back * working. Chained collector is broken though * Fix chained collector * Fix test * Make Weight Send+Sync for parallelization purposes * Expose parameters of RangeQuery for external usage * Removed &mut self * fixing tests * Restored TestCollectors * blop * multicollector working * chained collector working * test broken * fixing unit test * blop * blop * Blop * simplifying APi * blop * better syntax * Simplifying top_collector * refactoring * blop * Sync with master * Added multithread search * Collector refactoring * Schema::builder * CR and rustdoc * CR comments * blop * Added an executor * Sorted the segment readers in the searcher * Update searcher.rs * Fixed unit testst * changed the place where we have the sort-segment-by-count heuristic * using crossbeam::channel * inlining * Comments about panics propagating * Added unit test for executor panicking * Readded default * Removed Default impl * Added unit test for executor --- CHANGELOG.md | 8 + Cargo.toml | 3 +- examples/basic_search.rs | 17 +- examples/custom_collector.rs | 189 +++++++++ examples/custom_tokenizer.rs | 10 +- examples/deleting_updating_documents.rs | 9 +- examples/faceted_search.rs | 7 +- examples/iterating_docs_and_positions.rs | 2 +- examples/snippet.rs | 11 +- examples/stop_words.rs | 14 +- examples/working_with_json.rs | 2 +- src/collector/chained_collector.rs | 142 ------- src/collector/count_collector.rs | 102 +++-- src/collector/facet_collector.rs | 319 ++++++-------- src/collector/int_facet_collector.rs | 2 +- src/collector/mod.rs | 505 
++++++++++++++--------- src/collector/multi_collector.rs | 267 +++++++++--- src/collector/tests.rs | 202 +++++++++ src/collector/top_collector.rs | 217 +++++----- src/collector/top_field_collector.rs | 236 +++++------ src/collector/top_score_collector.rs | 204 ++++----- src/core/executor.rs | 104 +++++ src/core/index.rs | 37 +- src/core/index_meta.rs | 4 +- src/core/mod.rs | 2 + src/core/searcher.rs | 82 +++- src/core/segment_reader.rs | 28 +- src/error.rs | 25 +- src/fastfield/bytes/mod.rs | 4 +- src/fastfield/mod.rs | 8 +- src/fastfield/multivalued/mod.rs | 6 +- src/fastfield/multivalued/reader.rs | 4 +- src/fastfield/reader.rs | 4 +- src/fieldnorm/mod.rs | 2 +- src/functional_test.rs | 2 +- src/indexer/index_writer.rs | 24 +- src/indexer/merger.rs | 128 +++--- src/indexer/segment_updater.rs | 22 +- src/lib.rs | 207 ++++------ src/macros.rs | 10 +- src/postings/mod.rs | 14 +- src/postings/segment_postings.rs | 6 +- src/query/all_query.rs | 4 +- src/query/automaton_weight.rs | 7 +- src/query/boolean_query/mod.rs | 17 +- src/query/fuzzy_query.rs | 36 +- src/query/mod.rs | 4 +- src/query/phrase_query/mod.rs | 73 ++-- src/query/query.rs | 42 +- src/query/query_parser/query_parser.rs | 8 +- src/query/range_query.rs | 38 +- src/query/regex_query.rs | 44 +- src/query/scorer.rs | 25 +- src/query/term_query/mod.rs | 38 +- src/query/term_query/term_query.rs | 26 +- src/query/weight.rs | 2 +- src/schema/document.rs | 2 +- src/schema/mod.rs | 14 +- src/schema/schema.rs | 26 +- src/schema/term.rs | 2 +- src/schema/text_options.rs | 2 +- src/snippet/mod.rs | 10 +- src/space_usage/mod.rs | 12 +- src/store/mod.rs | 4 +- src/termdict/mod.rs | 4 +- src/tokenizer/mod.rs | 10 +- 66 files changed, 2111 insertions(+), 1530 deletions(-) create mode 100644 examples/custom_collector.rs delete mode 100644 src/collector/chained_collector.rs create mode 100644 src/collector/tests.rs create mode 100644 src/core/executor.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index b0981bf8b..a7a1b16ba 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ +Tantivy 0.8.1 +===================== +*No change in the index format* +- API Breaking change in the collector API. (@jwolfe, @fulmicoton) +- Multithreaded search (@jwolfe, @fulmicoton) + + Tantivy 0.7.1 ===================== +*No change in the index format* - Bugfix: NGramTokenizer panics on non ascii chars - Added a space usage API diff --git a/Cargo.toml b/Cargo.toml index 4a3ddf246..df8474ce1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tantivy" -version = "0.7.1" +version = "0.8.0-dev" authors = ["Paul Masurel "] license = "MIT" categories = ["database-implementations", "data-structures"] @@ -48,6 +48,7 @@ owned-read = "0.4" failure = "0.1" htmlescape = "0.3.1" fail = "0.2" +scoped-pool = "0.1" [target.'cfg(windows)'.dependencies] winapi = "0.2" diff --git a/examples/basic_search.rs b/examples/basic_search.rs index 00576be51..f96fedf12 100644 --- a/examples/basic_search.rs +++ b/examples/basic_search.rs @@ -16,10 +16,11 @@ extern crate tempdir; // Importing tantivy... #[macro_use] extern crate tantivy; -use tantivy::collector::TopCollector; +use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::*; use tantivy::Index; +use tempdir::TempDir; fn main() -> tantivy::Result<()> { // Let's create a temporary directory for the @@ -34,7 +35,7 @@ fn main() -> tantivy::Result<()> { // be indexed". // first we need to define a schema ... - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); // Our first field is title. // We want full-text search for it, and we also want @@ -212,15 +213,10 @@ fn main() -> tantivy::Result<()> { // // We are not interested in all of the documents but // only in the top 10. Keeping track of our top 10 best documents - // is the role of the TopCollector. - let mut top_collector = TopCollector::with_limit(10); + // is the role of the TopDocs. // We can now perform our query. 
- searcher.search(&*query, &mut top_collector)?; - - // Our top collector now contains the 10 - // most relevant doc ids... - let doc_addresses = top_collector.docs(); + let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; // The actual documents still need to be // retrieved from Tantivy's store. @@ -229,7 +225,7 @@ fn main() -> tantivy::Result<()> { // the document returned will only contain // a title. - for doc_address in doc_addresses { + for (_score, doc_address) in top_docs { let retrieved_doc = searcher.doc(doc_address)?; println!("{}", schema.to_json(&retrieved_doc)); } @@ -237,4 +233,3 @@ fn main() -> tantivy::Result<()> { Ok(()) } -use tempdir::TempDir; diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs new file mode 100644 index 000000000..5af31ceba --- /dev/null +++ b/examples/custom_collector.rs @@ -0,0 +1,189 @@ +// # Custom collector example +// +// This example shows how you can implement your own +// collector. As an example, we will compute a collector +// that computes the standard deviation of a given fast field. +// +// Of course, you can have a look at the tantivy's built-in collectors +// such as the `CountCollector` for more examples. + +extern crate tempdir; + +// --- +// Importing tantivy... 
+#[macro_use]
+extern crate tantivy;
+use tantivy::query::QueryParser;
+use tantivy::schema::{FAST, TEXT, INT_INDEXED, Schema};
+use tantivy::Index;
+use tantivy::collector::{Collector, SegmentCollector};
+use tantivy::SegmentReader;
+use tantivy::schema::Field;
+use tantivy::fastfield::FastFieldReader;
+
+#[derive(Default)]
+struct Stats {
+    count: usize,
+    sum: f64,
+    squared_sum: f64
+}
+
+impl Stats {
+
+    pub fn count(&self) -> usize {
+        self.count
+    }
+
+    pub fn mean(&self) -> f64 {
+        self.sum / (self.count as f64)
+    }
+
+    fn square_mean(&self) -> f64 {
+        self.squared_sum / (self.count as f64)
+    }
+
+    pub fn standard_deviation(&self) -> f64 {
+        let mean = self.mean();
+        (self.square_mean() - mean * mean).sqrt()
+    }
+
+    fn non_zero_count(self) -> Option<Stats> {
+        if self.count == 0 {
+            None
+        } else {
+            Some(self)
+        }
+    }
+}
+
+
+
+struct StatsCollector {
+    field: Field
+}
+
+impl StatsCollector {
+    fn with_field(field: Field) -> StatsCollector {
+        StatsCollector { field }
+    }
+}
+
+impl Collector for StatsCollector {
+    // That's the type of our result.
+    // Our standard deviation will be a float.
+    type Fruit = Option<Stats>;
+
+    type Child = StatsSegmentCollector;
+
+    fn for_segment(&self, _segment_local_id: u32, segment: &SegmentReader) -> tantivy::Result<StatsSegmentCollector> {
+        let fast_field_reader = segment.fast_field_reader(self.field)?;
+        Ok(StatsSegmentCollector {
+            fast_field_reader,
+            stats: Stats::default()
+        })
+    }
+
+    fn requires_scoring(&self) -> bool {
+        // this collector does not care about score.
+        false
+    }
+
+    fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
+        let mut stats = Stats::default();
+        for segment_stats_opt in segment_stats {
+            if let Some(segment_stats) = segment_stats_opt {
+                stats.count += segment_stats.count;
+                stats.sum += segment_stats.sum;
+                stats.squared_sum += segment_stats.squared_sum;
+            }
+        }
+        Ok(stats.non_zero_count())
+    }
+}
+
+
+struct StatsSegmentCollector {
+    fast_field_reader: FastFieldReader<u64>,
+    stats: Stats,
+}
+
+impl SegmentCollector for StatsSegmentCollector {
+    type Fruit = Option<Stats>;
+
+    fn collect(&mut self, doc: u32, _score: f32) {
+        let value = self.fast_field_reader.get(doc) as f64;
+        self.stats.count += 1;
+        self.stats.sum += value;
+        self.stats.squared_sum += value * value;
+    }
+
+    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
+        self.stats.non_zero_count()
+    }
+}
+
+
+fn main() -> tantivy::Result<()> {
+    // # Defining the schema
+    //
+    // The Tantivy index requires a very strict schema.
+    // The schema declares which fields are in the index,
+    // and for each field, its type and "the way it should
+    // be indexed".
+
+    // first we need to define a schema ...
+    let mut schema_builder = Schema::builder();
+
+    // We'll assume a fictional index containing
+    // products, and with a name, a description, and a price.
+    let product_name = schema_builder.add_text_field("name", TEXT);
+    let product_description = schema_builder.add_text_field("description", TEXT);
+    let price = schema_builder.add_u64_field("price", INT_INDEXED | FAST);
+    let schema = schema_builder.build();
+
+    // # Indexing documents
+    //
+    // Lets index a bunch of fake documents for the sake of
+    // this example.
+    let index = Index::create_in_ram(schema.clone());
+
+    let mut index_writer = index.writer(50_000_000)?;
+    index_writer.add_document(doc!(
+        product_name => "Super Broom 2000",
+        product_description => "While it is ok for short distance travel, this broom \
+        was designed for quidditch.
It will up your game.",
+        price => 30_200u64
+    ));
+    index_writer.add_document(doc!(
+        product_name => "Turbulobroom",
+        product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
+        You'll enjoy its sharp turns, and rapid acceleration",
+        price => 29_240u64
+    ));
+    index_writer.add_document(doc!(
+        product_name => "Broomio",
+        product_description => "Great value for the price. This broom is a market favorite",
+        price => 21_240u64
+    ));
+    index_writer.add_document(doc!(
+        product_name => "Whack a Mole",
+        product_description => "Prime quality bat.",
+        price => 5_200u64
+    ));
+    index_writer.commit()?;
+    index.load_searchers()?;
+
+    let searcher = index.searcher();
+    let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);
+
+    // here we want to compute the stats over every product mentioning "broom"
+    let query = query_parser.parse_query("broom")?;
+    if let Some(stats) = searcher.search(&query, &StatsCollector::with_field(price))? {
+        println!("count: {}", stats.count());
+        println!("mean: {}", stats.mean());
+        println!("standard deviation: {}", stats.standard_deviation());
+    }
+
+    Ok(())
+}
+
diff --git a/examples/custom_tokenizer.rs b/examples/custom_tokenizer.rs
index 08236c0e5..8c4ce8e06 100644
--- a/examples/custom_tokenizer.rs
+++ b/examples/custom_tokenizer.rs
@@ -5,7 +5,7 @@
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopCollector;
+use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::tokenizer::NgramTokenizer;
@@ -20,7 +20,7 @@ fn main() -> tantivy::Result<()> {
     // be indexed".
 
     // first we need to define a schema ...
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
 
     // Our first field is title.
// In this example we want to use NGram searching @@ -104,11 +104,9 @@ fn main() -> tantivy::Result<()> { // here we want to get a hit on the 'ken' in Frankenstein let query = query_parser.parse_query("ken")?; - let mut top_collector = TopCollector::with_limit(10); - searcher.search(&*query, &mut top_collector)?; + let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; - let doc_addresses = top_collector.docs(); - for doc_address in doc_addresses { + for (_, doc_address) in top_docs { let retrieved_doc = searcher.doc(doc_address)?; println!("{}", schema.to_json(&retrieved_doc)); } diff --git a/examples/deleting_updating_documents.rs b/examples/deleting_updating_documents.rs index afae85685..ed59fa8c8 100644 --- a/examples/deleting_updating_documents.rs +++ b/examples/deleting_updating_documents.rs @@ -10,7 +10,7 @@ // Importing tantivy... #[macro_use] extern crate tantivy; -use tantivy::collector::TopCollector; +use tantivy::collector::TopDocs; use tantivy::query::TermQuery; use tantivy::schema::*; use tantivy::Index; @@ -27,10 +27,9 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result tantivy::Result<()> { // // Check out the *basic_search* example if this makes // small sense to you. - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); // Tantivy does not really have a notion of primary id. // This may change in the future. 
diff --git a/examples/faceted_search.rs b/examples/faceted_search.rs index 24fd536e8..9d68f2a4e 100644 --- a/examples/faceted_search.rs +++ b/examples/faceted_search.rs @@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> { // Let's create a temporary directory for the // sake of this example let index_path = TempDir::new("tantivy_facet_example_dir")?; - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); schema_builder.add_text_field("name", TEXT | STORED); @@ -62,11 +62,10 @@ fn main() -> tantivy::Result<()> { let mut facet_collector = FacetCollector::for_field(tags); facet_collector.add_facet("/pools"); - searcher.search(&AllQuery, &mut facet_collector).unwrap(); + let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap(); - let counts = facet_collector.harvest(); // This lists all of the facet counts - let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect(); + let facets: Vec<(&Facet, u64)> = facet_counts.get("/pools").collect(); assert_eq!( facets, vec![ diff --git a/examples/iterating_docs_and_positions.rs b/examples/iterating_docs_and_positions.rs index 0434f58c8..62513ea7a 100644 --- a/examples/iterating_docs_and_positions.rs +++ b/examples/iterating_docs_and_positions.rs @@ -18,7 +18,7 @@ use tantivy::{DocId, DocSet, Postings}; fn main() -> tantivy::Result<()> { // We first create a schema for the sake of the // example. Check the `basic_search` example for more information. - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); // For this example, we need to make sure to index positions for our title // field. `TEXT` precisely does this. diff --git a/examples/snippet.rs b/examples/snippet.rs index ecc9481ed..a91a44f08 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -10,7 +10,7 @@ extern crate tempdir; // Importing tantivy... 
#[macro_use] extern crate tantivy; -use tantivy::collector::TopCollector; +use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::*; use tantivy::Index; @@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> { let index_path = TempDir::new("tantivy_example_dir")?; // # Defining the schema - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let title = schema_builder.add_text_field("title", TEXT | STORED); let body = schema_builder.add_text_field("body", TEXT | STORED); let schema = schema_builder.build(); @@ -54,15 +54,14 @@ fn main() -> tantivy::Result<()> { let query_parser = QueryParser::for_index(&index, vec![title, body]); let query = query_parser.parse_query("sycamore spring")?; - let mut top_collector = TopCollector::with_limit(10); - searcher.search(&*query, &mut top_collector)?; + let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?; - let doc_addresses = top_collector.docs(); - for doc_address in doc_addresses { + for (score, doc_address) in top_docs { let doc = searcher.doc(doc_address)?; let snippet = snippet_generator.snippet_from_doc(&doc); + println!("Document score {}:", score); println!("title: {}", doc.get_first(title).unwrap().text().unwrap()); println!("snippet: {}", snippet.to_html()); } diff --git a/examples/stop_words.rs b/examples/stop_words.rs index 80e78ece2..f7318cd8b 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -15,7 +15,7 @@ extern crate tempdir; // Importing tantivy... 
#[macro_use] extern crate tantivy; -use tantivy::collector::TopCollector; +use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::*; use tantivy::tokenizer::*; @@ -23,7 +23,7 @@ use tantivy::Index; fn main() -> tantivy::Result<()> { // this example assumes you understand the content in `basic_search` - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); // This configures your custom options for how tantivy will // store and process your content in the index; The key @@ -105,15 +105,11 @@ fn main() -> tantivy::Result<()> { // stop words are applied on the query as well. // The following will be equivalent to `title:frankenstein` let query = query_parser.parse_query("title:\"the Frankenstein\"")?; + let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; - let mut top_collector = TopCollector::with_limit(10); - - searcher.search(&*query, &mut top_collector)?; - - let doc_addresses = top_collector.docs(); - - for doc_address in doc_addresses { + for (score, doc_address) in top_docs { let retrieved_doc = searcher.doc(doc_address)?; + println!("\n==\nDocument score {}:", score); println!("{}", schema.to_json(&retrieved_doc)); } diff --git a/examples/working_with_json.rs b/examples/working_with_json.rs index 3c8e3c1ca..10c89f709 100644 --- a/examples/working_with_json.rs +++ b/examples/working_with_json.rs @@ -9,7 +9,7 @@ fn main() -> tantivy::Result<()> { // Check out the basic example if this is confusing to you. // // first we need to define a schema ... 
- let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); schema_builder.add_text_field("title", TEXT | STORED); schema_builder.add_text_field("body", TEXT); schema_builder.add_u64_field("year", INT_INDEXED); diff --git a/src/collector/chained_collector.rs b/src/collector/chained_collector.rs deleted file mode 100644 index 7654adce0..000000000 --- a/src/collector/chained_collector.rs +++ /dev/null @@ -1,142 +0,0 @@ -use collector::Collector; -use DocId; -use Result; -use Score; -use SegmentLocalId; -use SegmentReader; - -/// Collector that does nothing. -/// This is used in the chain Collector and will hopefully -/// be optimized away by the compiler. -pub struct DoNothingCollector; -impl Collector for DoNothingCollector { - #[inline] - fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> { - Ok(()) - } - #[inline] - fn collect(&mut self, _doc: DocId, _score: Score) {} - #[inline] - fn requires_scoring(&self) -> bool { - false - } -} - -/// Zero-cost abstraction used to collect on multiple collectors. -/// This contraption is only usable if the type of your collectors -/// are known at compile time. 
-/// -/// ```rust -/// #[macro_use] -/// extern crate tantivy; -/// use tantivy::schema::{SchemaBuilder, TEXT}; -/// use tantivy::{Index, Result}; -/// use tantivy::collector::{CountCollector, TopCollector, chain}; -/// use tantivy::query::QueryParser; -/// -/// # fn main() { example().unwrap(); } -/// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); -/// let title = schema_builder.add_text_field("title", TEXT); -/// let schema = schema_builder.build(); -/// let index = Index::create_in_ram(schema); -/// { -/// let mut index_writer = index.writer(3_000_000)?; -/// index_writer.add_document(doc!( -/// title => "The Name of the Wind", -/// )); -/// index_writer.add_document(doc!( -/// title => "The Diary of Muadib", -/// )); -/// index_writer.add_document(doc!( -/// title => "A Dairy Cow", -/// )); -/// index_writer.add_document(doc!( -/// title => "The Diary of a Young Girl", -/// )); -/// index_writer.commit().unwrap(); -/// } -/// -/// index.load_searchers()?; -/// let searcher = index.searcher(); -/// -/// { -/// let mut top_collector = TopCollector::with_limit(2); -/// let mut count_collector = CountCollector::default(); -/// { -/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector); -/// let query_parser = QueryParser::for_index(&index, vec![title]); -/// let query = query_parser.parse_query("diary")?; -/// searcher.search(&*query, &mut collectors).unwrap(); -/// } -/// assert_eq!(count_collector.count(), 2); -/// assert!(top_collector.at_capacity()); -/// } -/// -/// Ok(()) -/// } -/// ``` -pub struct ChainedCollector { - left: Left, - right: Right, -} - -impl ChainedCollector { - /// Adds a collector - pub fn push(self, new_collector: &mut C) -> ChainedCollector { - ChainedCollector { - left: self, - right: new_collector, - } - } -} - -impl Collector for ChainedCollector { - fn set_segment( - &mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader, - ) -> Result<()> { - 
self.left.set_segment(segment_local_id, segment)?; - self.right.set_segment(segment_local_id, segment)?; - Ok(()) - } - - fn collect(&mut self, doc: DocId, score: Score) { - self.left.collect(doc, score); - self.right.collect(doc, score); - } - - fn requires_scoring(&self) -> bool { - self.left.requires_scoring() || self.right.requires_scoring() - } -} - -/// Creates a `ChainedCollector` -pub fn chain() -> ChainedCollector { - ChainedCollector { - left: DoNothingCollector, - right: DoNothingCollector, - } -} - -#[cfg(test)] -mod tests { - - use super::*; - use collector::{Collector, CountCollector, TopCollector}; - - #[test] - fn test_chained_collector() { - let mut top_collector = TopCollector::with_limit(2); - let mut count_collector = CountCollector::default(); - { - let mut collectors = chain().push(&mut top_collector).push(&mut count_collector); - collectors.collect(1, 0.2); - collectors.collect(2, 0.1); - collectors.collect(3, 0.5); - } - assert_eq!(count_collector.count(), 3); - assert!(top_collector.at_capacity()); - } -} diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index d96262e10..0af8b2316 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -4,6 +4,7 @@ use Result; use Score; use SegmentLocalId; use SegmentReader; +use collector::SegmentCollector; /// `CountCollector` collector only counts how many /// documents match the query. 
@@ -11,14 +12,14 @@ use SegmentReader; /// ```rust /// #[macro_use] /// extern crate tantivy; -/// use tantivy::schema::{SchemaBuilder, TEXT}; +/// use tantivy::schema::{Schema, TEXT}; /// use tantivy::{Index, Result}; -/// use tantivy::collector::CountCollector; +/// use tantivy::collector::Count; /// use tantivy::query::QueryParser; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); +/// let mut schema_builder = Schema::builder(); /// let title = schema_builder.add_text_field("title", TEXT); /// let schema = schema_builder.build(); /// let index = Index::create_in_ram(schema); @@ -43,59 +44,92 @@ use SegmentReader; /// let searcher = index.searcher(); /// /// { -/// let mut count_collector = CountCollector::default(); /// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query = query_parser.parse_query("diary")?; -/// searcher.search(&*query, &mut count_collector).unwrap(); +/// let count = searcher.search(&query, &Count).unwrap(); /// -/// assert_eq!(count_collector.count(), 2); +/// assert_eq!(count, 2); /// } /// /// Ok(()) /// } /// ``` -#[derive(Default)] -pub struct CountCollector { - count: usize, -} +pub struct Count; -impl CountCollector { - /// Returns the count of documents that were - /// collected. 
- pub fn count(&self) -> usize { - self.count - } -} -impl Collector for CountCollector { - fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> { - Ok(()) - } +impl Collector for Count { - fn collect(&mut self, _: DocId, _: Score) { - self.count += 1; + type Fruit = usize; + + type Child = SegmentCountCollector; + + fn for_segment(&self, _: SegmentLocalId, _: &SegmentReader) -> Result { + Ok(SegmentCountCollector::default()) } fn requires_scoring(&self) -> bool { false } + + fn merge_fruits(&self, segment_counts: Vec) -> Result { + Ok(segment_counts.into_iter().sum()) + } } + +#[derive(Default)] +pub struct SegmentCountCollector { + count: usize, +} + + +impl SegmentCollector for SegmentCountCollector { + type Fruit = usize; + + fn collect(&mut self, _: DocId, _: Score) { + self.count += 1; + } + + fn harvest(self) -> usize { + self.count + } +} + + #[cfg(test)] mod tests { - - use collector::{Collector, CountCollector}; + use super::{Count, SegmentCountCollector}; + use collector::SegmentCollector; + use collector::Collector; #[test] - fn test_count_collector() { - let mut count_collector = CountCollector::default(); - assert_eq!(count_collector.count(), 0); - count_collector.collect(0u32, 1f32); - assert_eq!(count_collector.count(), 1); - assert_eq!(count_collector.count(), 1); - count_collector.collect(1u32, 1f32); - assert_eq!(count_collector.count(), 2); - assert!(!count_collector.requires_scoring()); + fn test_count_collect_does_not_requires_scoring() { + assert!(!Count.requires_scoring()); + } + + #[test] + fn test_segment_count_collector() { + { + let count_collector = SegmentCountCollector::default(); + assert_eq!(count_collector.harvest(), 0); + } + { + let mut count_collector = SegmentCountCollector::default(); + count_collector.collect(0u32, 1f32); + assert_eq!(count_collector.harvest(), 1); + } + { + let mut count_collector = SegmentCountCollector::default(); + count_collector.collect(0u32, 1f32); + 
assert_eq!(count_collector.harvest(), 1); + } + { + let mut count_collector = SegmentCountCollector::default(); + count_collector.collect(0u32, 1f32); + count_collector.collect(1u32, 1f32); + assert_eq!(count_collector.harvest(), 2); + } + } } diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 92fc1ea7f..9c8bc31b6 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -3,23 +3,20 @@ use docset::SkipResult; use fastfield::FacetReader; use schema::Facet; use schema::Field; -use std::cell::UnsafeCell; use std::collections::btree_map; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::BinaryHeap; use std::collections::Bound; use std::iter::Peekable; -use std::mem; use std::{u64, usize}; -use termdict::TermMerger; - use std::cmp::Ordering; use DocId; use Result; use Score; use SegmentLocalId; use SegmentReader; +use collector::SegmentCollector; struct Hit<'a> { count: u64, @@ -46,12 +43,6 @@ impl<'a> Ord for Hit<'a> { } } -struct SegmentFacetCounter { - pub facet_reader: FacetReader, - pub facet_ords: Vec, - pub facet_counts: Vec, -} - fn facet_depth(facet_bytes: &[u8]) -> usize { if facet_bytes.is_empty() { 0 @@ -91,14 +82,14 @@ fn facet_depth(facet_bytes: &[u8]) -> usize { /// ```rust /// #[macro_use] /// extern crate tantivy; -/// use tantivy::schema::{Facet, SchemaBuilder, TEXT}; +/// use tantivy::schema::{Facet, Schema, TEXT}; /// use tantivy::{Index, Result}; /// use tantivy::collector::FacetCollector; /// use tantivy::query::AllQuery; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); +/// let mut schema_builder = Schema::builder(); /// /// // Facet have their own specific type. 
/// // It is not a bad practise to put all of your @@ -141,13 +132,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize { /// let mut facet_collector = FacetCollector::for_field(facet); /// facet_collector.add_facet("/lang"); /// facet_collector.add_facet("/category"); -/// searcher.search(&AllQuery, &mut facet_collector).unwrap(); -/// -/// // this object contains count aggregate for all of the facets. -/// let counts = facet_collector.harvest(); +/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap(); /// /// // This lists all of the facet counts -/// let facets: Vec<(&Facet, u64)> = counts +/// let facets: Vec<(&Facet, u64)> = facet_counts /// .get("/category") /// .collect(); /// assert_eq!(facets, vec![ @@ -159,13 +147,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize { /// { /// let mut facet_collector = FacetCollector::for_field(facet); /// facet_collector.add_facet("/category/fiction"); -/// searcher.search(&AllQuery, &mut facet_collector).unwrap(); -/// -/// // this object contains count aggregate for all of the facets. -/// let counts = facet_collector.harvest(); +/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap(); /// /// // This lists all of the facet counts -/// let facets: Vec<(&Facet, u64)> = counts +/// let facets: Vec<(&Facet, u64)> = facet_counts /// .get("/category/fiction") /// .collect(); /// assert_eq!(facets, vec![ @@ -178,13 +163,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize { /// { /// let mut facet_collector = FacetCollector::for_field(facet); /// facet_collector.add_facet("/category/fiction"); -/// searcher.search(&AllQuery, &mut facet_collector).unwrap(); -/// -/// // this object contains count aggregate for all of the facets. 
-/// let counts = facet_collector.harvest(); +/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap(); /// /// // This lists all of the facet counts -/// let facets: Vec<(&Facet, u64)> = counts.top_k("/category/fiction", 1); +/// let facets: Vec<(&Facet, u64)> = facet_counts.top_k("/category/fiction", 1); /// assert_eq!(facets, vec![ /// (&Facet::from("/category/fiction/fantasy"), 2) /// ]); @@ -194,21 +176,21 @@ fn facet_depth(facet_bytes: &[u8]) -> usize { /// } /// ``` pub struct FacetCollector { - facet_ords: Vec, field: Field, - ff_reader: Option>, - segment_counters: Vec, - - // facet_ord -> collapse facet_id - current_segment_collapse_mapping: Vec, - // collapse facet_id -> count - current_segment_counts: Vec, - // collapse facet_id -> facet_ord - current_collapse_facet_ords: Vec, - facets: BTreeSet, } +pub struct FacetSegmentCollector { + reader: FacetReader, + facet_ords_buf: Vec, + // facet_ord -> collapse facet_id + collapse_mapping: Vec, + // collapse facet_id -> count + counts: Vec, + // collapse facet_id -> facet_ord + collapse_facet_ords: Vec, +} + fn skip<'a, I: Iterator>( target: &[u8], collapse_it: &mut Peekable, @@ -240,15 +222,8 @@ impl FacetCollector { /// is of the proper type. 
pub fn for_field(field: Field) -> FacetCollector { FacetCollector { - facet_ords: Vec::with_capacity(255), - segment_counters: Vec::new(), field, - ff_reader: None, - facets: BTreeSet::new(), - - current_segment_collapse_mapping: Vec::new(), - current_collapse_facet_ords: Vec::new(), - current_segment_counts: Vec::new(), + facets: BTreeSet::default() } } @@ -278,60 +253,104 @@ impl FacetCollector { } self.facets.insert(facet); } +} + +impl Collector for FacetCollector { + type Fruit = FacetCounts; + + type Child = FacetSegmentCollector; + + fn for_segment(&self, _: SegmentLocalId, reader: &SegmentReader) -> Result { + let facet_reader = reader.facet_reader(self.field)?; + + let mut collapse_mapping = Vec::new(); + let mut counts = Vec::new(); + let mut collapse_facet_ords = Vec::new(); - fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) { - self.current_segment_collapse_mapping.clear(); - self.current_collapse_facet_ords.clear(); - self.current_segment_counts.clear(); let mut collapse_facet_it = self.facets.iter().peekable(); - self.current_collapse_facet_ords.push(0); - let mut facet_streamer = facet_reader.facet_dict().range().into_stream(); - if !facet_streamer.advance() { - return; - } - 'outer: loop { - // at the begining of this loop, facet_streamer - // is positionned on a term that has not been processed yet. - let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it); - match skip_result { - SkipResult::Reached => { - // we reach a facet we decided to collapse. 
- let collapse_depth = facet_depth(facet_streamer.key()); - let mut collapsed_id = 0; - self.current_segment_collapse_mapping.push(0); - while facet_streamer.advance() { - let depth = facet_depth(facet_streamer.key()); - if depth <= collapse_depth { - continue 'outer; + collapse_facet_ords.push(0); + { + let mut facet_streamer = facet_reader.facet_dict().range().into_stream(); + if facet_streamer.advance() { + 'outer: loop { + // at the begining of this loop, facet_streamer + // is positionned on a term that has not been processed yet. + let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it); + match skip_result { + SkipResult::Reached => { + // we reach a facet we decided to collapse. + let collapse_depth = facet_depth(facet_streamer.key()); + let mut collapsed_id = 0; + collapse_mapping.push(0); + while facet_streamer.advance() { + let depth = facet_depth(facet_streamer.key()); + if depth <= collapse_depth { + continue 'outer; + } + if depth == collapse_depth + 1 { + collapsed_id = collapse_facet_ords.len(); + collapse_facet_ords.push(facet_streamer.term_ord()); + collapse_mapping.push(collapsed_id); + } else { + collapse_mapping.push(collapsed_id); + } + } + break; } - if depth == collapse_depth + 1 { - collapsed_id = self.current_collapse_facet_ords.len(); - self.current_collapse_facet_ords - .push(facet_streamer.term_ord()); - self.current_segment_collapse_mapping.push(collapsed_id); - } else { - self.current_segment_collapse_mapping.push(collapsed_id); + SkipResult::End | SkipResult::OverStep => { + collapse_mapping.push(0); + if !facet_streamer.advance() { + break; + } } } - break; - } - SkipResult::End | SkipResult::OverStep => { - self.current_segment_collapse_mapping.push(0); - if !facet_streamer.advance() { - break; - } } } } + + counts.resize(collapse_facet_ords.len(), 0); + + Ok(FacetSegmentCollector { + reader: facet_reader, + facet_ords_buf: Vec::with_capacity(255), + collapse_mapping, + counts, + collapse_facet_ords, + }) } - fn 
finalize_segment(&mut self) { - if self.ff_reader.is_some() { - self.segment_counters.push(SegmentFacetCounter { - facet_reader: self.ff_reader.take().unwrap().into_inner(), - facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()), - facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()), - }); + fn requires_scoring(&self) -> bool { + false + } + + fn merge_fruits(&self, segments_facet_counts: Vec) -> Result { + let mut facet_counts: BTreeMap = BTreeMap::new(); + for segment_facet_counts in segments_facet_counts { + for (facet, count) in segment_facet_counts.facet_counts { + *(facet_counts.entry(facet).or_insert(0)) += count; + } + } + Ok(FacetCounts { facet_counts }) + } +} + +impl SegmentCollector for FacetSegmentCollector { + + type Fruit = FacetCounts; + + + fn collect(&mut self, doc: DocId, _: Score) { + self.reader.facet_ords(doc, &mut self.facet_ords_buf); + let mut previous_collapsed_ord: usize = usize::MAX; + for &facet_ord in &self.facet_ords_buf { + let collapsed_ord = self.collapse_mapping[facet_ord as usize]; + self.counts[collapsed_ord] += + if collapsed_ord == previous_collapsed_ord { + 0 + } else { + 1 + }; + previous_collapsed_ord = collapsed_ord; } } @@ -339,95 +358,22 @@ impl FacetCollector { /// /// This method does not just return the counters, /// it also translates the facet ordinals of the last segment. 
- pub fn harvest(mut self) -> FacetCounts { - self.finalize_segment(); - - let collapsed_facet_ords: Vec<&[u64]> = self - .segment_counters - .iter() - .map(|segment_counter| &segment_counter.facet_ords[..]) - .collect(); - let collapsed_facet_counts: Vec<&[u64]> = self - .segment_counters - .iter() - .map(|segment_counter| &segment_counter.facet_counts[..]) - .collect(); - - let facet_streams = self - .segment_counters - .iter() - .map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream()) - .collect::>(); - - let mut facet_merger = TermMerger::new(facet_streams); + fn harvest(self) -> FacetCounts { let mut facet_counts = BTreeMap::new(); - - while facet_merger.advance() { - let count = facet_merger - .current_kvs() - .iter() - .map(|it| { - let seg_ord = it.segment_ord; - let term_ord = it.streamer.term_ord(); - collapsed_facet_ords[seg_ord] - .binary_search(&term_ord) - .map(|collapsed_term_id| { - if collapsed_term_id == 0 { - 0 - } else { - collapsed_facet_counts[seg_ord][collapsed_term_id] - } - }).unwrap_or(0) - }).sum(); - if count > 0u64 { - let bytes: Vec = facet_merger.key().to_owned(); - // may create an corrupted facet if the term dicitonary is corrupted - let facet = unsafe { Facet::from_encoded(bytes) }; - facet_counts.insert(facet, count); + let facet_dict = self.reader.facet_dict(); + for (collapsed_facet_ord, count) in self.counts.iter().cloned().enumerate() { + if count == 0 { + continue; } + let mut facet = vec![]; + let facet_ord = self.collapse_facet_ords[collapsed_facet_ord]; + facet_dict.ord_to_term(facet_ord as u64, &mut facet); + facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count); } FacetCounts { facet_counts } } } -impl Collector for FacetCollector { - fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { - self.finalize_segment(); - let facet_reader = reader.facet_reader(self.field)?; - self.set_collapse_mapping(&facet_reader); - self.current_segment_counts - 
.resize(self.current_collapse_facet_ords.len(), 0); - self.ff_reader = Some(UnsafeCell::new(facet_reader)); - Ok(()) - } - - fn collect(&mut self, doc: DocId, _: Score) { - let facet_reader: &mut FacetReader = unsafe { - &mut *self - .ff_reader - .as_ref() - .expect("collect() was called before set_segment. This should never happen.") - .get() - }; - facet_reader.facet_ords(doc, &mut self.facet_ords); - let mut previous_collapsed_ord: usize = usize::MAX; - for &facet_ord in &self.facet_ords { - let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize]; - self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord - { - 0 - } else { - 1 - }; - previous_collapsed_ord = collapsed_ord; - } - } - - fn requires_scoring(&self) -> bool { - false - } -} - /// Intermediary result of the `FacetCollector` that stores /// the facet counts for all the segments. pub struct FacetCounts { @@ -506,14 +452,13 @@ mod tests { use query::AllQuery; use rand::distributions::Uniform; use rand::{thread_rng, Rng}; - use schema::Field; + use schema::{Field, Document, Facet, Schema}; use rand::prelude::SliceRandom; - use schema::{Document, Facet, SchemaBuilder}; use std::iter; #[test] fn test_facet_collector_drilldown() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let facet_field = schema_builder.add_facet_field("facet"); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -537,12 +482,10 @@ mod tests { index_writer.commit().unwrap(); index.load_searchers().unwrap(); let searcher = index.searcher(); - - let mut facet_collector = FacetCollector::for_field(facet_field); + let mut facet_collector= FacetCollector::for_field(facet_field); facet_collector.add_facet(Facet::from("/top1")); - searcher.search(&AllQuery, &mut facet_collector).unwrap(); + let counts = searcher.search(&AllQuery, &facet_collector).unwrap(); - let counts: FacetCounts = facet_collector.harvest(); 
{ let facets: Vec<(String, u64)> = counts .get("/top1") @@ -576,7 +519,7 @@ mod tests { #[test] fn test_doc_unsorted_multifacet() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let facet_field = schema_builder.add_facet_field("facets"); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -593,8 +536,7 @@ mod tests { assert_eq!(searcher.num_docs(), 1); let mut facet_collector = FacetCollector::for_field(facet_field); facet_collector.add_facet("/subjects"); - searcher.search(&AllQuery, &mut facet_collector).unwrap(); - let counts = facet_collector.harvest(); + let counts = searcher.search(&AllQuery, &facet_collector).unwrap(); let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect(); assert_eq!(facets[0].1, 1); } @@ -608,7 +550,7 @@ mod tests { #[test] fn test_facet_collector_topk() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let facet_field = schema_builder.add_facet_field("facet"); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -640,9 +582,8 @@ mod tests { let mut facet_collector = FacetCollector::for_field(facet_field); facet_collector.add_facet("/facet"); - searcher.search(&AllQuery, &mut facet_collector).unwrap(); + let counts: FacetCounts = searcher.search(&AllQuery, &facet_collector).unwrap(); - let counts: FacetCounts = facet_collector.harvest(); { let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 3); assert_eq!( @@ -665,13 +606,13 @@ mod bench { use query::AllQuery; use rand::{thread_rng, Rng}; use schema::Facet; - use schema::SchemaBuilder; + use schema::Schema; use test::Bencher; use Index; #[bench] fn bench_facet_collector(b: &mut Bencher) { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let facet_field = schema_builder.add_facet_field("facet"); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ 
-695,8 +636,8 @@ mod bench { b.iter(|| { let searcher = index.searcher(); - let mut facet_collector = FacetCollector::for_field(facet_field); - searcher.search(&AllQuery, &mut facet_collector).unwrap(); + let facet_collector = FacetCollector::for_field(facet_field); + searcher.search(&AllQuery, &facet_collector).unwrap(); }); } } diff --git a/src/collector/int_facet_collector.rs b/src/collector/int_facet_collector.rs index 72cfd711d..ac53b9908 100644 --- a/src/collector/int_facet_collector.rs +++ b/src/collector/int_facet_collector.rs @@ -79,7 +79,7 @@ mod tests { // make sure we have facet counters correctly filled fn test_facet_collector_results() { - let mut schema_builder = schema::SchemaBuilder::new(); + let mut schema_builder = schema::Schema::builder(); let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST); let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST); let text_field = schema_builder.add_text_field("text", STRING); diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 99af0f286..8db4da52f 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -1,5 +1,88 @@ /*! -Defines how the documents matching a search query should be processed. + +# Collectors + +Collectors define the information you want to extract from the documents matching the queries. +In tantivy jargon, we call this information your search "fruit". + +Your fruit could for instance be : +- [the count of matching documents](./struct.Count.html) +- [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html) +- [facet counts](./struct.FacetCollector.html) + +At one point in your code, you will trigger the actual search operation by calling +[the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search). +This call will look like this. 
+ +```verbatim +let fruit = searcher.search(&query, &collector)?; +``` + +Here the type of fruit is actually determined as an associated type of the collector (`Collector::Fruit`). + + +# Combining several collectors + +A rich search experience often requires to run several collectors on your search query. +For instance, +- selecting the top-K products matching your query +- counting the matching documents +- computing several facets +- computing statistics about the matching product prices + +A simple and efficient way to do that is to pass your collectors as one tuple. +The resulting `Fruit` will then be a typed tuple with each collector's original fruits +in their respective position. + +```rust +# extern crate tantivy; +# use tantivy::schema::*; +# use tantivy::*; +# use tantivy::query::*; +use tantivy::collector::{Count, TopDocs}; +# +# fn main() -> tantivy::Result<()> { +# let mut schema_builder = Schema::builder(); +# let title = schema_builder.add_text_field("title", TEXT); +# let schema = schema_builder.build(); +# let index = Index::create_in_ram(schema); +# let mut index_writer = index.writer(3_000_000)?; +# index_writer.add_document(doc!( +# title => "The Name of the Wind", +# )); +# index_writer.add_document(doc!( +# title => "The Diary of Muadib", +# )); +# index_writer.commit().unwrap(); +# index.load_searchers()?; +# let searcher = index.searcher(); +# let query_parser = QueryParser::for_index(&index, vec![title]); +# let query = query_parser.parse_query("diary")?; +let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) = + searcher.search(&query, &(Count, TopDocs::with_limit(2)))?; +# Ok(()) +# } +``` + +The `Collector` trait is implemented for up to 4 collectors. +If you have more than 4 collectors, you can either group them into +tuples of tuples `(a,(b,(c,d)))`, or rely on `MultiCollector`'s. 
+ +# Combining several collectors dynamically + +Combining collectors into a tuple is a zero-cost abstraction: everything +happens as if you had manually implemented a single collector +combining all of our features. + +Unfortunately it requires you to know at compile time your collector types. +If on the other hand, the collectors depend on some query parameter, +you can rely on `MultiCollector`'s. + + +# Implementing your own collectors. + +See the `custom_collector` example. + */ use DocId; @@ -7,9 +90,10 @@ use Result; use Score; use SegmentLocalId; use SegmentReader; +use downcast; mod count_collector; -pub use self::count_collector::CountCollector; +pub use self::count_collector::Count; mod multi_collector; pub use self::multi_collector::MultiCollector; @@ -17,237 +101,258 @@ pub use self::multi_collector::MultiCollector; mod top_collector; mod top_score_collector; -pub use self::top_score_collector::TopScoreCollector; -#[deprecated] -pub use self::top_score_collector::TopScoreCollector as TopCollector; +pub use self::top_score_collector::TopDocs; mod top_field_collector; -pub use self::top_field_collector::TopFieldCollector; +pub use self::top_field_collector::TopDocsByField; mod facet_collector; pub use self::facet_collector::FacetCollector; -mod chained_collector; -pub use self::chained_collector::{chain, ChainedCollector}; +/// `Fruit` is the type for the result of our collection. +/// e.g. `usize` for the `Count` collector. +pub trait Fruit: Send + downcast::Any {} + +impl Fruit for T where T: Send + downcast::Any {} /// Collectors are in charge of collecting and retaining relevant /// information from the document found and scored by the query. /// -/// /// For instance, /// /// - keeping track of the top 10 best documents /// - computing a breakdown over a fast field /// - computing the number of documents matching the query /// -/// Queries are in charge of pushing the `DocSet` to the collector. 
+/// Our search index is in fact a collection of segments, so +/// a `Collector` trait is actually more of a factory to instance +/// `SegmentCollector`s for each segments. /// -/// As they work on multiple segments, they first inform -/// the collector of a change in a segment and then -/// call the `collect` method to push the document to the collector. -/// -/// Temporally, our collector will receive calls -/// - `.set_segment(0, segment_reader_0)` -/// - `.collect(doc0_of_segment_0)` -/// - `.collect(...)` -/// - `.collect(last_doc_of_segment_0)` -/// - `.set_segment(1, segment_reader_1)` -/// - `.collect(doc0_of_segment_1)` -/// - `.collect(...)` -/// - `.collect(last_doc_of_segment_1)` -/// - `...` -/// - `.collect(last_doc_of_last_segment)` +/// The collection logic itself is in the `SegmentCollector`. /// /// Segments are not guaranteed to be visited in any specific order. -pub trait Collector { +pub trait Collector: Sync { + + /// `Fruit` is the type for the result of our collection. + /// e.g. `usize` for the `Count` collector. + type Fruit: Fruit; + + /// Type of the `SegmentCollector` associated to this collector. + type Child: SegmentCollector; + /// `set_segment` is called before beginning to enumerate /// on this segment. - fn set_segment( - &mut self, + fn for_segment( + &self, segment_local_id: SegmentLocalId, segment: &SegmentReader, - ) -> Result<()>; - /// The query pushes the scored document to the collector via this method. - fn collect(&mut self, doc: DocId, score: Score); + ) -> Result; /// Returns true iff the collector requires to compute scores for documents. fn requires_scoring(&self) -> bool; + + /// Combines the fruit associated to the collection of each segments + /// into one fruit. 
+ fn merge_fruits(&self, segment_fruits: Vec) -> Result; } -impl<'a, C: Collector> Collector for &'a mut C { - fn set_segment( - &mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader, - ) -> Result<()> { - (*self).set_segment(segment_local_id, segment) - } + +/// The `SegmentCollector` is the trait in charge of defining the +/// collect operation at the scale of the segment. +/// +/// `.collect(doc, score)` will be called for every documents +/// matching the query. +pub trait SegmentCollector: 'static { + /// `Fruit` is the type for the result of our collection. + /// e.g. `usize` for the `Count` collector. + type Fruit: Fruit; + /// The query pushes the scored document to the collector via this method. - fn collect(&mut self, doc: DocId, score: Score) { - C::collect(self, doc, score) + fn collect(&mut self, doc: DocId, score: Score); + + /// Extract the fruit of the collection from the `SegmentCollector`. + fn harvest(self) -> Self::Fruit; +} + +// ----------------------------------------------- +// Tuple implementations. 
+ + +impl Collector for (Left, Right) +where + Left: Collector, + Right: Collector +{ + type Fruit = (Left::Fruit, Right::Fruit); + type Child = (Left::Child, Right::Child); + + fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result { + let left = self.0.for_segment(segment_local_id, segment)?; + let right = self.1.for_segment(segment_local_id, segment)?; + Ok((left, right)) } fn requires_scoring(&self) -> bool { - C::requires_scoring(self) + self.0.requires_scoring() || self.1.requires_scoring() + } + + fn merge_fruits(&self, children: Vec<(Left::Fruit, Right::Fruit)>) -> Result<(Left::Fruit, Right::Fruit)> { + let mut left_fruits = vec![]; + let mut right_fruits = vec![]; + for (left_fruit, right_fruit) in children { + left_fruits.push(left_fruit); + right_fruits.push(right_fruit); + } + Ok((self.0.merge_fruits(left_fruits)?, + self.1.merge_fruits(right_fruits)?)) } } +impl SegmentCollector for (Left, Right) + where + Left: SegmentCollector, + Right: SegmentCollector +{ + type Fruit = (Left::Fruit, Right::Fruit); + + fn collect(&mut self, doc: DocId, score: Score) { + self.0.collect(doc, score); + self.1.collect(doc, score); + } + + fn harvest(self) -> ::Fruit { + (self.0.harvest(), self.1.harvest()) + } +} + +// 3-Tuple + +impl Collector for (One, Two, Three) + where One: Collector, + Two: Collector, + Three: Collector +{ + type Fruit = (One::Fruit, Two::Fruit, Three::Fruit); + type Child = (One::Child, Two::Child, Three::Child); + + fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result { + let one = self.0.for_segment(segment_local_id, segment)?; + let two = self.1.for_segment(segment_local_id, segment)?; + let three = self.2.for_segment(segment_local_id, segment)?; + Ok((one, two, three)) + } + + fn requires_scoring(&self) -> bool { + self.0.requires_scoring() || + self.1.requires_scoring() || + self.2.requires_scoring() + } + + fn merge_fruits(&self, children: Vec) -> Result { + let mut one_fruits = vec![]; + 
let mut two_fruits = vec![]; + let mut three_fruits = vec![]; + for (one_fruit, two_fruit, three_fruit) in children { + one_fruits.push(one_fruit); + two_fruits.push(two_fruit); + three_fruits.push(three_fruit); + } + Ok((self.0.merge_fruits(one_fruits)?, + self.1.merge_fruits(two_fruits)?, + self.2.merge_fruits(three_fruits)?)) + } +} + +impl SegmentCollector for (One, Two, Three) + where + One: SegmentCollector, + Two: SegmentCollector, + Three: SegmentCollector +{ + type Fruit = (One::Fruit, Two::Fruit, Three::Fruit); + + fn collect(&mut self, doc: DocId, score: Score) { + self.0.collect(doc, score); + self.1.collect(doc, score); + self.2.collect(doc, score); + } + + fn harvest(self) -> ::Fruit { + (self.0.harvest(), self.1.harvest(), self.2.harvest()) + } +} + + +// 4-Tuple + +impl Collector for (One, Two, Three, Four) + where One: Collector, + Two: Collector, + Three: Collector, + Four: Collector +{ + type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit); + type Child = (One::Child, Two::Child, Three::Child, Four::Child); + + fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result { + let one = self.0.for_segment(segment_local_id, segment)?; + let two = self.1.for_segment(segment_local_id, segment)?; + let three = self.2.for_segment(segment_local_id, segment)?; + let four = self.3.for_segment(segment_local_id, segment)?; + Ok((one, two, three, four)) + } + + fn requires_scoring(&self) -> bool { + self.0.requires_scoring() || + self.1.requires_scoring() || + self.2.requires_scoring() || + self.3.requires_scoring() + } + + fn merge_fruits(&self, children: Vec) -> Result { + let mut one_fruits = vec![]; + let mut two_fruits = vec![]; + let mut three_fruits = vec![]; + let mut four_fruits = vec![]; + for (one_fruit, two_fruit, three_fruit, four_fruit) in children { + one_fruits.push(one_fruit); + two_fruits.push(two_fruit); + three_fruits.push(three_fruit); + four_fruits.push(four_fruit); + } + 
Ok((self.0.merge_fruits(one_fruits)?, + self.1.merge_fruits(two_fruits)?, + self.2.merge_fruits(three_fruits)?, + self.3.merge_fruits(four_fruits)?)) + } +} + +impl SegmentCollector for (One, Two, Three, Four) + where + One: SegmentCollector, + Two: SegmentCollector, + Three: SegmentCollector, + Four: SegmentCollector +{ + type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit); + + fn collect(&mut self, doc: DocId, score: Score) { + self.0.collect(doc, score); + self.1.collect(doc, score); + self.2.collect(doc, score); + self.3.collect(doc, score); + } + + fn harvest(self) -> ::Fruit { + (self.0.harvest(), self.1.harvest(), self.2.harvest(), self.3.harvest()) + } +} + +#[allow(missing_docs)] +mod downcast_impl { + downcast!(super::Fruit); +} + + #[cfg(test)] -pub mod tests { - - use super::*; - use core::SegmentReader; - use fastfield::BytesFastFieldReader; - use fastfield::FastFieldReader; - use schema::Field; - use DocId; - use Score; - use SegmentLocalId; - - /// Stores all of the doc ids. - /// This collector is only used for tests. - /// It is unusable in practise, as it does not store - /// the segment ordinals - pub struct TestCollector { - offset: DocId, - segment_max_doc: DocId, - docs: Vec, - scores: Vec, - } - - impl TestCollector { - /// Return the exhalist of documents. 
- pub fn docs(self) -> Vec { - self.docs - } - - pub fn scores(self) -> Vec { - self.scores - } - } - - impl Default for TestCollector { - fn default() -> TestCollector { - TestCollector { - offset: 0, - segment_max_doc: 0, - docs: Vec::new(), - scores: Vec::new(), - } - } - } - - impl Collector for TestCollector { - fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { - self.offset += self.segment_max_doc; - self.segment_max_doc = reader.max_doc(); - Ok(()) - } - - fn collect(&mut self, doc: DocId, score: Score) { - self.docs.push(doc + self.offset); - self.scores.push(score); - } - - fn requires_scoring(&self) -> bool { - true - } - } - - /// Collects in order all of the fast fields for all of the - /// doc in the `DocSet` - /// - /// This collector is mainly useful for tests. - pub struct FastFieldTestCollector { - vals: Vec, - field: Field, - ff_reader: Option>, - } - - impl FastFieldTestCollector { - pub fn for_field(field: Field) -> FastFieldTestCollector { - FastFieldTestCollector { - vals: Vec::new(), - field, - ff_reader: None, - } - } - - pub fn vals(self) -> Vec { - self.vals - } - } - - impl Collector for FastFieldTestCollector { - fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { - self.ff_reader = Some(reader.fast_field_reader(self.field)?); - Ok(()) - } - - fn collect(&mut self, doc: DocId, _score: Score) { - let val = self.ff_reader.as_ref().unwrap().get(doc); - self.vals.push(val); - } - fn requires_scoring(&self) -> bool { - false - } - } - - /// Collects in order all of the fast field bytes for all of the - /// docs in the `DocSet` - /// - /// This collector is mainly useful for tests. 
- pub struct BytesFastFieldTestCollector { - vals: Vec, - field: Field, - ff_reader: Option, - } - - impl BytesFastFieldTestCollector { - pub fn for_field(field: Field) -> BytesFastFieldTestCollector { - BytesFastFieldTestCollector { - vals: Vec::new(), - field, - ff_reader: None, - } - } - - pub fn vals(self) -> Vec { - self.vals - } - } - - impl Collector for BytesFastFieldTestCollector { - fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> { - self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?); - Ok(()) - } - - fn collect(&mut self, doc: u32, _score: f32) { - let val = self.ff_reader.as_ref().unwrap().get_val(doc); - self.vals.extend(val); - } - - fn requires_scoring(&self) -> bool { - false - } - } -} - -#[cfg(all(test, feature = "unstable"))] -mod bench { - use collector::{Collector, CountCollector}; - use test::Bencher; - - #[bench] - fn build_collector(b: &mut Bencher) { - b.iter(|| { - let mut count_collector = CountCollector::default(); - let docs: Vec = (0..1_000_000).collect(); - for doc in docs { - count_collector.collect(doc, 1f32); - } - count_collector.count() - }); - } -} +pub mod tests; diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 14ff80788..24f951ef4 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -1,9 +1,101 @@ use super::Collector; +use super::SegmentCollector; use DocId; -use Result; use Score; +use Result; use SegmentLocalId; use SegmentReader; +use downcast::Downcast; +use collector::Fruit; +use std::marker::PhantomData; +use TantivyError; + + +pub struct MultiFruit { + sub_fruits: Vec>> +} + + +pub struct CollectorWrapper(TCollector); + +impl Collector for CollectorWrapper { + type Fruit = Box; + type Child = Box; + + fn for_segment(&self, segment_local_id: u32, reader: &SegmentReader) -> Result> { + let child = self.0.for_segment(segment_local_id, reader)?; + Ok(Box::new(SegmentCollectorWrapper(child))) + 
} + + fn requires_scoring(&self) -> bool { + self.0.requires_scoring() + } + + fn merge_fruits(&self, children: Vec<::Fruit>) -> Result> { + let typed_fruit: Vec = children.into_iter() + .map(|untyped_fruit| { + Downcast::::downcast(untyped_fruit) + .map(|boxed_but_typed| *boxed_but_typed) + .map_err(|e| { + let err_msg = format!("Failed to cast child collector fruit. {:?}", e); + TantivyError::InvalidArgument(err_msg) + }) + }) + .collect::>()?; + let merged_fruit = self.0.merge_fruits(typed_fruit)?; + Ok(Box::new(merged_fruit)) + } +} + + +impl SegmentCollector for Box { + + type Fruit = Box; + + fn collect(&mut self, doc: u32, score: f32) { + self.as_mut().collect(doc, score); + } + + fn harvest(self) -> Box { + BoxableSegmentCollector::harvest_from_box(self) + } +} + +pub trait BoxableSegmentCollector { + fn collect(&mut self, doc: u32, score: f32); + fn harvest_from_box(self: Box) -> Box; +} + + + +pub struct SegmentCollectorWrapper(TSegmentCollector); + + +impl BoxableSegmentCollector for SegmentCollectorWrapper { + + fn collect(&mut self, doc: u32, score: f32) { + self.0.collect(doc, score); + } + + fn harvest_from_box(self: Box) -> Box { + Box::new(self.0.harvest()) + } +} + +pub struct FruitHandle { + pos: usize, + _phantom: PhantomData +} + +impl FruitHandle { + pub fn extract(self, fruits: &mut MultiFruit) -> TFruit { + let boxed_fruit = fruits.sub_fruits[self.pos] + .take() + .expect(""); + *Downcast::::downcast(boxed_fruit) + .expect("Failed") + } +} /// Multicollector makes it possible to collect on more than one collector. 
/// It should only be used for use cases where the Collector types is unknown @@ -13,14 +105,14 @@ use SegmentReader; /// ```rust /// #[macro_use] /// extern crate tantivy; -/// use tantivy::schema::{SchemaBuilder, TEXT}; +/// use tantivy::schema::{Schema, TEXT}; /// use tantivy::{Index, Result}; -/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector}; +/// use tantivy::collector::{Count, TopDocs, MultiCollector}; /// use tantivy::query::QueryParser; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); +/// let mut schema_builder = Schema::builder(); /// let title = schema_builder.add_text_field("title", TEXT); /// let schema = schema_builder.build(); /// let index = Index::create_in_ram(schema); @@ -44,76 +136,159 @@ use SegmentReader; /// index.load_searchers()?; /// let searcher = index.searcher(); /// -/// { -/// let mut top_collector = TopCollector::with_limit(2); -/// let mut count_collector = CountCollector::default(); -/// { -/// let mut collectors = -/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]); -/// let query_parser = QueryParser::for_index(&index, vec![title]); -/// let query = query_parser.parse_query("diary")?; -/// searcher.search(&*query, &mut collectors).unwrap(); -/// } -/// assert_eq!(count_collector.count(), 2); -/// assert!(top_collector.at_capacity()); -/// } +/// let mut collectors = MultiCollector::new(); +/// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2)); +/// let count_handle = collectors.add_collector(Count); +/// let query_parser = QueryParser::for_index(&index, vec![title]); +/// let query = query_parser.parse_query("diary")?; +/// let mut multi_fruit = searcher.search(&query, &collectors)?; +/// +/// let count = count_handle.extract(&mut multi_fruit); +/// let top_docs = top_docs_handle.extract(&mut multi_fruit); +/// +/// # assert_eq!(count, 2); +/// # assert_eq!(top_docs.len(), 2); /// 
/// Ok(()) /// } /// ``` pub struct MultiCollector<'a> { - collectors: Vec<&'a mut Collector>, + collector_wrappers: Vec,Fruit=Box> + 'a>> } impl<'a> MultiCollector<'a> { - /// Constructor - pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector { - MultiCollector { collectors } + + /// Create a new `MultiCollector` + pub fn new() -> MultiCollector<'a> { + MultiCollector { + collector_wrappers: Vec::new() + } + } + + /// Add a new collector to our `MultiCollector`. + pub fn add_collector<'b: 'a, TCollector: Collector + 'b>(&mut self, collector: TCollector) -> FruitHandle { + let pos = self.collector_wrappers.len(); + self.collector_wrappers.push(Box::new(CollectorWrapper(collector))); + FruitHandle { + pos, + _phantom: PhantomData + } } } impl<'a> Collector for MultiCollector<'a> { - fn set_segment( - &mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader, - ) -> Result<()> { - for collector in &mut self.collectors { - collector.set_segment(segment_local_id, segment)?; - } - Ok(()) + type Fruit = MultiFruit; + type Child = MultiCollectorChild; + + fn for_segment(&self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result { + let children = self.collector_wrappers + .iter() + .map(|collector_wrapper| { + collector_wrapper.for_segment(segment_local_id, segment) + }) + .collect::>>()?; + Ok(MultiCollectorChild { + children + }) } + fn requires_scoring(&self) -> bool { + self.collector_wrappers + .iter() + .any(|c| c.requires_scoring()) + } + + fn merge_fruits(&self, segments_multifruits: Vec) + -> Result { + let mut segment_fruits_list: Vec>> = + (0..self.collector_wrappers.len()) + .map(|_| Vec::with_capacity(segments_multifruits.len())) + .collect::>(); + for segment_multifruit in segments_multifruits { + for (idx, segment_fruit_opt) in segment_multifruit.sub_fruits.into_iter().enumerate() { + if let Some(segment_fruit) = segment_fruit_opt { + segment_fruits_list[idx].push(segment_fruit); + } + } + } + let sub_fruits = 
self.collector_wrappers + .iter() + .zip(segment_fruits_list) + .map(|(child_collector, segment_fruits)| + Ok(Some(child_collector.merge_fruits(segment_fruits)?)) + ) + .collect::>()?; + Ok(MultiFruit { sub_fruits }) + } + +} + + +pub struct MultiCollectorChild { + children: Vec> +} + +impl SegmentCollector for MultiCollectorChild { + type Fruit = MultiFruit; + fn collect(&mut self, doc: DocId, score: Score) { - for collector in &mut self.collectors { - collector.collect(doc, score); + for child in &mut self.children { + child.collect(doc, score); } } - fn requires_scoring(&self) -> bool { - self.collectors - .iter() - .any(|collector| collector.requires_scoring()) + + fn harvest(self) -> MultiFruit { + MultiFruit { + sub_fruits: self.children + .into_iter() + .map(|child| Some(child.harvest()) ) + .collect() + } } } + #[cfg(test)] mod tests { use super::*; - use collector::{Collector, CountCollector, TopScoreCollector}; + use collector::{Count, TopDocs}; + use schema::{TEXT, Schema}; + use query::TermQuery; + use Index; + use Term; + use schema::IndexRecordOption; #[test] fn test_multi_collector() { - let mut top_collector = TopScoreCollector::with_limit(2); - let mut count_collector = CountCollector::default(); + let mut schema_builder = Schema::builder(); + let text = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); { - let mut collectors = - MultiCollector::from(vec![&mut top_collector, &mut count_collector]); - collectors.collect(1, 0.2); - collectors.collect(2, 0.1); - collectors.collect(3, 0.5); + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(text=>"abc")); + index_writer.add_document(doc!(text=>"abc abc abc")); + index_writer.add_document(doc!(text=>"abc abc")); + index_writer.commit().unwrap(); + index_writer.add_document(doc!(text=>"")); + index_writer.add_document(doc!(text=>"abc abc abc abc")); + 
index_writer.add_document(doc!(text=>"abc")); + index_writer.commit().unwrap(); } - assert_eq!(count_collector.count(), 3); - assert!(top_collector.at_capacity()); + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let term = Term::from_field_text(text, "abc"); + let query = TermQuery::new(term, IndexRecordOption::Basic); + + let mut collectors = MultiCollector::new(); + let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2)); + let count_handler = collectors.add_collector(Count); + let mut multifruits = searcher.search(&query, &mut collectors).unwrap(); + + assert_eq!(count_handler.extract(&mut multifruits), 5); + assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2); } } + diff --git a/src/collector/tests.rs b/src/collector/tests.rs new file mode 100644 index 000000000..d67ff14b2 --- /dev/null +++ b/src/collector/tests.rs @@ -0,0 +1,202 @@ +use super::*; +use core::SegmentReader; +use fastfield::BytesFastFieldReader; +use fastfield::FastFieldReader; +use schema::Field; +use DocId; +use Score; +use SegmentLocalId; +use DocAddress; + +/// Stores all of the doc ids. +/// This collector is only used for tests. +/// It is unusable in pr +/// +/// actise, as it does not store +/// the segment ordinals +pub struct TestCollector; + +pub struct TestSegmentCollector { + segment_id: SegmentLocalId, + fruit: TestFruit, +} + +#[derive(Default)] +pub struct TestFruit { + docs: Vec, + scores: Vec +} + +impl TestFruit { + /// Return the list of matching documents exhaustively. + pub fn docs(&self) ->&[DocAddress] { + &self.docs[..] + } + + pub fn scores(&self) -> &[Score] { + &self.scores[..] 
+ } +} + +impl Collector for TestCollector { + type Fruit = TestFruit; + type Child = TestSegmentCollector; + + fn for_segment(&self, segment_id: SegmentLocalId, _reader: &SegmentReader) -> Result { + Ok(TestSegmentCollector { + segment_id, + fruit: TestFruit::default() + }) + } + + fn requires_scoring(&self) -> bool { + true + } + + fn merge_fruits(&self, mut children: Vec) -> Result { + children + .sort_by_key(|fruit| + if fruit.docs().is_empty() { + 0 + } else { + fruit.docs()[0].segment_ord() + }); + let mut docs = vec![]; + let mut scores = vec![]; + for child in children { + docs.extend(child.docs()); + scores.extend(child.scores); + } + Ok(TestFruit { docs, scores }) + } +} + +impl SegmentCollector for TestSegmentCollector { + + type Fruit = TestFruit; + + fn collect(&mut self, doc: DocId, score: Score) { + self.fruit.docs.push(DocAddress(self.segment_id, doc )); + self.fruit.scores.push(score); + } + + fn harvest(self) -> ::Fruit { + self.fruit + } +} + + +/// Collects in order all of the fast fields for all of the +/// doc in the `DocSet` +/// +/// This collector is mainly useful for tests. 
+pub struct FastFieldTestCollector { + field: Field, +} + +pub struct FastFieldSegmentCollector { + vals: Vec, + reader: FastFieldReader, +} + +impl FastFieldTestCollector { + pub fn for_field(field: Field) -> FastFieldTestCollector { + FastFieldTestCollector { + field, + } + } +} + +impl Collector for FastFieldTestCollector { + + type Fruit = Vec; + type Child = FastFieldSegmentCollector; + + fn for_segment(&self, _: SegmentLocalId, reader: &SegmentReader) -> Result { + Ok(FastFieldSegmentCollector { + vals: Vec::new(), + reader: reader.fast_field_reader(self.field)?, + }) + } + + fn requires_scoring(&self) -> bool { + false + } + + fn merge_fruits(&self, children: Vec>) -> Result> { + Ok(children + .into_iter() + .flat_map(|v| v.into_iter()) + .collect()) + } +} + +impl SegmentCollector for FastFieldSegmentCollector { + type Fruit = Vec; + + fn collect(&mut self, doc: DocId, _score: Score) { + let val = self.reader.get(doc); + self.vals.push(val); + } + + fn harvest(self) -> Vec { + self.vals + } +} + +/// Collects in order all of the fast field bytes for all of the +/// docs in the `DocSet` +/// +/// This collector is mainly useful for tests. 
+pub struct BytesFastFieldTestCollector { + field: Field, +} + +pub struct BytesFastFieldSegmentCollector { + vals: Vec, + reader: BytesFastFieldReader, +} + +impl BytesFastFieldTestCollector { + pub fn for_field(field: Field) -> BytesFastFieldTestCollector { + BytesFastFieldTestCollector { field } + } +} + +impl Collector for BytesFastFieldTestCollector { + + type Fruit = Vec; + type Child = BytesFastFieldSegmentCollector; + + fn for_segment(&self, _segment_local_id: u32, segment: &SegmentReader) -> Result { + Ok(BytesFastFieldSegmentCollector { + vals: Vec::new(), + reader: segment.bytes_fast_field_reader(self.field)?, + }) + } + + fn requires_scoring(&self) -> bool { + false + } + + fn merge_fruits(&self, children: Vec>) -> Result> { + Ok(children + .into_iter() + .flat_map(|c| c.into_iter()) + .collect()) + } +} + +impl SegmentCollector for BytesFastFieldSegmentCollector { + + type Fruit = Vec; + + fn collect(&mut self, doc: u32, _score: f32) { + let data = self.reader.get_val(doc); + self.vals.extend(data); + } + + fn harvest(self) -> ::Fruit { + self.vals + } +} diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 265a6981a..ff81c1fca 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -3,54 +3,57 @@ use std::collections::BinaryHeap; use DocAddress; use DocId; use SegmentLocalId; +use SegmentReader; +use Result; +use serde::export::PhantomData; + /// Contains a feature (field, score, etc.) of a document along with the document address. /// /// It has a custom implementation of `PartialOrd` that reverses the order. This is because the /// default Rust heap is a max heap, whereas a min heap is needed. -#[derive(Clone, Copy)] -pub struct ComparableDoc { +/// +/// WARNING: equality is not what you would expect here. +/// Two elements are equal if their feature is equal, and regardless of whether `doc` +/// is equal. 
This should be perfectly fine for this usage, but let's make sure this +/// struct is never public. +struct ComparableDoc { feature: T, - doc_address: DocAddress, + doc: D, } -impl PartialOrd for ComparableDoc { +impl PartialOrd for ComparableDoc { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for ComparableDoc { +impl Ord for ComparableDoc { #[inline] fn cmp(&self, other: &Self) -> Ordering { other .feature .partial_cmp(&self.feature) - .unwrap_or_else(|| other.doc_address.cmp(&self.doc_address)) + .unwrap_or_else(|| Ordering::Equal) } } -impl PartialEq for ComparableDoc { +impl PartialEq for ComparableDoc { fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal } } -impl Eq for ComparableDoc {} +impl Eq for ComparableDoc {} -/// The Top Collector keeps track of the K documents -/// sorted by type `T`. -/// -/// The implementation is based on a `BinaryHeap`. -/// The theorical complexity for collecting the top `K` out of `n` documents -/// is `O(n log K)`. -pub struct TopCollector { + +pub(crate) struct TopCollector { limit: usize, - heap: BinaryHeap>, - segment_id: u32, + _marker: PhantomData } -impl TopCollector { +impl TopCollector where T: PartialOrd + Clone { + /// Creates a top collector, with a number of documents equal to "limit". /// /// # Panics @@ -61,127 +64,154 @@ impl TopCollector { } TopCollector { limit, - heap: BinaryHeap::with_capacity(limit), - segment_id: 0, + _marker: PhantomData, } } - /// Returns K best documents sorted in decreasing order. - /// - /// Calling this method triggers the sort. - /// The result of the sort is not cached. 
- pub fn docs(&self) -> Vec { - self.top_docs() + pub fn limit(&self) -> usize { + self.limit + } + + pub fn merge_fruits(&self, children: Vec>) -> Result> { + if self.limit == 0 { + return Ok(Vec::new()); + } + let mut top_collector = BinaryHeap::new(); + for child_fruit in children { + for (feature, doc) in child_fruit { + if top_collector.len() < self.limit { + top_collector.push(ComparableDoc { + feature, + doc + }); + } else { + if let Some(mut head) = top_collector.peek_mut() { + if head.feature < feature { + *head = ComparableDoc { + feature, + doc + }; + } + } + } + } + } + Ok(top_collector + .into_sorted_vec() .into_iter() - .map(|(_feature, doc)| doc) + .map(|cdoc| (cdoc.feature, cdoc.doc)) + .collect()) + } + + pub(crate) fn for_segment(&self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result> { + Ok(TopSegmentCollector::new(segment_id, self.limit)) + } +} + + +/// The Top Collector keeps track of the K documents +/// sorted by type `T`. +/// +/// The implementation is based on a `BinaryHeap`. +/// The theorical complexity for collecting the top `K` out of `n` documents +/// is `O(n log K)`. +pub(crate) struct TopSegmentCollector { + limit: usize, + heap: BinaryHeap>, + segment_id: u32, +} + +impl TopSegmentCollector { + fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector { + TopSegmentCollector { + limit, + heap: BinaryHeap::with_capacity(limit), + segment_id + } + } +} + +impl TopSegmentCollector { + pub fn harvest(self) -> Vec<(T, DocAddress)> { + let segment_id = self.segment_id; + self.heap.into_sorted_vec() + .into_iter() + .map(|comparable_doc| + (comparable_doc.feature, DocAddress(segment_id, comparable_doc.doc)) ) .collect() } - /// Returns K best FeatureDocuments sorted in decreasing order. - /// - /// Calling this method triggers the sort. - /// The result of the sort is not cached. 
- pub fn top_docs(&self) -> Vec<(T, DocAddress)> { - let mut feature_docs: Vec> = self.heap.iter().cloned().collect(); - feature_docs.sort(); - feature_docs - .into_iter() - .map( - |ComparableDoc { - feature, - doc_address, - }| (feature, doc_address), - ).collect() - } /// Return true iff at least K documents have gone through /// the collector. - #[inline] - pub fn at_capacity(&self) -> bool { + #[inline(always)] + pub(crate) fn at_capacity(&self) -> bool { self.heap.len() >= self.limit } - /// Sets the segment local ID for the collector - pub fn set_segment_id(&mut self, segment_id: SegmentLocalId) { - self.segment_id = segment_id; - } - /// Collects a document scored by the given feature /// /// It collects documents until it has reached the max capacity. Once it reaches capacity, it /// will compare the lowest scoring item with the given one and keep whichever is greater. + #[inline(always)] pub fn collect(&mut self, doc: DocId, feature: T) { if self.at_capacity() { // It's ok to unwrap as long as a limit of 0 is forbidden. - let limit_doc: ComparableDoc = self - .heap + if let Some(limit_feature) = self.heap .peek() - .expect("Top collector with size 0 is forbidden") - .clone(); - if limit_doc.feature < feature { - let mut mut_head = self - .heap - .peek_mut() - .expect("Top collector with size 0 is forbidden"); - mut_head.feature = feature; - mut_head.doc_address = DocAddress(self.segment_id, doc); + .map(|head| head.feature.clone()) { + if limit_feature < feature { + if let Some(mut head) = self.heap.peek_mut() { + head.feature = feature; + head.doc = doc; + } + } } } else { - let wrapped_doc = ComparableDoc { + // we have not reached capacity yet, so we can just push the + // element. 
+ self.heap.push(ComparableDoc { feature, - doc_address: DocAddress(self.segment_id, doc), - }; - self.heap.push(wrapped_doc); + doc, + }); } } } #[cfg(test)] mod tests { - use super::*; - use DocId; + use super::{TopCollector, TopSegmentCollector}; + use DocAddress; use Score; #[test] fn test_top_collector_not_at_capacity() { - let mut top_collector = TopCollector::with_limit(4); + let mut top_collector = TopSegmentCollector::new(0, 4); top_collector.collect(1, 0.8); top_collector.collect(3, 0.2); top_collector.collect(5, 0.3); - assert!(!top_collector.at_capacity()); - let score_docs: Vec<(Score, DocId)> = top_collector - .top_docs() - .into_iter() - .map(|(score, doc_address)| (score, doc_address.doc())) - .collect(); - assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]); + assert_eq!( + top_collector.harvest(), + vec![(0.8, DocAddress(0,1)), + (0.3, DocAddress(0,5)), + (0.2, DocAddress(0,3))] + ); } #[test] fn test_top_collector_at_capacity() { - let mut top_collector = TopCollector::with_limit(4); + let mut top_collector = TopSegmentCollector::new(0, 4); top_collector.collect(1, 0.8); top_collector.collect(3, 0.2); top_collector.collect(5, 0.3); top_collector.collect(7, 0.9); top_collector.collect(9, -0.2); - assert!(top_collector.at_capacity()); - { - let score_docs: Vec<(Score, DocId)> = top_collector - .top_docs() - .into_iter() - .map(|(score, doc_address)| (score, doc_address.doc())) - .collect(); - assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]); - } - { - let docs: Vec = top_collector - .docs() - .into_iter() - .map(|doc_address| doc_address.doc()) - .collect(); - assert_eq!(docs, vec![7, 1, 5, 3]); - } + assert_eq!( + top_collector.harvest(), + vec![(0.9, DocAddress(0,7)), + (0.8, DocAddress(0,1)), + (0.3, DocAddress(0,5)), + (0.2, DocAddress(0,3))]); } #[test] @@ -189,5 +219,4 @@ mod tests { fn test_top_0() { let _collector: TopCollector = TopCollector::with_limit(0); } - } diff --git a/src/collector/top_field_collector.rs 
b/src/collector/top_field_collector.rs index 3fb95d21a..f57d747b2 100644 --- a/src/collector/top_field_collector.rs +++ b/src/collector/top_field_collector.rs @@ -1,13 +1,14 @@ use super::Collector; use collector::top_collector::TopCollector; +use collector::SegmentCollector; use fastfield::FastFieldReader; use fastfield::FastValue; use schema::Field; -use DocAddress; -use DocId; use Result; -use Score; use SegmentReader; +use SegmentLocalId; +use collector::top_collector::TopSegmentCollector; +use DocAddress; /// The Top Field Collector keeps track of the K documents /// sorted by a fast field in the index @@ -19,67 +20,57 @@ use SegmentReader; /// ```rust /// #[macro_use] /// extern crate tantivy; -/// use tantivy::schema::{SchemaBuilder, TEXT, FAST}; -/// use tantivy::{Index, Result, DocId}; -/// use tantivy::collector::TopFieldCollector; -/// use tantivy::query::QueryParser; +/// # use tantivy::schema::{Schema, Field, FAST, TEXT}; +/// # use tantivy::{Index, Result, DocAddress}; +/// # use tantivy::query::{Query, QueryParser}; +/// use tantivy::collector::TopDocs; /// -/// # fn main() { example().unwrap(); } -/// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); -/// let title = schema_builder.add_text_field("title", TEXT); -/// let rating = schema_builder.add_u64_field("rating", FAST); -/// let schema = schema_builder.build(); -/// let index = Index::create_in_ram(schema); -/// { -/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; -/// index_writer.add_document(doc!( -/// title => "The Name of the Wind", -/// rating => 92u64, -/// )); -/// index_writer.add_document(doc!( -/// title => "The Diary of Muadib", -/// rating => 97u64, -/// )); -/// index_writer.add_document(doc!( -/// title => "A Dairy Cow", -/// rating => 63u64, -/// )); -/// index_writer.add_document(doc!( -/// title => "The Diary of a Young Girl", -/// rating => 80u64, -/// )); -/// index_writer.commit().unwrap(); -/// } +/// # fn main() { +/// 
# let mut schema_builder = Schema::builder(); +/// # let title = schema_builder.add_text_field("title", TEXT); +/// # let rating = schema_builder.add_u64_field("rating", FAST); +/// # let schema = schema_builder.build(); +/// # let index = Index::create_in_ram(schema); +/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); +/// # index_writer.add_document(doc!( +/// # title => "The Name of the Wind", +/// # rating => 92u64, +/// # )); +/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64)); +/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64)); +/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64)); +/// # index_writer.commit().unwrap(); +/// # index.load_searchers().unwrap(); +/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary").unwrap(); +/// # let top_docs = docs_sorted_by_rating(&index, &query, rating).unwrap(); +/// # assert_eq!(top_docs, +/// # vec![(97u64, DocAddress(0u32, 1)), +/// # (80u64, DocAddress(0u32, 3))]); +/// # } +/// # +/// /// Searches the document matching the given query, and +/// /// collects the top 10 documents, order by the `field` +/// /// given in argument. +/// /// +/// /// `field` is required to be a FAST field. +/// fn docs_sorted_by_rating(index: &Index, query: &Query, sort_by_field: Field) +/// -> Result> { /// -/// index.load_searchers()?; -/// let searcher = index.searcher(); +/// // This is where we build our collector! 
+/// let top_docs_by_rating = TopDocs::with_limit(2).order_by_field(sort_by_field); /// -/// { -/// let mut top_collector = TopFieldCollector::with_limit(rating, 2); -/// let query_parser = QueryParser::for_index(&index, vec![title]); -/// let query = query_parser.parse_query("diary")?; -/// searcher.search(&*query, &mut top_collector).unwrap(); -/// -/// let score_docs: Vec<(u64, DocId)> = top_collector -/// .top_docs() -/// .into_iter() -/// .map(|(field, doc_address)| (field, doc_address.doc())) -/// .collect(); -/// -/// assert_eq!(score_docs, vec![(97u64, 1), (80, 3)]); -/// } -/// -/// Ok(()) +/// // ... and here is our documents. Not this is a simple vec. +/// // The `u64` in the pair is the value of our fast field for each documents. +/// index.searcher() +/// .search(query, &top_docs_by_rating) /// } /// ``` -pub struct TopFieldCollector { - field: Field, +pub struct TopDocsByField { collector: TopCollector, - fast_field: Option>, + field: Field } -impl TopFieldCollector { +impl TopDocsByField { /// Creates a top field collector, with a number of documents equal to "limit". /// /// The given field name must be a fast field, otherwise the collector have an error while @@ -87,78 +78,76 @@ impl TopFieldCollector { /// /// # Panics /// The method panics if limit is 0 - pub fn with_limit(field: Field, limit: usize) -> Self { - TopFieldCollector { - field, + pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField { + TopDocsByField { collector: TopCollector::with_limit(limit), - fast_field: None, + field } } - - /// Returns K best documents sorted the given field name in decreasing order. - /// - /// Calling this method triggers the sort. - /// The result of the sort is not cached. - pub fn docs(&self) -> Vec { - self.collector.docs() - } - - /// Returns K best FieldDocuments sorted in decreasing order. - /// - /// Calling this method triggers the sort. - /// The result of the sort is not cached. 
- pub fn top_docs(&self) -> Vec<(T, DocAddress)> { - self.collector.top_docs() - } - - /// Return true iff at least K documents have gone through - /// the collector. - #[inline] - pub fn at_capacity(&self) -> bool { - self.collector.at_capacity() - } } -impl Collector for TopFieldCollector { - fn set_segment(&mut self, segment_id: u32, segment: &SegmentReader) -> Result<()> { - self.collector.set_segment_id(segment_id); - self.fast_field = Some(segment.fast_field_reader(self.field)?); - Ok(()) - } - fn collect(&mut self, doc: DocId, _score: Score) { - let field_value = self - .fast_field - .as_ref() - .expect("collect() was called before set_segment. This should never happen.") - .get(doc); - self.collector.collect(doc, field_value); +impl Collector for TopDocsByField { + + type Fruit = Vec<(T, DocAddress)>; + + type Child = TopFieldSegmentCollector; + + fn for_segment(&self, segment_local_id: SegmentLocalId, reader: &SegmentReader) -> Result> { + let collector = self.collector.for_segment(segment_local_id, reader)?; + let reader = reader.fast_field_reader(self.field)?; + Ok(TopFieldSegmentCollector { collector, reader }) } fn requires_scoring(&self) -> bool { false } + + fn merge_fruits(&self, segment_fruits: Vec>) -> Result> { + self.collector.merge_fruits(segment_fruits) + } +} + +pub struct TopFieldSegmentCollector { + collector: TopSegmentCollector, + reader: FastFieldReader, +} + +impl SegmentCollector for TopFieldSegmentCollector { + + type Fruit = Vec<(T, DocAddress)>; + + fn collect(&mut self, doc: u32, _score: f32) { + let field_value = self.reader.get(doc); + self.collector.collect(doc, field_value); + } + + fn harvest(self) -> Vec<(T, DocAddress)> { + self.collector.harvest() + } } #[cfg(test)] mod tests { - use super::*; + use super::TopDocsByField; use query::Query; use query::QueryParser; use schema::Field; use schema::IntOptions; - use schema::Schema; - use schema::{SchemaBuilder, FAST, TEXT}; + use schema::{Schema, FAST, TEXT}; use Index; use 
IndexWriter; use TantivyError; + use collector::Collector; + use DocAddress; + use collector::TopDocs; const TITLE: &str = "title"; const SIZE: &str = "size"; #[test] fn test_top_collector_not_at_capacity() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let title = schema_builder.add_text_field(TITLE, TEXT); let size = schema_builder.add_u64_field(SIZE, FAST); let schema = schema_builder.build(); @@ -178,22 +167,18 @@ mod tests { }); let searcher = index.searcher(); - let mut top_collector = TopFieldCollector::with_limit(size, 4); - searcher.search(&*query, &mut top_collector).unwrap(); - assert!(!top_collector.at_capacity()); - - let score_docs: Vec<(u64, DocId)> = top_collector - .top_docs() - .into_iter() - .map(|(field, doc_address)| (field, doc_address.doc())) - .collect(); - assert_eq!(score_docs, vec![(64, 1), (16, 2), (12, 0)]); + let top_collector = TopDocs::with_limit(4).order_by_field(size); + let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap(); + assert_eq!(top_docs, vec![ + (64, DocAddress(0,1)), + (16, DocAddress(0,2)), + (12, DocAddress(0,0))]); } #[test] #[should_panic] fn test_field_does_not_exist() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let title = schema_builder.add_text_field(TITLE, TEXT); let size = schema_builder.add_u64_field(SIZE, FAST); let schema = schema_builder.build(); @@ -204,14 +189,16 @@ mod tests { )); }); let searcher = index.searcher(); - let segment = searcher.segment_reader(0); - let mut top_collector: TopFieldCollector = TopFieldCollector::with_limit(Field(2), 4); - let _ = top_collector.set_segment(0, segment); + let top_collector: TopDocsByField = + TopDocs::with_limit(4).order_by_field(Field(2)); + let segment_reader = searcher.segment_reader(0u32); + top_collector.for_segment(0, segment_reader) + .expect("should panic"); } #[test] fn test_field_not_fast_field() { - let mut 
schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let title = schema_builder.add_text_field(TITLE, TEXT); let size = schema_builder.add_u64_field(SIZE, IntOptions::default()); let schema = schema_builder.build(); @@ -223,26 +210,13 @@ mod tests { }); let searcher = index.searcher(); let segment = searcher.segment_reader(0); - let mut top_collector: TopFieldCollector = TopFieldCollector::with_limit(size, 4); + let top_collector: TopDocsByField = TopDocs::with_limit(4).order_by_field(size); assert_matches!( - top_collector.set_segment(0, segment), - Err(TantivyError::FastFieldError(_)) + top_collector.for_segment(0, segment).map(|_| ()).unwrap_err(), + TantivyError::FastFieldError(_) ); } - #[test] - #[should_panic] - fn test_collect_before_set_segment() { - let mut top_collector: TopFieldCollector = TopFieldCollector::with_limit(Field(0), 4); - top_collector.collect(0, 0f32); - } - - #[test] - #[should_panic] - fn test_top_0() { - let _: TopFieldCollector = TopFieldCollector::with_limit(Field(0), 0); - } - fn index( query: &str, query_field: Field, diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 68bf114f6..ccc976ceb 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -1,11 +1,16 @@ use super::Collector; -use collector::top_collector::TopCollector; -use DocAddress; +use collector::top_collector::TopSegmentCollector; use DocId; use Result; use Score; use SegmentLocalId; use SegmentReader; +use collector::SegmentCollector; +use collector::top_collector::TopCollector; +use DocAddress; +use collector::TopDocsByField; +use schema::Field; +use fastfield::FastValue; /// The Top Score Collector keeps track of the K documents /// sorted by their score. 
@@ -17,14 +22,15 @@ use SegmentReader; /// ```rust /// #[macro_use] /// extern crate tantivy; -/// use tantivy::schema::{SchemaBuilder, TEXT}; -/// use tantivy::{Index, Result, DocId, Score}; -/// use tantivy::collector::TopScoreCollector; +/// use tantivy::DocAddress; +/// use tantivy::schema::{Schema, TEXT}; +/// use tantivy::{Index, Result}; +/// use tantivy::collector::TopDocs; /// use tantivy::query::QueryParser; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); +/// let mut schema_builder = Schema::builder(); /// let title = schema_builder.add_text_field("title", TEXT); /// let schema = schema_builder.build(); /// let index = Index::create_in_ram(schema); @@ -48,140 +54,136 @@ use SegmentReader; /// index.load_searchers()?; /// let searcher = index.searcher(); /// -/// { -/// let mut top_collector = TopScoreCollector::with_limit(2); -/// let query_parser = QueryParser::for_index(&index, vec![title]); -/// let query = query_parser.parse_query("diary")?; -/// searcher.search(&*query, &mut top_collector).unwrap(); +/// let query_parser = QueryParser::for_index(&index, vec![title]); +/// let query = query_parser.parse_query("diary")?; +/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?; /// -/// let score_docs: Vec<(Score, DocId)> = top_collector -/// .top_docs() -/// .into_iter() -/// .map(|(score, doc_address)| (score, doc_address.doc())) -/// .collect(); -/// -/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]); -/// } +/// assert_eq!(&top_docs[0], &(0.7261542, DocAddress(0, 1))); +/// assert_eq!(&top_docs[1], &(0.6099695, DocAddress(0, 3))); /// /// Ok(()) /// } /// ``` -pub struct TopScoreCollector { - collector: TopCollector, -} +pub struct TopDocs(TopCollector); -impl TopScoreCollector { + +impl TopDocs { /// Creates a top score collector, with a number of documents equal to "limit". 
/// /// # Panics /// The method panics if limit is 0 - pub fn with_limit(limit: usize) -> TopScoreCollector { - TopScoreCollector { - collector: TopCollector::with_limit(limit), - } + pub fn with_limit(limit: usize) -> TopDocs { + TopDocs(TopCollector::with_limit(limit)) } - /// Returns K best scored documents sorted in decreasing order. + /// Set top-K to rank documents by a given fast field. /// - /// Calling this method triggers the sort. - /// The result of the sort is not cached. - pub fn docs(&self) -> Vec { - self.collector.docs() - } - - /// Returns K best ScoredDocuments sorted in decreasing order. - /// - /// Calling this method triggers the sort. - /// The result of the sort is not cached. - pub fn top_docs(&self) -> Vec<(Score, DocAddress)> { - self.collector.top_docs() - } - - /// Returns K best ScoredDocuments sorted in decreasing order. - /// - /// Calling this method triggers the sort. - /// The result of the sort is not cached. - #[deprecated] - pub fn score_docs(&self) -> Vec<(Score, DocAddress)> { - self.collector.top_docs() - } - - /// Return true iff at least K documents have gone through - /// the collector. - #[inline] - pub fn at_capacity(&self) -> bool { - self.collector.at_capacity() + /// (By default, `TopDocs` collects the top-K documents sorted by + /// the similarity score.) 
+ pub fn order_by_field(self, field: Field) -> TopDocsByField { + TopDocsByField::new(field, self.0.limit()) } } -impl Collector for TopScoreCollector { - fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> { - self.collector.set_segment_id(segment_id); - Ok(()) - } +impl Collector for TopDocs { - fn collect(&mut self, doc: DocId, score: Score) { - self.collector.collect(doc, score); + type Fruit = Vec<(Score, DocAddress)>; + + type Child = TopScoreSegmentCollector; + + fn for_segment(&self, segment_local_id: SegmentLocalId, reader: &SegmentReader) -> Result { + let collector = self.0.for_segment(segment_local_id, reader)?; + Ok(TopScoreSegmentCollector(collector)) } fn requires_scoring(&self) -> bool { true } + + fn merge_fruits(&self, child_fruits: Vec>) -> Result { + self.0.merge_fruits(child_fruits) + } } +/// Segment Collector associated to `TopDocs`. +pub struct TopScoreSegmentCollector(TopSegmentCollector); + +impl SegmentCollector for TopScoreSegmentCollector { + type Fruit = Vec<(Score, DocAddress)>; + + fn collect(&mut self, doc: DocId, score: Score) { + self.0.collect(doc, score) + } + + fn harvest(self) -> Vec<(Score, DocAddress)> { + self.0.harvest() + } +} + + + + #[cfg(test)] mod tests { - use super::*; - use collector::Collector; - use DocId; + use super::TopDocs; use Score; + use schema::Schema; + use Index; + use schema::TEXT; + use query::QueryParser; + use DocAddress; + + fn make_index() -> Index { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + // writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + index_writer.add_document(doc!(text_field=>"Hello happy tax payer.")); + index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer")); + index_writer.add_document(doc!(text_field=>"I like Droopy")); 
+ assert!(index_writer.commit().is_ok()); + } + index.load_searchers().unwrap(); + index + } + #[test] fn test_top_collector_not_at_capacity() { - let mut top_collector = TopScoreCollector::with_limit(4); - top_collector.collect(1, 0.8); - top_collector.collect(3, 0.2); - top_collector.collect(5, 0.3); - assert!(!top_collector.at_capacity()); - let score_docs: Vec<(Score, DocId)> = top_collector - .top_docs() - .into_iter() - .map(|(score, doc_address)| (score, doc_address.doc())) - .collect(); - assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]); + let index = make_index(); + let field = index.schema().get_field("text").unwrap(); + let query_parser = QueryParser::for_index(&index, vec![field]); + let text_query = query_parser.parse_query("droopy tax").unwrap(); + let score_docs: Vec<(Score, DocAddress)> = index.searcher().search(&text_query, &TopDocs::with_limit(4)).unwrap(); + assert_eq!(score_docs, vec![ + (0.81221175, DocAddress(0u32, 1)), + (0.5376842, DocAddress(0u32, 2)), + (0.48527452, DocAddress(0, 0)) + ]); } + #[test] fn test_top_collector_at_capacity() { - let mut top_collector = TopScoreCollector::with_limit(4); - top_collector.collect(1, 0.8); - top_collector.collect(3, 0.2); - top_collector.collect(5, 0.3); - top_collector.collect(7, 0.9); - top_collector.collect(9, -0.2); - assert!(top_collector.at_capacity()); - { - let score_docs: Vec<(Score, DocId)> = top_collector - .top_docs() - .into_iter() - .map(|(score, doc_address)| (score, doc_address.doc())) - .collect(); - assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]); - } - { - let docs: Vec = top_collector - .docs() - .into_iter() - .map(|doc_address| doc_address.doc()) - .collect(); - assert_eq!(docs, vec![7, 1, 5, 3]); - } + let index = make_index(); + let field = index.schema().get_field("text").unwrap(); + let query_parser = QueryParser::for_index(&index, vec![field]); + let text_query = query_parser.parse_query("droopy tax").unwrap(); + let score_docs: Vec<(Score, 
DocAddress)> = index.searcher().search(&text_query, &TopDocs::with_limit(2)).unwrap(); + assert_eq!(score_docs, vec![ + (0.81221175, DocAddress(0u32, 1)), + (0.5376842, DocAddress(0u32, 2)), + ]); } #[test] #[should_panic] fn test_top_0() { - TopScoreCollector::with_limit(0); + TopDocs::with_limit(0); } } + diff --git a/src/core/executor.rs b/src/core/executor.rs new file mode 100644 index 000000000..490d68630 --- /dev/null +++ b/src/core/executor.rs @@ -0,0 +1,104 @@ +use Result; +use scoped_pool::{Pool, ThreadConfig}; +use crossbeam::channel; + +/// Search executor whether search request are single thread or multithread. +/// +/// We don't expose Rayon thread pool directly here for several reasons. +/// +/// First dependency hell. It is not a good idea to expose the +/// API of a dependency, knowing it might conflict with a different version +/// used by the client. Second, we may stop using rayon in the future. +pub enum Executor { + SingleThread, + ThreadPool(Pool), +} + +impl Executor { + /// Creates an Executor that performs all task in the caller thread. + pub fn single_thread() -> Executor { + Executor::SingleThread + } + + // Creates an Executor that dispatches the tasks in a thread pool. + pub fn multi_thread(num_threads: usize, prefix: &'static str) -> Executor { + let thread_config = ThreadConfig::new().prefix(prefix); + let pool = Pool::with_thread_config(num_threads, thread_config); + Executor::ThreadPool(pool) + } + + // Perform a map in the thread pool. + // + // Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task + // will propagate to the caller. 
+ pub fn map_unsorted, F: Sized + Sync + Fn(A) -> Result>(&self, f: F, args: AIterator) -> Result> { + match self { + Executor::SingleThread => { + args.map(f).collect::>() + } + Executor::ThreadPool(pool) => { + let fruit_receiver = { + let (fruit_sender, fruit_receiver) = channel::unbounded(); + pool.scoped(|scope| { + for arg in args { + scope.execute(|| { + let fruit = f(arg); + if let Err(err) = fruit_sender.send(fruit) { + error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err); + } + }); + } + }); + fruit_receiver + // This ends the scope of fruit_sender. + // This is important as it makes it possible for the fruit_receiver iteration to + // terminate. + }; + fruit_receiver + .into_iter() + .collect::>() + } + } + } +} + +#[cfg(test)] +mod tests { + + use super::Executor; + + #[test] + #[should_panic(expected="panic should propagate")] + fn test_panic_propagates_single_thread() { + let _result: Vec = Executor::single_thread().map_unsorted(|_| {panic!("panic should propagate"); }, vec![0].into_iter()).unwrap(); + } + + #[test] + #[should_panic] //< unfortunately the panic message is not propagated + fn test_panic_propagates_multi_thread() { + let _result: Vec = Executor::multi_thread(1, "search-test") + .map_unsorted(|_| {panic!("panic should propagate"); }, vec![0].into_iter()).unwrap(); + } + + #[test] + fn test_map_singlethread() { + let result: Vec = Executor::single_thread() + .map_unsorted(|i| { Ok(i * 2) }, 0..1_000).unwrap(); + assert_eq!(result.len(), 1_000); + for i in 0..1_000 { + assert_eq!(result[i], i * 2); + } + } + + #[test] + fn test_map_multithread() { + let mut result: Vec = Executor::multi_thread(3, "search-test") + .map_unsorted(|i| Ok(i * 2), 0..10).unwrap(); + assert_eq!(result.len(), 10); + result.sort(); + for i in 0..10 { + assert_eq!(result[i], i * 2); + } + } + +} \ No newline at end of file diff --git a/src/core/index.rs b/src/core/index.rs index 103db95c1..e1ea363d6 100644 --- 
a/src/core/index.rs +++ b/src/core/index.rs @@ -31,6 +31,7 @@ use tokenizer::BoxedTokenizer; use tokenizer::TokenizerManager; use IndexWriter; use Result; +use core::Executor; fn load_metas(directory: &Directory) -> Result { let meta_data = directory.atomic_read(&META_FILEPATH)?; @@ -45,6 +46,7 @@ pub struct Index { schema: Schema, num_searchers: Arc, searcher_pool: Arc>, + executor: Arc, tokenizers: TokenizerManager, } @@ -54,6 +56,29 @@ impl Index { dir.exists(&META_FILEPATH) } + /// Accessor to the search executor. + /// + /// This pool is used by default when calling `searcher.search(...)` + /// to perform search on the individual segments. + /// + /// By default the executor is single thread, and simply runs in the calling thread. + pub fn search_executor(&self) -> &Executor { + self.executor.as_ref() + } + + /// Replace the default single thread search executor pool + /// by a thread pool with a given number of threads. + pub fn set_multithread_executor(&mut self, num_threads: usize) { + self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-")); + } + + /// Replace the default single thread search executor pool + /// by a thread pool with a given number of threads. + pub fn set_default_multithread_executor(&mut self) { + let default_num_threads = num_cpus::get(); + self.set_multithread_executor(default_num_threads); + } + /// Creates a new index using the `RAMDirectory`. /// /// The index will be allocated in anonymous memory. 
@@ -131,6 +156,7 @@ impl Index { num_searchers: Arc::new(AtomicUsize::new(n_cpus)), searcher_pool: Arc::new(Pool::new()), tokenizers: TokenizerManager::default(), + executor: Arc::new(Executor::single_thread()), }; index.load_searchers()?; Ok(index) @@ -348,19 +374,20 @@ impl Clone for Index { num_searchers: Arc::clone(&self.num_searchers), searcher_pool: Arc::clone(&self.searcher_pool), tokenizers: self.tokenizers.clone(), + executor: self.executor.clone(), } } } #[cfg(test)] mod tests { - use schema::{Schema, SchemaBuilder, INT_INDEXED, TEXT}; + use schema::{Schema, INT_INDEXED, TEXT}; use Index; use directory::RAMDirectory; #[test] fn test_indexer_for_field() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED); let body_field = schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); @@ -402,7 +429,7 @@ mod tests { let directory = RAMDirectory::create(); assert!(Index::create(directory.clone(), throw_away_schema()).is_ok()); assert!(Index::exists(&directory)); - assert!(Index::create(directory.clone(), SchemaBuilder::default().build()).is_ok()); + assert!(Index::create(directory.clone(), Schema::builder().build()).is_ok()); } #[test] @@ -411,12 +438,12 @@ mod tests { assert!(Index::create(directory.clone(), throw_away_schema()).is_ok()); assert!(Index::exists(&directory)); assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok()); - let err = Index::open_or_create(directory, SchemaBuilder::default().build()); + let err = Index::open_or_create(directory, Schema::builder().build()); assert_eq!(format!("{:?}", err.unwrap_err()), "SchemaError(\"An index exists but the schema does not match.\")"); } fn throw_away_schema() -> Schema { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED); 
schema_builder.build() } diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index ecef75d02..c5d33e370 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -46,13 +46,13 @@ impl fmt::Debug for IndexMeta { mod tests { use super::IndexMeta; - use schema::{SchemaBuilder, TEXT}; + use schema::{Schema, TEXT}; use serde_json; #[test] fn test_serialize_metas() { let schema = { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); schema_builder.add_text_field("text", TEXT); schema_builder.build() }; diff --git a/src/core/mod.rs b/src/core/mod.rs index 062b537ee..fa9772790 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -7,8 +7,10 @@ mod segment; mod segment_component; mod segment_id; mod segment_meta; +mod executor; mod segment_reader; +pub use self::executor::Executor; pub use self::index::Index; pub use self::index_meta::IndexMeta; pub use self::inverted_index_reader::InvertedIndexReader; diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 826bf4501..acece093f 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -12,6 +12,29 @@ use termdict::TermMerger; use DocAddress; use Index; use Result; +use store::StoreReader; +use query::Weight; +use query::Scorer; +use collector::SegmentCollector; +use core::Executor; + +fn collect_segment(collector: &C, + weight: &Weight, + segment_ord: u32, + segment_reader: &SegmentReader) -> Result { + let mut scorer = weight.scorer(segment_reader)?; + let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?; + if let Some(delete_bitset) = segment_reader.delete_bitset() { + scorer.for_each(&mut |doc, score| + if !delete_bitset.is_deleted(doc) { + segment_collector.collect(doc, score); + }); + } else { + scorer.for_each(&mut |doc, score| + segment_collector.collect(doc, score)); + } + Ok(segment_collector.harvest()) +} /// Holds a list of `SegmentReader`s ready for search. 
/// @@ -22,6 +45,7 @@ pub struct Searcher { schema: Schema, index: Index, segment_readers: Vec, + store_readers: Vec } impl Searcher { @@ -29,12 +53,15 @@ impl Searcher { pub(crate) fn new( schema: Schema, index: Index, - segment_readers: Vec, - ) -> Searcher { + segment_readers: Vec) -> Searcher { + let store_readers = segment_readers.iter() + .map(|segment_reader| segment_reader.get_store_reader()) + .collect(); Searcher { schema, index, segment_readers, + store_readers } } @@ -49,8 +76,8 @@ impl Searcher { /// the request to the right `Segment`. pub fn doc(&self, doc_address: DocAddress) -> Result { let DocAddress(segment_local_id, doc_id) = doc_address; - let segment_reader = &self.segment_readers[segment_local_id as usize]; - segment_reader.doc(doc_id) + let store_reader = &self.store_readers[segment_local_id as usize]; + store_reader.get(doc_id) } /// Access the schema associated to the index of this searcher. @@ -86,9 +113,50 @@ impl Searcher { &self.segment_readers[segment_ord as usize] } - /// Runs a query on the segment readers wrapped by the searcher - pub fn search(&self, query: &Query, collector: &mut C) -> Result<()> { - query.search(self, collector) + /// Runs a query on the segment readers wrapped by the searcher. + /// + /// Search works as follows : + /// + /// First the weight object associated to the query is created. + /// + /// Then, the query loops over the segments and for each segment : + /// - setup the collector and informs it that the segment being processed has changed. + /// - creates a SegmentCollector for collecting documents associated to the segment + /// - creates a `Scorer` object associated for this segment + /// - iterate through the matched documents and push them to the segment collector. + /// + /// Finally, the Collector merges each of the child collectors into itself for result usability + /// by the caller. 
+ pub fn search(&self, query: &Query, collector: &C) -> Result { + let executor = self.index.search_executor(); + self.search_with_executor(query, collector, executor) + } + + /// Same as [`search(...)`](#method.search) but multithreaded. + /// + /// The current implementation is rather naive : + /// multithreading is by splitting search into as many task + /// as there are segments. + /// + /// It is powerless at making search faster if your index consists in + /// one large segment. + /// + /// Also, keep in my multithreading a single query on several + /// threads will not improve your throughput. It can actually + /// hurt it. It will however, decrease the average response time. + pub fn search_with_executor(&self, + query: &Query, + collector: &C, + executor: &Executor) -> Result { + let scoring_enabled = collector.requires_scoring(); + let weight = query.weight(self, scoring_enabled)?; + let segment_readers = self.segment_readers(); + let fruits = executor + .map_unsorted(|(segment_ord, segment_reader)| { + collect_segment(collector, weight.as_ref(), segment_ord as u32, segment_reader) + }, + segment_readers.iter().enumerate())?; + collector.merge_fruits(fruits) } /// Return the field searcher associated to a `Field`. 
diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 54b465e77..e08148c27 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -12,7 +12,6 @@ use fastfield::{self, FastFieldNotAvailableError}; use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader}; use fieldnorm::FieldNormReader; use schema::Cardinality; -use schema::Document; use schema::Field; use schema::FieldType; use schema::Schema; @@ -25,6 +24,7 @@ use store::StoreReader; use termdict::TermDictionary; use DocId; use Result; +use directory::ReadOnlySource; /// Entry point to access all of the datastructures of the `Segment` /// @@ -54,7 +54,7 @@ pub struct SegmentReader { fast_fields_composite: CompositeFile, fieldnorms_composite: CompositeFile, - store_reader: StoreReader, + store_source: ReadOnlySource, delete_bitset_opt: Option, schema: Schema, } @@ -197,8 +197,7 @@ impl SegmentReader { /// Accessor to the segment's `Field norms`'s reader. /// /// Field norms are the length (in tokens) of the fields. - /// It is used in the computation of the [TfIdf] - /// (https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html). + /// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html). /// /// They are simply stored as a fast field, serialized in /// the `.fieldnorm` file of the segment. @@ -216,8 +215,8 @@ impl SegmentReader { } /// Accessor to the segment's `StoreReader`. - pub fn get_store_reader(&self) -> &StoreReader { - &self.store_reader + pub fn get_store_reader(&self) -> StoreReader { + StoreReader::from_source(self.store_source.clone()) } /// Open a new segment for reading. 
@@ -226,7 +225,6 @@ impl SegmentReader { let termdict_composite = CompositeFile::open(&termdict_source)?; let store_source = segment.open_read(SegmentComponent::STORE)?; - let store_reader = StoreReader::from_source(store_source); fail_point!("SegmentReader::open#middle"); @@ -272,7 +270,7 @@ impl SegmentReader { fast_fields_composite, fieldnorms_composite, segment_id: segment.id(), - store_reader, + store_source, delete_bitset_opt, positions_composite, positions_idx_composite, @@ -351,14 +349,6 @@ impl SegmentReader { inv_idx_reader } - /// Returns the document (or to be accurate, its stored field) - /// bearing the given doc id. - /// This method is slow and should seldom be called from - /// within a collector. - pub fn doc(&self, doc_id: DocId) -> Result { - self.store_reader.get(doc_id) - } - /// Returns the segment id pub fn segment_id(&self) -> SegmentId { self.segment_id @@ -393,7 +383,7 @@ impl SegmentReader { self.positions_idx_composite.space_usage(), self.fast_fields_composite.space_usage(), self.fieldnorms_composite.space_usage(), - self.store_reader.space_usage(), + self.get_store_reader().space_usage(), self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0), ) } @@ -454,12 +444,12 @@ impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> { #[cfg(test)] mod test { use core::Index; - use schema::{SchemaBuilder, Term, STORED, TEXT}; + use schema::{Schema, Term, STORED, TEXT}; use DocId; #[test] fn test_alive_docs_iterator() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); schema_builder.add_text_field("name", TEXT | STORED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); diff --git a/src/error.rs b/src/error.rs index c651ad70f..5fe30712f 100644 --- a/src/error.rs +++ b/src/error.rs @@ -15,13 +15,13 @@ use std::sync::PoisonError; #[derive(Debug, Fail)] pub enum TantivyError { /// Path does not exist. 
- #[fail(display = "path does not exist: '{:?}'", _0)] + #[fail(display = "Path does not exist: '{:?}'", _0)] PathDoesNotExist(PathBuf), /// File already exists, this is a problem when we try to write into a new file. - #[fail(display = "file already exists: '{:?}'", _0)] + #[fail(display = "File already exists: '{:?}'", _0)] FileAlreadyExists(PathBuf), /// Index already exists in this directory - #[fail(display = "index already exists")] + #[fail(display = "Index already exists")] IndexAlreadyExists, /// Failed to acquire file lock #[fail( @@ -30,28 +30,29 @@ pub enum TantivyError { )] LockFailure(LockType), /// IO Error. - #[fail(display = "an IO error occurred: '{}'", _0)] + #[fail(display = "An IO error occurred: '{}'", _0)] IOError(#[cause] IOError), - /// The data within is corrupted. - /// - /// For instance, it contains invalid JSON. - #[fail(display = "file contains corrupted data: '{:?}'", _0)] + /// Data corruption. + #[fail(display = "File contains corrupted data: '{:?}'", _0)] CorruptedFile(PathBuf), /// A thread holding the locked panicked and poisoned the lock. - #[fail(display = "a thread holding the locked panicked and poisoned the lock")] + #[fail(display = "A thread holding the locked panicked and poisoned the lock")] Poisoned, /// Invalid argument was passed by the user. - #[fail(display = "an invalid argument was passed: '{}'", _0)] + #[fail(display = "An invalid argument was passed: '{}'", _0)] InvalidArgument(String), /// An Error happened in one of the thread. - #[fail(display = "an error occurred in a thread: '{}'", _0)] + #[fail(display = "An error occurred in a thread: '{}'", _0)] ErrorInThread(String), /// An Error appeared related to the schema. #[fail(display = "Schema error: '{}'", _0)] SchemaError(String), /// Tried to access a fastfield reader for a field not configured accordingly. 
- #[fail(display = "fast field not available: '{:?}'", _0)] + #[fail(display = "Fast field not available: '{:?}'", _0)] FastFieldError(#[cause] FastFieldNotAvailableError), + /// System error. (e.g.: We failed spawning a new thread) + #[fail(display = "System error.'{}'", _0)] + SystemError(String), } impl From for TantivyError { diff --git a/src/fastfield/bytes/mod.rs b/src/fastfield/bytes/mod.rs index 48c021f7c..1a551ecc0 100644 --- a/src/fastfield/bytes/mod.rs +++ b/src/fastfield/bytes/mod.rs @@ -6,12 +6,12 @@ pub use self::writer::BytesFastFieldWriter; #[cfg(test)] mod tests { - use schema::SchemaBuilder; + use schema::Schema; use Index; #[test] fn test_bytes() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let field = schema_builder.add_bytes_field("bytesfield"); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 02694e9f5..6111bfe25 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -133,13 +133,13 @@ mod tests { use rand::prelude::SliceRandom; use schema::Field; use schema::FAST; - use schema::{Schema, SchemaBuilder}; + use schema::Schema; use std::collections::HashMap; use std::path::Path; lazy_static! 
{ pub static ref SCHEMA: Schema = { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); schema_builder.add_u64_field("field", FAST); schema_builder.build() }; @@ -298,7 +298,7 @@ mod tests { fn test_signed_intfastfield() { let path = Path::new("test"); let mut directory: RAMDirectory = RAMDirectory::create(); - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let i64_field = schema_builder.add_i64_field("field", FAST); let schema = schema_builder.build(); @@ -342,7 +342,7 @@ mod tests { fn test_signed_intfastfield_default_val() { let path = Path::new("test"); let mut directory: RAMDirectory = RAMDirectory::create(); - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let i64_field = schema_builder.add_i64_field("field", FAST); let schema = schema_builder.build(); diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 3d78e46a0..733f211bb 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -9,12 +9,12 @@ mod tests { use schema::Cardinality; use schema::IntOptions; - use schema::SchemaBuilder; + use schema::Schema; use Index; #[test] fn test_multivalued_u64() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let field = schema_builder.add_u64_field( "multifield", IntOptions::default().set_fast(Cardinality::MultiValues), @@ -49,7 +49,7 @@ mod tests { #[test] fn test_multivalued_i64() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let field = schema_builder.add_i64_field( "multifield", IntOptions::default().set_fast(Cardinality::MultiValues), diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index deae3a61f..e56644faf 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -47,11 +47,11 @@ impl 
MultiValueIntFastFieldReader { mod tests { use core::Index; - use schema::{Document, Facet, SchemaBuilder}; + use schema::{Document, Facet, Schema}; #[test] fn test_multifastfield_reader() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let facet_field = schema_builder.add_facet_field("facets"); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 6df8e3775..d725c0811 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -7,7 +7,7 @@ use directory::ReadOnlySource; use directory::{Directory, RAMDirectory, WritePtr}; use fastfield::{FastFieldSerializer, FastFieldsWriter}; use owning_ref::OwningRef; -use schema::SchemaBuilder; +use schema::Schema; use schema::FAST; use std::collections::HashMap; use std::marker::PhantomData; @@ -108,7 +108,7 @@ impl FastFieldReader { impl From> for FastFieldReader { fn from(vals: Vec) -> FastFieldReader { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let field = schema_builder.add_u64_field("field", FAST); let schema = schema_builder.build(); let path = Path::new("__dummy__"); diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs index 4746c1407..12370608d 100644 --- a/src/fieldnorm/mod.rs +++ b/src/fieldnorm/mod.rs @@ -15,7 +15,7 @@ //! precompute computationally expensive functions of the fieldnorm //! in a very short array. //! -//! This trick is used by the [BM25 similarity](). +//! This trick is used by the BM25 similarity. 
mod code; mod reader; mod serializer; diff --git a/src/functional_test.rs b/src/functional_test.rs index f62bd039f..7e023c29e 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -15,7 +15,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet) { #[ignore] #[cfg(feature = "mmap")] fn test_indexing() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let id_field = schema_builder.add_u64_field("id", INT_INDEXED); let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED); diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 78d84085f..4c800403a 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -387,10 +387,8 @@ impl IndexWriter { let mem_budget = self.heap_size_in_bytes_per_thread; let join_handle: JoinHandle> = thread::Builder::new() - .name(format!( - "indexing thread {} for gen {}", - self.worker_id, generation - )).spawn(move || { + .name(format!("thrd-tantivy-index{}-gen{}", self.worker_id, generation)) + .spawn(move || { loop { let mut document_iterator = document_receiver_clone.clone().into_iter().peekable(); @@ -660,7 +658,7 @@ mod tests { #[test] fn test_lockfile_stops_duplicates() { - let schema_builder = schema::SchemaBuilder::default(); + let schema_builder = schema::Schema::builder(); let index = Index::create_in_ram(schema_builder.build()); let _index_writer = index.writer(40_000_000).unwrap(); match index.writer(40_000_000) { @@ -671,7 +669,7 @@ mod tests { #[test] fn test_lockfile_already_exists_error_msg() { - let schema_builder = schema::SchemaBuilder::default(); + let schema_builder = schema::Schema::builder(); let index = Index::create_in_ram(schema_builder.build()); let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); match index.writer_with_num_threads(1, 3_000_000) { @@ -686,7 +684,7 @@ mod tests { #[test] fn test_set_merge_policy() { - let schema_builder = 
schema::SchemaBuilder::default(); + let schema_builder = schema::Schema::builder(); let index = Index::create_in_ram(schema_builder.build()); let index_writer = index.writer(40_000_000).unwrap(); assert_eq!( @@ -704,7 +702,7 @@ mod tests { #[test] fn test_lockfile_released_on_drop() { - let schema_builder = schema::SchemaBuilder::default(); + let schema_builder = schema::Schema::builder(); let index = Index::create_in_ram(schema_builder.build()); { let _index_writer = index.writer(40_000_000).unwrap(); @@ -716,7 +714,7 @@ mod tests { #[test] fn test_commit_and_rollback() { - let mut schema_builder = schema::SchemaBuilder::default(); + let mut schema_builder = schema::Schema::builder(); let text_field = schema_builder.add_text_field("text", schema::TEXT); let index = Index::create_in_ram(schema_builder.build()); @@ -750,7 +748,7 @@ mod tests { #[test] fn test_with_merges() { - let mut schema_builder = schema::SchemaBuilder::default(); + let mut schema_builder = schema::Schema::builder(); let text_field = schema_builder.add_text_field("text", schema::TEXT); let index = Index::create_in_ram(schema_builder.build()); let num_docs_containing = |s: &str| { @@ -787,7 +785,7 @@ mod tests { #[test] fn test_prepare_with_commit_message() { - let mut schema_builder = schema::SchemaBuilder::default(); + let mut schema_builder = schema::Schema::builder(); let text_field = schema_builder.add_text_field("text", schema::TEXT); let index = Index::create_in_ram(schema_builder.build()); @@ -821,7 +819,7 @@ mod tests { #[test] fn test_prepare_but_rollback() { - let mut schema_builder = schema::SchemaBuilder::default(); + let mut schema_builder = schema::Schema::builder(); let text_field = schema_builder.add_text_field("text", schema::TEXT); let index = Index::create_in_ram(schema_builder.build()); @@ -869,7 +867,7 @@ mod tests { #[test] fn test_write_commit_fails() { use fail; - let mut schema_builder = schema::SchemaBuilder::default(); + let mut schema_builder = 
schema::Schema::builder(); let text_field = schema_builder.add_text_field("text", schema::TEXT); let index = Index::create_in_ram(schema_builder.build()); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index a42ea6d44..f8f34a8d8 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -614,7 +614,7 @@ impl IndexMerger { store_writer.store(&doc)?; } } else { - store_writer.stack(store_reader)?; + store_writer.stack(&store_reader)?; } } Ok(()) @@ -635,13 +635,13 @@ impl SerializableSegment for IndexMerger { #[cfg(test)] mod tests { use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; - use collector::chain; use collector::tests::TestCollector; use collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector}; - use collector::FacetCollector; + use collector::{Count, FacetCollector}; use core::Index; use futures::Future; use query::AllQuery; + use schema::Facet; use query::BooleanQuery; use query::TermQuery; use schema; @@ -658,7 +658,7 @@ mod tests { #[test] fn test_index_merger_no_deletes() { - let mut schema_builder = schema::SchemaBuilder::default(); + let mut schema_builder = schema::Schema::builder(); let text_fieldtype = schema::TextOptions::default() .set_indexing_options( TextFieldIndexing::default() @@ -742,28 +742,30 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let get_doc_ids = |terms: Vec| { - let mut collector = TestCollector::default(); let query = BooleanQuery::new_multiterms_query(terms); - assert!(searcher.search(&query, &mut collector).is_ok()); - collector.docs() + let top_docs = searcher.search(&query, &TestCollector).unwrap(); + top_docs.docs().to_vec() }; { assert_eq!( get_doc_ids(vec![Term::from_field_text(text_field, "a")]), - vec![1, 2, 4] + vec![DocAddress(0,1), DocAddress(0,2), DocAddress(0,4)] ); assert_eq!( get_doc_ids(vec![Term::from_field_text(text_field, "af")]), - vec![0, 3] + vec![DocAddress(0,0), DocAddress(0,3)] ); assert_eq!( 
get_doc_ids(vec![Term::from_field_text(text_field, "g")]), - vec![4] + vec![DocAddress(0,4)] ); assert_eq!( get_doc_ids(vec![Term::from_field_text(text_field, "b")]), - vec![0, 1, 2, 3, 4] - ); + vec![DocAddress(0,0), + DocAddress(0,1), + DocAddress(0,2), + DocAddress(0,3), + DocAddress(0,4)]); } { let doc = searcher.doc(DocAddress(0, 0)).unwrap(); @@ -788,17 +790,13 @@ mod tests { { let get_fast_vals = |terms: Vec| { let query = BooleanQuery::new_multiterms_query(terms); - let mut collector = FastFieldTestCollector::for_field(score_field); - assert!(searcher.search(&query, &mut collector).is_ok()); - collector.vals() + searcher.search(&query, &FastFieldTestCollector::for_field(score_field)).unwrap() }; let get_fast_vals_bytes = |terms: Vec| { let query = BooleanQuery::new_multiterms_query(terms); - let mut collector = BytesFastFieldTestCollector::for_field(bytes_score_field); searcher - .search(&query, &mut collector) - .expect("failed to search"); - collector.vals() + .search(&query, &BytesFastFieldTestCollector::for_field(bytes_score_field)) + .expect("failed to search") }; assert_eq!( get_fast_vals(vec![Term::from_field_text(text_field, "a")]), @@ -812,9 +810,10 @@ mod tests { } } + #[test] fn test_index_merger_with_deletes() { - let mut schema_builder = schema::SchemaBuilder::default(); + let mut schema_builder = schema::Schema::builder(); let text_fieldtype = schema::TextOptions::default() .set_indexing_options( TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), @@ -827,21 +826,14 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let search_term = |searcher: &Searcher, term: Term| { - let mut collector = FastFieldTestCollector::for_field(score_field); - let mut bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field); + let collector = FastFieldTestCollector::for_field(score_field); + let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field); let 
term_query = TermQuery::new(term, IndexRecordOption::Basic); - - { - let mut combined_collector = - chain().push(&mut collector).push(&mut bytes_collector); + let (scores, bytes) = searcher - .search(&term_query, &mut combined_collector) + .search(&term_query, &(collector, bytes_collector)) .unwrap(); - } - - let scores = collector.vals(); - - let mut score_bytes = Cursor::new(bytes_collector.vals()); + let mut score_bytes = Cursor::new(bytes); for &score in &scores { assert_eq!(score as u32, score_bytes.read_u32::().unwrap()); } @@ -922,10 +914,10 @@ mod tests { assert_eq!(searcher.segment_readers().len(), 2); assert_eq!(searcher.num_docs(), 3); - assert_eq!(searcher.segment_readers()[0].num_docs(), 1); - assert_eq!(searcher.segment_readers()[0].max_doc(), 3); - assert_eq!(searcher.segment_readers()[1].num_docs(), 2); - assert_eq!(searcher.segment_readers()[1].max_doc(), 4); + assert_eq!(searcher.segment_readers()[0].num_docs(), 2); + assert_eq!(searcher.segment_readers()[0].max_doc(), 4); + assert_eq!(searcher.segment_readers()[1].num_docs(), 1); + assert_eq!(searcher.segment_readers()[1].max_doc(), 3); assert_eq!( search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec @@ -959,15 +951,15 @@ mod tests { .segment_reader(0) .fast_field_reader::(score_field) .unwrap(); - assert_eq!(score_field_reader.min_value(), 1); - assert_eq!(score_field_reader.max_value(), 3); + assert_eq!(score_field_reader.min_value(), 4000); + assert_eq!(score_field_reader.max_value(), 7000); let score_field_reader = searcher .segment_reader(1) .fast_field_reader::(score_field) .unwrap(); - assert_eq!(score_field_reader.min_value(), 4000); - assert_eq!(score_field_reader.max_value(), 7000); + assert_eq!(score_field_reader.min_value(), 1); + assert_eq!(score_field_reader.max_value(), 3); } { // merging the segments @@ -1140,10 +1132,9 @@ mod tests { #[test] fn test_merge_facets() { - let mut schema_builder = schema::SchemaBuilder::default(); + let mut schema_builder = 
schema::Schema::builder(); let facet_field = schema_builder.add_facet_field("facet"); let index = Index::create_in_ram(schema_builder.build()); - use schema::Facet; { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| { @@ -1172,20 +1163,14 @@ mod tests { index_doc(&mut index_writer, &["/top/e", "/top/f"]); index_writer.commit().expect("committed"); } + index.load_searchers().unwrap(); let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| { let searcher = index.searcher(); let mut facet_collector = FacetCollector::for_field(facet_field); facet_collector.add_facet(Facet::from("/top")); - use collector::{CountCollector, MultiCollector}; - let mut count_collector = CountCollector::default(); - { - let mut multi_collectors = - MultiCollector::from(vec![&mut count_collector, &mut facet_collector]); - searcher.search(&AllQuery, &mut multi_collectors).unwrap(); - } - assert_eq!(count_collector.count(), expected_num_docs); - let facet_counts = facet_collector.harvest(); + let (count, facet_counts) = searcher.search(&AllQuery, &(Count, facet_collector)).unwrap(); + assert_eq!(count, expected_num_docs); let facets: Vec<(String, u64)> = facet_counts .get("/top") .map(|(facet, count)| (facet.to_string(), count)) @@ -1209,7 +1194,6 @@ mod tests { ("/top/f", 1), ], ); - // Merging the segments { let segment_ids = index @@ -1222,7 +1206,6 @@ mod tests { .wait() .expect("Merging failed"); index_writer.wait_merging_threads().unwrap(); - index.load_searchers().unwrap(); test_searcher( 11, @@ -1245,23 +1228,19 @@ mod tests { index_writer.delete_term(facet_term); index_writer.commit().unwrap(); index.load_searchers().unwrap(); - test_searcher( - 9, - &[ + test_searcher(9, &[ ("/top/a", 3), ("/top/b", 3), ("/top/c", 1), ("/top/d", 2), ("/top/e", 2), - ("/top/f", 1), - ], - ); + ("/top/f", 1)]); } } #[test] fn test_merge_multivalued_int_fields_all_deleted() { - let mut 
schema_builder = schema::SchemaBuilder::default(); + let mut schema_builder = schema::Schema::builder(); let int_options = IntOptions::default() .set_fast(Cardinality::MultiValues) .set_indexed(); @@ -1302,7 +1281,7 @@ mod tests { #[test] fn test_merge_multivalued_int_fields() { - let mut schema_builder = schema::SchemaBuilder::default(); + let mut schema_builder = schema::Schema::builder(); let int_options = IntOptions::default() .set_fast(Cardinality::MultiValues) .set_indexed(); @@ -1368,15 +1347,10 @@ mod tests { assert_eq!(&vals, &[17]); } - { - let segment = searcher.segment_reader(1u32); - let ff_reader = segment.multi_fast_field_reader(int_field).unwrap(); - ff_reader.get_vals(0, &mut vals); - assert_eq!(&vals, &[20]); - } + println!("{:?}", searcher.segment_readers().iter().map(|reader| reader.max_doc()).collect::>()); { - let segment = searcher.segment_reader(2u32); + let segment = searcher.segment_reader(1u32); let ff_reader = segment.multi_fast_field_reader(int_field).unwrap(); ff_reader.get_vals(0, &mut vals); assert_eq!(&vals, &[28, 27]); @@ -1385,6 +1359,13 @@ mod tests { assert_eq!(&vals, &[1_000]); } + { + let segment = searcher.segment_reader(2u32); + let ff_reader = segment.multi_fast_field_reader(int_field).unwrap(); + ff_reader.get_vals(0, &mut vals); + assert_eq!(&vals, &[20]); + } + // Merging the segments { let segment_ids = index @@ -1403,6 +1384,7 @@ mod tests { { let searcher = index.searcher(); + println!("{:?}", searcher.segment_readers().iter().map(|reader| reader.max_doc()).collect::>()); let segment = searcher.segment_reader(0u32); let ff_reader = segment.multi_fast_field_reader(int_field).unwrap(); @@ -1427,14 +1409,16 @@ mod tests { ff_reader.get_vals(6, &mut vals); assert_eq!(&vals, &[17]); - ff_reader.get_vals(7, &mut vals); - assert_eq!(&vals, &[20]); - ff_reader.get_vals(8, &mut vals); + ff_reader.get_vals(7, &mut vals); assert_eq!(&vals, &[28, 27]); - ff_reader.get_vals(9, &mut vals); + ff_reader.get_vals(8, &mut vals); 
assert_eq!(&vals, &[1_000]); + + ff_reader.get_vals(9, &mut vals); + assert_eq!(&vals, &[20]); + } } } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 1b2cd7c85..283657ffe 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -227,8 +227,26 @@ impl SegmentUpdater { if self.is_alive() { let index = &self.0.index; let directory = index.directory(); + let mut commited_segment_metas = self.0.segment_manager.committed_segment_metas(); + + // We sort segment_readers by number of documents. + // This is an heuristic to make multithreading more efficient. + // + // This is not done at the searcher level because I had a strange + // use case in which I was dealing with a large static index, + // dispatched over 5 SSD drives. + // + // A `UnionDirectory` makes it possible to read from these + // 5 different drives and creates a meta.json on the fly. + // In order to optimize the throughput, it creates a lasagna of segments + // from the different drives. + // + // Segment 1 from disk 1, Segment 1 from disk 2, etc. + commited_segment_metas.sort_by_key(|segment_meta| { + -(segment_meta.max_doc() as i32) + }); save_metas( - self.0.segment_manager.committed_segment_metas(), + commited_segment_metas, index.schema(), opstamp, commit_message, @@ -484,7 +502,7 @@ mod tests { #[test] fn test_delete_during_merge() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); diff --git a/src/lib.rs b/src/lib.rs index 12c7a105e..dee843a5e 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,7 +24,8 @@ //! # use tempdir::TempDir; //! # use tantivy::Index; //! # use tantivy::schema::*; -//! # use tantivy::collector::TopCollector; +//! # use tantivy::{Score, DocAddress}; +//! # use tantivy::collector::TopDocs; //! # use tantivy::query::QueryParser; //! # //! # fn main() { @@ -46,7 +47,7 @@ //! 
// in a compressed, row-oriented key-value store. //! // This store is useful to reconstruct the //! // documents that were selected during the search phase. -//! let mut schema_builder = SchemaBuilder::default(); +//! let mut schema_builder = Schema::builder(); //! let title = schema_builder.add_text_field("title", TEXT | STORED); //! let body = schema_builder.add_text_field("body", TEXT); //! let schema = schema_builder.build(); @@ -86,13 +87,13 @@ //! // A ticket has been opened regarding this problem. //! let query = query_parser.parse_query("sea whale")?; //! -//! let mut top_collector = TopCollector::with_limit(10); -//! searcher.search(&*query, &mut top_collector)?; +//! // Perform search. +//! // `topdocs` contains the 10 most relevant doc ids, sorted by decreasing scores... +//! let top_docs: Vec<(Score, DocAddress)> = +//! searcher.search(&query, &TopDocs::with_limit(10))?; //! -//! // Our top collector now contains the 10 -//! // most relevant doc ids... -//! let doc_addresses = top_collector.docs(); -//! for doc_address in doc_addresses { +//! for (_score, doc_address) in top_docs { +//! // Retrieve the actual content of documents given its `doc_address`. //! let retrieved_doc = searcher.doc(doc_address)?; //! println!("{}", schema.to_json(&retrieved_doc)); //! 
} @@ -129,6 +130,7 @@ extern crate base64; extern crate bit_set; extern crate bitpacking; extern crate byteorder; +extern crate scoped_pool; extern crate combine; @@ -295,6 +297,7 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId); #[cfg(test)] mod tests { + use DocAddress; use collector::tests::TestCollector; use core::SegmentReader; use docset::DocSet; @@ -345,7 +348,7 @@ mod tests { #[test] #[cfg(feature = "mmap")] fn test_indexing() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_from_tempdir(schema).unwrap(); @@ -370,7 +373,7 @@ mod tests { #[test] fn test_docfreq1() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_in_ram(schema_builder.build()); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); @@ -410,7 +413,7 @@ mod tests { #[test] fn test_fieldnorm_no_docs_with_field() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let title_field = schema_builder.add_text_field("title", TEXT); let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_in_ram(schema_builder.build()); @@ -439,7 +442,7 @@ mod tests { #[test] fn test_fieldnorm() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_in_ram(schema_builder.build()); { @@ -480,7 +483,7 @@ mod tests { #[test] fn test_delete_postings1() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let term_abcd = Term::from_field_text(text_field, "abcd"); 
let term_a = Term::from_field_text(text_field, "a"); @@ -491,42 +494,21 @@ mod tests { { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - { - // 0 - let doc = doc!(text_field=>"a b"); - index_writer.add_document(doc); - } - { - // 1 - let doc = doc!(text_field=>" a c"); - index_writer.add_document(doc); - } - { - // 2 - let doc = doc!(text_field=>" b c"); - index_writer.add_document(doc); - } - { - // 3 - let doc = doc!(text_field=>" b d"); - index_writer.add_document(doc); - } - { - index_writer.delete_term(Term::from_field_text(text_field, "c")); - } - { - index_writer.delete_term(Term::from_field_text(text_field, "a")); - } - { - // 4 - let doc = doc!(text_field=>" b c"); - index_writer.add_document(doc); - } - { - // 5 - let doc = doc!(text_field=>" a"); - index_writer.add_document(doc); - } + // 0 + index_writer.add_document(doc!(text_field=>"a b")); + // 1 + index_writer.add_document(doc!(text_field=>" a c")); + // 2 + index_writer.add_document(doc!(text_field=>" b c")); + // 3 + index_writer.add_document(doc!(text_field=>" b d")); + + index_writer.delete_term(Term::from_field_text(text_field, "c")); + index_writer.delete_term(Term::from_field_text(text_field, "a")); + // 4 + index_writer.add_document(doc!(text_field=>" b c")); + // 5 + index_writer.add_document(doc!(text_field=>" a")); index_writer.commit().unwrap(); } { @@ -561,15 +543,10 @@ mod tests { { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - { - // 0 - let doc = doc!(text_field=>"a b"); - index_writer.add_document(doc); - } - { - // 1 - index_writer.delete_term(Term::from_field_text(text_field, "c")); - } + // 0 + index_writer.add_document(doc!(text_field=>"a b")); + // 1 + index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.rollback().unwrap(); } { @@ -605,13 +582,8 @@ mod tests { { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 
40_000_000).unwrap(); - { - let doc = doc!(text_field=>"a b"); - index_writer.add_document(doc); - } - { - index_writer.delete_term(Term::from_field_text(text_field, "c")); - } + index_writer.add_document(doc!(text_field=>"a b")); + index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.rollback().unwrap(); index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.commit().unwrap(); @@ -655,7 +627,7 @@ mod tests { #[test] fn test_indexed_u64() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let field = schema_builder.add_u64_field("value", INT_INDEXED); let schema = schema_builder.build(); @@ -678,7 +650,7 @@ mod tests { #[test] fn test_indexed_i64() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let value_field = schema_builder.add_i64_field("value", INT_INDEXED); let schema = schema_builder.build(); @@ -702,7 +674,7 @@ mod tests { #[test] fn test_indexedfield_not_in_documents() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let absent_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); @@ -718,7 +690,7 @@ mod tests { #[test] fn test_delete_postings2() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -754,7 +726,7 @@ mod tests { #[test] fn test_termfreq() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -791,7 +763,7 @@ mod tests { #[test] fn test_searcher_1() { - let mut 
schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -799,18 +771,9 @@ mod tests { { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - { - let doc = doc!(text_field=>"af af af b"); - index_writer.add_document(doc); - } - { - let doc = doc!(text_field=>"a b c"); - index_writer.add_document(doc); - } - { - let doc = doc!(text_field=>"a b c d"); - index_writer.add_document(doc); - } + index_writer.add_document(doc!(text_field=>"af af af b")); + index_writer.add_document(doc!(text_field=>"a b c")); + index_writer.add_document(doc!(text_field=>"a b c d")); index_writer.commit().unwrap(); } { @@ -818,55 +781,43 @@ mod tests { let searcher = index.searcher(); let get_doc_ids = |terms: Vec| { let query = BooleanQuery::new_multiterms_query(terms); - let mut collector = TestCollector::default(); - assert!(searcher.search(&query, &mut collector).is_ok()); - collector.docs() + let topdocs = searcher.search(&query, &TestCollector).unwrap(); + topdocs.docs().to_vec() }; - { - assert_eq!( - get_doc_ids(vec![Term::from_field_text(text_field, "a")]), - vec![1, 2] - ); - } - { - assert_eq!( - get_doc_ids(vec![Term::from_field_text(text_field, "af")]), - vec![0] - ); - } - { - assert_eq!( - get_doc_ids(vec![Term::from_field_text(text_field, "b")]), - vec![0, 1, 2] - ); - } - { - assert_eq!( - get_doc_ids(vec![Term::from_field_text(text_field, "c")]), - vec![1, 2] - ); - } - { - assert_eq!( - get_doc_ids(vec![Term::from_field_text(text_field, "d")]), - vec![2] - ); - } - { - assert_eq!( - get_doc_ids(vec![ - Term::from_field_text(text_field, "b"), - Term::from_field_text(text_field, "a"), - ]), - vec![0, 1, 2] - ); - } + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "a")]), + vec![DocAddress(0, 1), DocAddress(0, 2)] + ); + assert_eq!( + 
get_doc_ids(vec![Term::from_field_text(text_field, "af")]), + vec![DocAddress(0, 0)] + ); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "b")]), + vec![DocAddress(0,0), DocAddress(0,1), DocAddress(0,2)] + ); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "c")]), + vec![DocAddress(0,1), DocAddress(0,2)] + ); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "d")]), + vec![DocAddress(0,2)] + ); + assert_eq!( + get_doc_ids(vec![ + Term::from_field_text(text_field, "b"), + Term::from_field_text(text_field, "a"), + ]), + vec![DocAddress(0,0), DocAddress(0,1), DocAddress(0,2)] + ); + } } #[test] fn test_searcher_2() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -893,7 +844,7 @@ mod tests { #[test] fn test_doc_macro() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let other_text_field = schema_builder.add_text_field("text2", TEXT); let document = doc!(text_field => "tantivy", @@ -911,7 +862,7 @@ mod tests { #[test] fn test_wrong_fast_field_type() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST); let fast_field_signed = schema_builder.add_i64_field("signed", FAST); let text_field = schema_builder.add_text_field("text", TEXT); diff --git a/src/macros.rs b/src/macros.rs index 87d4d926e..baa9dff4a 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -26,12 +26,12 @@ /// #[macro_use] /// extern crate tantivy; /// -/// use tantivy::schema::{SchemaBuilder, TEXT, FAST}; +/// use tantivy::schema::{Schema, TEXT, FAST}; /// /// //... 
/// /// # fn main() { -/// let mut schema_builder = SchemaBuilder::new(); +/// let mut schema_builder = Schema::builder(); /// let title = schema_builder.add_text_field("title", TEXT); /// let author = schema_builder.add_text_field("text", TEXT); /// let likes = schema_builder.add_u64_field("num_u64", FAST); @@ -67,11 +67,11 @@ macro_rules! doc( #[cfg(test)] mod test { - use schema::{SchemaBuilder, FAST, TEXT}; + use schema::{Schema, FAST, TEXT}; #[test] fn test_doc_basic() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let title = schema_builder.add_text_field("title", TEXT); let author = schema_builder.add_text_field("text", TEXT); let likes = schema_builder.add_u64_field("num_u64", FAST); @@ -85,7 +85,7 @@ mod test { #[test] fn test_doc_trailing_comma() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let title = schema_builder.add_text_field("title", TEXT); let author = schema_builder.add_text_field("text", TEXT); let likes = schema_builder.add_u64_field("num_u64", FAST); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 124ffaaec..70484668a 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -58,14 +58,14 @@ pub mod tests { use rand::rngs::StdRng; use schema::Field; use schema::IndexRecordOption; - use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT}; + use schema::{Document, Schema, Term, INT_INDEXED, STRING, TEXT}; use std::iter; use DocId; use Score; #[test] pub fn test_position_write() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -89,7 +89,7 @@ pub mod tests { #[test] pub fn test_skip_positions() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let title = 
schema_builder.add_text_field("title", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -164,7 +164,7 @@ pub mod tests { #[test] pub fn test_position_and_fieldnorm1() { let mut positions = Vec::new(); - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); @@ -277,7 +277,7 @@ pub mod tests { #[test] pub fn test_position_and_fieldnorm2() { let mut positions: Vec = Vec::new(); - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -318,7 +318,7 @@ pub mod tests { let num_docs = 300u32; let index = { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let value_field = schema_builder.add_u64_field("value", INT_INDEXED); let schema = schema_builder.build(); @@ -499,7 +499,7 @@ pub mod tests { Term::from_field_text(field, "d") }; pub static ref INDEX: Index = { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", STRING); let schema = schema_builder.build(); diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 776844f2a..cd7b96e02 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -630,7 +630,7 @@ mod tests { use docset::DocSet; use fst::Streamer; use schema::IndexRecordOption; - use schema::SchemaBuilder; + use schema::Schema; use schema::Term; use schema::INT_INDEXED; use DocId; @@ -707,7 +707,7 @@ mod tests { } fn build_block_postings(docs: Vec) -> BlockSegmentPostings { - let mut schema_builder = SchemaBuilder::default(); + let mut 
schema_builder = Schema::builder(); let int_field = schema_builder.add_u64_field("id", INT_INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -778,7 +778,7 @@ mod tests { #[test] fn test_reset_block_segment_postings() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let int_field = schema_builder.add_u64_field("id", INT_INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); diff --git a/src/query/all_query.rs b/src/query/all_query.rs index 4f5490ab1..e6468e2d7 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -86,12 +86,12 @@ mod tests { use super::AllQuery; use query::Query; - use schema::{SchemaBuilder, TEXT}; + use schema::{Schema, TEXT}; use Index; #[test] fn test_all_query() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index b38e6592d..3c1cf070e 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -11,7 +11,7 @@ use Result; /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight where - A: Automaton, + A: Automaton + Send + Sync + 'static, { field: Field, automaton: A, @@ -19,7 +19,7 @@ where impl AutomatonWeight where - A: Automaton, + A: Automaton + Send + Sync + 'static { /// Create a new AutomationWeight pub fn new(field: Field, automaton: A) -> AutomatonWeight { @@ -33,8 +33,7 @@ where } impl Weight for AutomatonWeight -where - A: Automaton, +where A: Automaton + Send + Sync + 'static { fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 4276720ee..b3f47ed25 100644 --- 
a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -20,9 +20,10 @@ mod tests { use query::TermQuery; use schema::*; use Index; + use DocId; fn aux_test_helper() -> (Index, Field) { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -130,9 +131,12 @@ mod tests { let matching_docs = |boolean_query: &Query| { let searcher = index.searcher(); - let mut test_collector = TestCollector::default(); - searcher.search(boolean_query, &mut test_collector).unwrap(); - test_collector.docs() + let test_docs = searcher.search(boolean_query, &TestCollector).unwrap(); + test_docs.docs() + .iter() + .cloned() + .map(|doc| doc.1) + .collect::>() }; { @@ -186,9 +190,8 @@ mod tests { let score_docs = |boolean_query: &Query| { let searcher = index.searcher(); - let mut test_collector = TestCollector::default(); - searcher.search(boolean_query, &mut test_collector).unwrap(); - test_collector.scores() + let fruit = searcher.search(boolean_query, &TestCollector).unwrap(); + fruit.scores().to_vec() }; { diff --git a/src/query/fuzzy_query.rs b/src/query/fuzzy_query.rs index 5253fa80c..1d62ab9c0 100644 --- a/src/query/fuzzy_query.rs +++ b/src/query/fuzzy_query.rs @@ -25,14 +25,14 @@ lazy_static! 
{ /// ```rust /// #[macro_use] /// extern crate tantivy; -/// use tantivy::schema::{SchemaBuilder, TEXT}; +/// use tantivy::schema::{Schema, TEXT}; /// use tantivy::{Index, Result, Term}; -/// use tantivy::collector::{CountCollector, TopCollector, chain}; +/// use tantivy::collector::{Count, TopDocs}; /// use tantivy::query::FuzzyTermQuery; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); +/// let mut schema_builder = Schema::builder(); /// let title = schema_builder.add_text_field("title", TEXT); /// let schema = schema_builder.build(); /// let index = Index::create_in_ram(schema); @@ -57,16 +57,12 @@ lazy_static! { /// let searcher = index.searcher(); /// /// { -/// let mut top_collector = TopCollector::with_limit(2); -/// let mut count_collector = CountCollector::default(); -/// { -/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector); -/// let term = Term::from_field_text(title, "Diary"); -/// let query = FuzzyTermQuery::new(term, 1, true); -/// searcher.search(&query, &mut collectors).unwrap(); -/// } -/// assert_eq!(count_collector.count(), 2); -/// assert!(top_collector.at_capacity()); +/// +/// let term = Term::from_field_text(title, "Diary"); +/// let query = FuzzyTermQuery::new(term, 1, true); +/// let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count)).unwrap(); +/// assert_eq!(count, 2); +/// assert_eq!(top_docs.len(), 2); /// } /// /// Ok(()) @@ -122,8 +118,8 @@ impl Query for FuzzyTermQuery { #[cfg(test)] mod test { use super::FuzzyTermQuery; - use collector::TopCollector; - use schema::SchemaBuilder; + use collector::TopDocs; + use schema::Schema; use schema::TEXT; use tests::assert_nearly_equals; use Index; @@ -131,7 +127,7 @@ mod test { #[test] pub fn test_fuzzy_term() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let country_field = 
schema_builder.add_text_field("country", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -148,14 +144,12 @@ mod test { index.load_searchers().unwrap(); let searcher = index.searcher(); { - let mut collector = TopCollector::with_limit(2); let term = Term::from_field_text(country_field, "japon"); let fuzzy_query = FuzzyTermQuery::new(term, 1, true); - searcher.search(&fuzzy_query, &mut collector).unwrap(); - let scored_docs = collector.top_docs(); - assert_eq!(scored_docs.len(), 1, "Expected only 1 document"); - let (score, _) = scored_docs[0]; + let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2)).unwrap(); + assert_eq!(top_docs.len(), 1, "Expected only 1 document"); + let (score, _) = top_docs[0]; assert_nearly_equals(1f32, score); } } diff --git a/src/query/mod.rs b/src/query/mod.rs index b7136c232..394aa4e2c 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -57,14 +57,14 @@ pub use self::weight::Weight; #[cfg(test)] mod tests { use Index; - use schema::{SchemaBuilder, TEXT}; + use schema::{Schema, TEXT}; use query::QueryParser; use Term; use std::collections::BTreeSet; #[test] fn test_query_terms() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 303301b0d..d374e5371 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -13,11 +13,13 @@ mod tests { use collector::tests::TestCollector; use core::Index; use error::TantivyError; - use schema::{SchemaBuilder, Term, TEXT}; + use schema::{Schema, Term, TEXT}; use tests::assert_nearly_equals; + use DocId; + use DocAddress; fn create_index(texts: &[&'static str]) -> Index { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = 
Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -47,16 +49,18 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let test_query = |texts: Vec<&str>| { - let mut test_collector = TestCollector::default(); let terms: Vec = texts .iter() .map(|text| Term::from_field_text(text_field, text)) .collect(); let phrase_query = PhraseQuery::new(terms); - searcher - .search(&phrase_query, &mut test_collector) + let test_fruits = searcher + .search(&phrase_query, &TestCollector) .expect("search should succeed"); - test_collector.docs() + test_fruits.docs() + .iter() + .map(|docaddr| docaddr.1) + .collect::>() }; assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]); assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]); @@ -67,7 +71,7 @@ mod tests { #[test] pub fn test_phrase_query_no_positions() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); use schema::IndexRecordOption; use schema::TextFieldIndexing; use schema::TextOptions; @@ -91,9 +95,9 @@ mod tests { Term::from_field_text(text_field, "a"), Term::from_field_text(text_field, "b"), ]); - let mut test_collector = TestCollector::default(); if let TantivyError::SchemaError(ref msg) = searcher - .search(&phrase_query, &mut test_collector) + .search(&phrase_query, &TestCollector) + .map(|_| ()) .unwrap_err() { assert_eq!( @@ -113,16 +117,15 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let test_query = |texts: Vec<&str>| { - let mut test_collector = TestCollector::default(); let terms: Vec = texts .iter() .map(|text| Term::from_field_text(text_field, text)) .collect(); let phrase_query = PhraseQuery::new(terms); searcher - .search(&phrase_query, &mut test_collector) - .expect("search should succeed"); - test_collector.scores() + .search(&phrase_query, &TestCollector) + .expect("search should 
succeed") + .scores().to_vec() }; let scores = test_query(vec!["a", "b"]); assert_nearly_equals(scores[0], 0.40618482); @@ -131,51 +134,39 @@ mod tests { #[test] // motivated by #234 pub fn test_phrase_query_docfreq_order() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - { - // 0 - let doc = doc!(text_field=>"b"); - index_writer.add_document(doc); - } - { - // 1 - let doc = doc!(text_field=>"a b"); - index_writer.add_document(doc); - } - { - // 2 - let doc = doc!(text_field=>"b a"); - index_writer.add_document(doc); - } + index_writer.add_document(doc!(text_field=>"b")); + index_writer.add_document(doc!(text_field=>"a b")); + index_writer.add_document(doc!(text_field=>"b a")); assert!(index_writer.commit().is_ok()); } index.load_searchers().unwrap(); let searcher = index.searcher(); let test_query = |texts: Vec<&str>| { - let mut test_collector = TestCollector::default(); let terms: Vec = texts .iter() .map(|text| Term::from_field_text(text_field, text)) .collect(); let phrase_query = PhraseQuery::new(terms); searcher - .search(&phrase_query, &mut test_collector) - .expect("search should succeed"); - test_collector.docs() + .search(&phrase_query, &TestCollector) + .expect("search should succeed") + .docs() + .to_vec() }; - assert_eq!(test_query(vec!["a", "b"]), vec![1]); - assert_eq!(test_query(vec!["b", "a"]), vec![2]); + assert_eq!(test_query(vec!["a", "b"]), vec![DocAddress(0,1)]); + assert_eq!(test_query(vec!["b", "a"]), vec![DocAddress(0,2)]); } #[test] // motivated by #234 pub fn test_phrase_query_non_trivial_offsets() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let 
schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -187,16 +178,18 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let test_query = |texts: Vec<(usize, &str)>| { - let mut test_collector = TestCollector::default(); let terms: Vec<(usize, Term)> = texts .iter() .map(|(offset, text)| (*offset, Term::from_field_text(text_field, text))) .collect(); let phrase_query = PhraseQuery::new_with_offset(terms); searcher - .search(&phrase_query, &mut test_collector) - .expect("search should succeed"); - test_collector.docs() + .search(&phrase_query, &TestCollector) + .expect("search should succeed") + .docs() + .iter() + .map(|doc_address| doc_address.1) + .collect::>() }; assert_eq!(test_query(vec![(0, "a"), (1, "b")]), vec![0]); assert_eq!(test_query(vec![(1, "b"), (0, "a")]), vec![0]); diff --git a/src/query/query.rs b/src/query/query.rs index ca7de8ca6..298ce6970 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -1,11 +1,9 @@ use super::Weight; -use collector::Collector; use core::searcher::Searcher; use downcast; use std::collections::BTreeSet; use std::fmt; use Result; -use SegmentLocalId; use Term; /// The `Query` trait defines a set of documents and a scoring method @@ -63,26 +61,6 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug { /// Extract all of the terms associated to the query and insert them in the /// term set given in arguments. fn query_terms(&self, _term_set: &mut BTreeSet) {} - - /// Search works as follows : - /// - /// First the weight object associated to the query is created. - /// - /// Then, the query loops over the segments and for each segment : - /// - setup the collector and informs it that the segment being processed has changed. - /// - creates a `Scorer` object associated for this segment - /// - iterate throw the matched documents and push them to the collector. 
- /// - fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<()> { - let scoring_enabled = collector.requires_scoring(); - let weight = self.weight(searcher, scoring_enabled)?; - for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() { - collector.set_segment(segment_ord as SegmentLocalId, segment_reader)?; - let mut scorer = weight.scorer(segment_reader)?; - scorer.collect(collector, segment_reader.delete_bitset()); - } - Ok(()) - } } pub trait QueryClone { @@ -98,6 +76,26 @@ where } } +impl Query for Box { + fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result> { + self.as_ref().weight(searcher, scoring_enabled) + } + + fn count(&self, searcher: &Searcher) -> Result { + self.as_ref().count(searcher) + } + + fn query_terms(&self, term_set: &mut BTreeSet>>) { + self.as_ref().query_terms(term_set); + } +} + +impl QueryClone for Box { + fn box_clone(&self) -> Box { + self.as_ref().box_clone() + } +} + #[allow(missing_docs)] mod downcast_impl { downcast!(super::Query); diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 231dbd8f9..97bf3a2e8 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -485,12 +485,12 @@ mod test { use query::Query; use schema::Field; use schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; - use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT}; + use schema::{Schema, Term, INT_INDEXED, STORED, STRING, TEXT}; use tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, Tokenizer, TokenizerManager}; use Index; fn make_query_parser() -> QueryParser { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field_indexing = TextFieldIndexing::default() .set_tokenizer("en_with_stop_words") .set_index_option(IndexRecordOption::WithFreqsAndPositions); @@ -721,7 +721,7 @@ mod test { #[test] pub fn 
test_unknown_tokenizer() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field_indexing = TextFieldIndexing::default() .set_tokenizer("nonexistingtokenizer") .set_index_option(IndexRecordOption::Basic); @@ -739,7 +739,7 @@ mod test { #[test] pub fn test_query_parser_no_positions() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field_indexing = TextFieldIndexing::default() .set_tokenizer("customtokenizer") .set_index_option(IndexRecordOption::Basic); diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 43da4bd8c..e5ebe896d 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -40,14 +40,13 @@ fn map_bound TTo>( /// # #[macro_use] /// # extern crate tantivy; /// # use tantivy::Index; -/// # use tantivy::schema::{SchemaBuilder, INT_INDEXED}; -/// # use tantivy::collector::CountCollector; -/// # use tantivy::query::Query; +/// # use tantivy::schema::{Schema, INT_INDEXED}; +/// # use tantivy::collector::Count; /// # use tantivy::Result; /// # use tantivy::query::RangeQuery; /// # /// # fn run() -> Result<()> { -/// # let mut schema_builder = SchemaBuilder::new(); +/// # let mut schema_builder = Schema::builder(); /// # let year_field = schema_builder.add_u64_field("year", INT_INDEXED); /// # let schema = schema_builder.build(); /// # @@ -67,10 +66,7 @@ fn map_bound TTo>( /// /// let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970); /// -/// let mut count_collector = CountCollector::default(); -/// docs_in_the_sixties.search(&searcher, &mut count_collector)?; -/// -/// let num_60s_books = count_collector.count(); +/// let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?; /// /// # assert_eq!(num_60s_books, 2285); /// # Ok(()) @@ -213,16 +209,12 @@ impl RangeQuery { /// Lower bound of range pub fn left_bound(&self) -> Bound { - map_bound(&self.left_bound, &|bytes| { - 
Term::from_field_bytes(self.field, bytes) - }) + map_bound(&self.left_bound, &|bytes| Term::from_field_bytes(self.field, bytes)) } /// Upper bound of range pub fn right_bound(&self) -> Bound { - map_bound(&self.right_bound, &|bytes| { - Term::from_field_bytes(self.field, bytes) - }) + map_bound(&self.right_bound, &|bytes| Term::from_field_bytes(self.field, bytes)) } } @@ -296,9 +288,8 @@ impl Weight for RangeWeight { mod tests { use super::RangeQuery; - use collector::CountCollector; - use query::Query; - use schema::{Document, Field, SchemaBuilder, INT_INDEXED}; + use collector::Count; + use schema::{Document, Field, Schema, INT_INDEXED}; use std::collections::Bound; use Index; use Result; @@ -306,7 +297,7 @@ mod tests { #[test] fn test_range_query_simple() { fn run() -> Result<()> { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let year_field = schema_builder.add_u64_field("year", INT_INDEXED); let schema = schema_builder.build(); @@ -327,9 +318,8 @@ mod tests { let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64); // ... or `1960..=1969` if inclusive range is enabled. 
- let mut count_collector = CountCollector::default(); - docs_in_the_sixties.search(&searcher, &mut count_collector)?; - assert_eq!(count_collector.count(), 2285); + let count = searcher.search(&docs_in_the_sixties, &Count)?; + assert_eq!(count, 2285); Ok(()) } @@ -340,7 +330,7 @@ mod tests { fn test_range_query() { let int_field: Field; let schema = { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); int_field = schema_builder.add_i64_field("intfield", INT_INDEXED); schema_builder.build() }; @@ -364,9 +354,7 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let count_multiples = |range_query: RangeQuery| { - let mut count_collector = CountCollector::default(); - range_query.search(&searcher, &mut count_collector).unwrap(); - count_collector.count() + searcher.search(&range_query, &Count).unwrap() }; assert_eq!(count_multiples(RangeQuery::new_i64(int_field, 10..11)), 9); diff --git a/src/query/regex_query.rs b/src/query/regex_query.rs index dcdd9bdff..412ea56c8 100644 --- a/src/query/regex_query.rs +++ b/src/query/regex_query.rs @@ -16,14 +16,14 @@ use Searcher; /// ```rust /// #[macro_use] /// extern crate tantivy; -/// use tantivy::schema::{SchemaBuilder, TEXT}; +/// use tantivy::schema::{Schema, TEXT}; /// use tantivy::{Index, Result, Term}; -/// use tantivy::collector::{CountCollector, TopCollector, chain}; +/// use tantivy::collector::Count; /// use tantivy::query::RegexQuery; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); +/// let mut schema_builder = Schema::builder(); /// let title = schema_builder.add_text_field("title", TEXT); /// let schema = schema_builder.build(); /// let index = Index::create_in_ram(schema); @@ -47,19 +47,10 @@ use Searcher; /// index.load_searchers()?; /// let searcher = index.searcher(); /// -/// { -/// let mut top_collector = TopCollector::with_limit(2); -/// let mut 
count_collector = CountCollector::default(); -/// { -/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector); -/// let term = Term::from_field_text(title, "Diary"); -/// let query = RegexQuery::new("d[ai]{2}ry".to_string(), title); -/// searcher.search(&query, &mut collectors).unwrap(); -/// } -/// assert_eq!(count_collector.count(), 3); -/// assert!(top_collector.at_capacity()); -/// } -/// +/// let term = Term::from_field_text(title, "Diary"); +/// let query = RegexQuery::new("d[ai]{2}ry".to_string(), title); +/// let count = searcher.search(&query, &Count)?; +/// assert_eq!(count, 3); /// Ok(()) /// } /// ``` @@ -95,15 +86,15 @@ impl Query for RegexQuery { #[cfg(test)] mod test { use super::RegexQuery; - use collector::TopCollector; - use schema::SchemaBuilder; + use collector::TopDocs; + use schema::Schema; use schema::TEXT; use tests::assert_nearly_equals; use Index; #[test] pub fn test_regex_query() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let country_field = schema_builder.add_text_field("country", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -120,20 +111,15 @@ mod test { index.load_searchers().unwrap(); let searcher = index.searcher(); { - let mut collector = TopCollector::with_limit(2); let regex_query = RegexQuery::new("jap[ao]n".to_string(), country_field); - searcher.search(®ex_query, &mut collector).unwrap(); - let scored_docs = collector.top_docs(); + let scored_docs = searcher + .search(®ex_query, &TopDocs::with_limit(2)).unwrap(); assert_eq!(scored_docs.len(), 1, "Expected only 1 document"); let (score, _) = scored_docs[0]; assert_nearly_equals(1f32, score); } - { - let mut collector = TopCollector::with_limit(2); - let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field); - searcher.search(®ex_query, &mut collector).unwrap(); - let scored_docs = collector.top_docs(); - assert_eq!(scored_docs.len(), 0, 
"Expected ZERO document"); - } + let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field); + let top_docs = searcher.search(®ex_query, &TopDocs::with_limit(2)).unwrap(); + assert!(top_docs.is_empty(), "Expected ZERO document"); } } diff --git a/src/query/scorer.rs b/src/query/scorer.rs index 2c2f0cd62..a2e40fa48 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -1,8 +1,6 @@ -use collector::Collector; use common::BitSet; use docset::{DocSet, SkipResult}; use downcast; -use fastfield::DeleteBitSet; use std::ops::DerefMut; use DocId; use Score; @@ -16,20 +14,11 @@ pub trait Scorer: downcast::Any + DocSet + 'static { /// This method will perform a bit of computation and is not cached. fn score(&mut self) -> Score; - /// Consumes the complete `DocSet` and - /// push the scored documents to the collector. - fn collect(&mut self, collector: &mut Collector, delete_bitset_opt: Option<&DeleteBitSet>) { - if let Some(delete_bitset) = delete_bitset_opt { - while self.advance() { - let doc = self.doc(); - if !delete_bitset.is_deleted(doc) { - collector.collect(doc, self.score()); - } - } - } else { - while self.advance() { - collector.collect(self.doc(), self.score()); - } + /// Iterates through all of the document matched by the DocSet + /// `DocSet` and push the scored documents to the collector. 
+ fn for_each(&mut self, callback: &mut FnMut(DocId, Score)) { + while self.advance() { + callback(self.doc(), self.score()); } } } @@ -44,9 +33,9 @@ impl Scorer for Box { self.deref_mut().score() } - fn collect(&mut self, collector: &mut Collector, delete_bitset: Option<&DeleteBitSet>) { + fn for_each(&mut self, callback: &mut FnMut(DocId, Score)) { let scorer = self.deref_mut(); - scorer.collect(collector, delete_bitset); + scorer.for_each(callback); } } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index bf5171016..e0a200eb2 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -1,4 +1,4 @@ -mod term_query; + mod term_query; mod term_scorer; mod term_weight; @@ -9,17 +9,17 @@ pub use self::term_weight::TermWeight; #[cfg(test)] mod tests { - use collector::TopCollector; use docset::DocSet; use query::{Query, QueryParser, Scorer, TermQuery}; - use schema::{IndexRecordOption, SchemaBuilder, STRING, TEXT}; + use schema::{IndexRecordOption, Schema, STRING, TEXT}; use tests::assert_nearly_equals; use Index; use Term; + use collector::TopDocs; #[test] pub fn test_term_query_no_freq() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", STRING); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -49,7 +49,7 @@ mod tests { #[test] pub fn test_term_weight() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let left_field = schema_builder.add_text_field("left", TEXT); let right_field = schema_builder.add_text_field("right", TEXT); let large_field = schema_builder.add_text_field("large", TEXT); @@ -68,37 +68,31 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); { - let mut collector = TopCollector::with_limit(2); let term = Term::from_field_text(left_field, "left2"); let term_query = TermQuery::new(term, 
IndexRecordOption::WithFreqs); - searcher.search(&term_query, &mut collector).unwrap(); - let scored_docs = collector.top_docs(); - assert_eq!(scored_docs.len(), 1); - let (score, _) = scored_docs[0]; + let topdocs = searcher.search(&term_query,&TopDocs::with_limit(2)).unwrap(); + assert_eq!(topdocs.len(), 1); + let (score, _) = topdocs[0]; assert_nearly_equals(0.77802235, score); } { - let mut collector = TopCollector::with_limit(2); let term = Term::from_field_text(left_field, "left1"); let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs); - searcher.search(&term_query, &mut collector).unwrap(); - let scored_docs = collector.top_docs(); - assert_eq!(scored_docs.len(), 2); - let (score1, _) = scored_docs[0]; + let top_docs = searcher.search(&term_query, &TopDocs::with_limit(2)).unwrap(); + assert_eq!(top_docs.len(), 2); + let (score1, _) = top_docs[0]; assert_nearly_equals(0.27101856, score1); - let (score2, _) = scored_docs[1]; + let (score2, _) = top_docs[1]; assert_nearly_equals(0.13736556, score2); } { let query_parser = QueryParser::for_index(&index, vec![]); let query = query_parser.parse_query("left:left2 left:left1").unwrap(); - let mut collector = TopCollector::with_limit(2); - searcher.search(&*query, &mut collector).unwrap(); - let scored_docs = collector.top_docs(); - assert_eq!(scored_docs.len(), 2); - let (score1, _) = scored_docs[0]; + let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap(); + assert_eq!(top_docs.len(), 2); + let (score1, _) = top_docs[0]; assert_nearly_equals(0.9153879, score1); - let (score2, _) = scored_docs[1]; + let (score2, _) = top_docs[1]; assert_nearly_equals(0.27101856, score2); } } diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index 267ca9ba7..8ddf42762 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -21,14 +21,14 @@ use Term; /// ```rust /// #[macro_use] /// extern crate tantivy; -/// use 
tantivy::schema::{SchemaBuilder, TEXT, IndexRecordOption}; +/// use tantivy::schema::{Schema, TEXT, IndexRecordOption}; /// use tantivy::{Index, Result, Term}; -/// use tantivy::collector::{CountCollector, TopCollector, chain}; +/// use tantivy::collector::{Count, TopDocs}; /// use tantivy::query::TermQuery; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); +/// let mut schema_builder = Schema::builder(); /// let title = schema_builder.add_text_field("title", TEXT); /// let schema = schema_builder.build(); /// let index = Index::create_in_ram(schema); @@ -52,20 +52,12 @@ use Term; /// index.load_searchers()?; /// let searcher = index.searcher(); /// -/// { -/// let mut top_collector = TopCollector::with_limit(2); -/// let mut count_collector = CountCollector::default(); -/// { -/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector); -/// let query = TermQuery::new( -/// Term::from_field_text(title, "diary"), -/// IndexRecordOption::Basic, -/// ); -/// searcher.search(&query, &mut collectors).unwrap(); -/// } -/// assert_eq!(count_collector.count(), 2); -/// assert!(top_collector.at_capacity()); -/// } +/// let query = TermQuery::new( +/// Term::from_field_text(title, "diary"), +/// IndexRecordOption::Basic, +/// ); +/// let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count)).unwrap(); +/// assert_eq!(count, 2); /// /// Ok(()) /// } diff --git a/src/query/weight.rs b/src/query/weight.rs index d3d8b3520..9311d2299 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -6,7 +6,7 @@ use Result; /// for a given set of segments. /// /// See [`Query`](./trait.Query.html). -pub trait Weight { +pub trait Weight: Send + Sync + 'static { /// Returns the scorer for the given segment. /// See [`Query`](./trait.Query.html). 
fn scorer(&self, reader: &SegmentReader) -> Result>; diff --git a/src/schema/document.rs b/src/schema/document.rs index 7254c9660..9e04bb107 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -161,7 +161,7 @@ mod tests { #[test] fn test_doc() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("title", TEXT); let mut doc = Document::default(); doc.add_text(text_field, "My title"); diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 1570f0af0..d3ab23158 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -27,7 +27,7 @@ directory. ``` use tantivy::schema::*; -let mut schema_builder = SchemaBuilder::default(); +let mut schema_builder = Schema::builder(); let title_options = TextOptions::default() .set_stored() .set_indexing_options(TextFieldIndexing::default() @@ -44,11 +44,11 @@ We can split the problem of generating a search result page into two phases : the search results page. (`doc_ids[] -> Document[]`) In the first phase, the ability to search for documents by the given field is determined by the -[`TextIndexingOptions`](enum.TextIndexingOptions.html) of our [`TextOptions`] -(struct.TextOptions.html). +[`TextIndexingOptions`](enum.TextIndexingOptions.html) of our +[`TextOptions`](struct.TextOptions.html). -The effect of each possible setting is described more in detail [`TextIndexingOptions`] -(enum.TextIndexingOptions.html). +The effect of each possible setting is described more in detail +[`TextIndexingOptions`](enum.TextIndexingOptions.html). On the other hand setting the field as stored or not determines whether the field should be returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called. 
@@ -62,7 +62,7 @@ The example can be rewritten : ``` use tantivy::schema::*; -let mut schema_builder = SchemaBuilder::default(); +let mut schema_builder = Schema::builder(); schema_builder.add_text_field("title_options", TEXT | STORED); let schema = schema_builder.build(); ``` @@ -75,7 +75,7 @@ let schema = schema_builder.build(); ``` use tantivy::schema::*; -let mut schema_builder = SchemaBuilder::default(); +let mut schema_builder = Schema::builder(); let num_stars_options = IntOptions::default() .set_stored() .set_indexed(); diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 179579ef4..12112e82f 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -23,13 +23,14 @@ use std::fmt; /// ``` /// use tantivy::schema::*; /// -/// let mut schema_builder = SchemaBuilder::default(); +/// let mut schema_builder = Schema::builder(); /// let id_field = schema_builder.add_text_field("id", STRING); /// let title_field = schema_builder.add_text_field("title", TEXT); /// let body_field = schema_builder.add_text_field("body", TEXT); /// let schema = schema_builder.build(); /// /// ``` +#[derive(Default)] pub struct SchemaBuilder { fields: Vec, fields_map: HashMap, @@ -120,14 +121,6 @@ impl SchemaBuilder { } } -impl Default for SchemaBuilder { - fn default() -> SchemaBuilder { - SchemaBuilder { - fields: Vec::new(), - fields_map: HashMap::new(), - } - } -} struct InnerSchema { fields: Vec, @@ -156,7 +149,7 @@ impl Eq for InnerSchema {} /// ``` /// use tantivy::schema::*; /// -/// let mut schema_builder = SchemaBuilder::default(); +/// let mut schema_builder = Schema::builder(); /// let id_field = schema_builder.add_text_field("id", STRING); /// let title_field = schema_builder.add_text_field("title", TEXT); /// let body_field = schema_builder.add_text_field("body", TEXT); @@ -182,6 +175,11 @@ impl Schema { &self.0.fields } + /// Creates a new builder. 
+ pub fn builder() -> SchemaBuilder { + SchemaBuilder::default() + } + /// Returns the field options associated with a given name. /// /// # Panics @@ -327,7 +325,7 @@ mod tests { #[test] pub fn is_indexed_test() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let field_str = schema_builder.add_text_field("field_str", STRING); let schema = schema_builder.build(); assert!(schema.get_field_entry(field_str).is_indexed()); @@ -335,7 +333,7 @@ mod tests { #[test] pub fn test_schema_serialization() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let count_options = IntOptions::default() .set_stored() .set_fast(Cardinality::SingleValue); @@ -404,7 +402,7 @@ mod tests { #[test] pub fn test_document_to_json() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let count_options = IntOptions::default() .set_stored() .set_fast(Cardinality::SingleValue); @@ -425,7 +423,7 @@ mod tests { #[test] pub fn test_parse_document() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let count_options = IntOptions::default() .set_stored() .set_fast(Cardinality::SingleValue); diff --git a/src/schema/term.rs b/src/schema/term.rs index 92c7d11ec..bb336ab45 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -201,7 +201,7 @@ mod tests { #[test] pub fn test_term() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); schema_builder.add_text_field("text", STRING); let title_field = schema_builder.add_text_field("title", STRING); let count_field = schema_builder.add_text_field("count", STRING); diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 45bae0618..2ae3fbb4f 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -141,7 +141,7 @@ mod tests { 
assert!(field_options.get_indexing_options().is_some()); } { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let field = schema.get_field("body").unwrap(); diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 2c6d3d012..4bcfc39eb 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -197,12 +197,12 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) /// # #[macro_use] /// # extern crate tantivy; /// # use tantivy::Index; -/// # use tantivy::schema::{SchemaBuilder, TEXT}; +/// # use tantivy::schema::{Schema, TEXT}; /// # use tantivy::query::QueryParser; /// use tantivy::SnippetGenerator; /// /// # fn main() -> tantivy::Result<()> { -/// # let mut schema_builder = SchemaBuilder::default(); +/// # let mut schema_builder = Schema::builder(); /// # let text_field = schema_builder.add_text_field("text", TEXT); /// # let schema = schema_builder.build(); /// # let index = Index::create_in_ram(schema); @@ -306,7 +306,7 @@ impl SnippetGenerator { mod tests { use super::{search_fragments, select_best_fragment_combination}; use query::QueryParser; - use schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions, TEXT}; + use schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT}; use std::collections::BTreeMap; use std::iter::Iterator; use tokenizer::{box_tokenizer, SimpleTokenizer}; @@ -498,7 +498,7 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_generator_term_score() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -537,7 +537,7 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_generator() { - let mut schema_builder = SchemaBuilder::default(); + let 
mut schema_builder = Schema::builder(); let text_options = TextOptions::default().set_indexing_options( TextFieldIndexing::default() .set_tokenizer("en_stem") diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 9ffd8b849..cf1bef206 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -292,7 +292,7 @@ impl FieldUsage { #[cfg(test)] mod test { use core::Index; - use schema::SchemaBuilder; + use schema::Schema; use schema::{FAST, INT_INDEXED, TEXT}; use schema::Field; use space_usage::ByteCount; @@ -302,7 +302,7 @@ mod test { #[test] fn test_empty() { - let schema = SchemaBuilder::new().build(); + let schema = Schema::builder().build(); let index = Index::create_in_ram(schema.clone()); index.load_searchers().unwrap(); @@ -322,7 +322,7 @@ mod test { #[test] fn test_fast_indexed() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); @@ -360,7 +360,7 @@ mod test { #[test] fn test_text() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let name = schema_builder.add_text_field("name", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); @@ -398,7 +398,7 @@ mod test { #[test] fn test_store() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let name = schema_builder.add_text_field("name", STORED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); @@ -436,7 +436,7 @@ mod test { #[test] fn test_deletes() { - let mut schema_builder = SchemaBuilder::new(); + let mut schema_builder = Schema::builder(); let name = schema_builder.add_u64_field("name", INT_INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); diff --git a/src/store/mod.rs 
b/src/store/mod.rs index 57930e8d8..45c966b83 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -57,11 +57,11 @@ pub mod tests { use schema::Document; use schema::FieldValue; use schema::TextOptions; - use schema::{Schema, SchemaBuilder}; + use schema::Schema; use std::path::Path; pub fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored()); let field_title = schema_builder.add_text_field("title", TextOptions::default().set_stored()); diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 54102a9f4..9bc196ea8 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -35,7 +35,7 @@ mod tests { use core::Index; use directory::{Directory, RAMDirectory, ReadOnlySource}; use postings::TermInfo; - use schema::{Document, FieldType, SchemaBuilder, TEXT}; + use schema::{Document, FieldType, Schema, TEXT}; use std::path::PathBuf; use std::str; @@ -129,7 +129,7 @@ mod tests { #[test] fn test_term_iterator() { - let mut schema_builder = SchemaBuilder::default(); + let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_in_ram(schema_builder.build()); { diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index de5ec9f00..8f9b5f621 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -9,7 +9,7 @@ //! use tantivy::schema::*; //! //! # fn main() { -//! let mut schema_builder = SchemaBuilder::new(); +//! let mut schema_builder = Schema::builder(); //! //! let text_options = TextOptions::default() //! .set_indexing_options( @@ -82,12 +82,12 @@ //! //! ``` //! # extern crate tantivy; -//! # use tantivy::schema::SchemaBuilder; +//! # use tantivy::schema::Schema; //! # use tantivy::tokenizer::*; //! # use tantivy::Index; //! # fn main() { //! 
# let custom_en_tokenizer = SimpleTokenizer; -//! # let schema = SchemaBuilder::new().build(); +//! # let schema = Schema::builder().build(); //! let index = Index::create_in_ram(schema); //! index.tokenizers() //! .register("custom_en", custom_en_tokenizer); @@ -101,12 +101,12 @@ //! //! ``` //! extern crate tantivy; -//! use tantivy::schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing}; +//! use tantivy::schema::{Schema, IndexRecordOption, TextOptions, TextFieldIndexing}; //! use tantivy::tokenizer::*; //! use tantivy::Index; //! //! # fn main() { -//! let mut schema_builder = SchemaBuilder::new(); +//! let mut schema_builder = Schema::builder(); //! let text_field_indexing = TextFieldIndexing::default() //! .set_tokenizer("custom_en") //! .set_index_option(IndexRecordOption::WithFreqsAndPositions);