From 674cae8ee269fe98451dadb30abe705834a03261 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 30 Sep 2020 17:51:11 +0900 Subject: [PATCH] Issue/822 TopDocs sorted by i64, and date fastfield (in addition to u64) (#890) * Unsatisfactory implementation. The fastfield are hit. But for performance, we want the comparison to happen on u64, and the conversion to the FastType to be done only on the selected TopK elements. For i64, the current approach might be ok. For DateTime, it is most likely catastrophic. Closes #822 * Decoupled SegmentCollector Fruit from Collector Fruit. Deferred conversion from u64 to the proper FastField type to after the overall collection. (tantivy guarantees that u64 encoding is consistent with the original ordering of the fastfield) Closes #882 --- src/collector/custom_score_top_collector.rs | 2 +- src/collector/mod.rs | 21 +- src/collector/multi_collector.rs | 6 +- src/collector/top_score_collector.rs | 331 +++++++++++++++++--- src/error.rs | 6 + src/fastfield/mod.rs | 24 +- src/schema/field_entry.rs | 3 +- 7 files changed, 340 insertions(+), 53 deletions(-) diff --git a/src/collector/custom_score_top_collector.rs b/src/collector/custom_score_top_collector.rs index 75e122578..d8037175d 100644 --- a/src/collector/custom_score_top_collector.rs +++ b/src/collector/custom_score_top_collector.rs @@ -58,10 +58,10 @@ where segment_local_id: u32, segment_reader: &SegmentReader, ) -> crate::Result { - let segment_scorer = self.custom_scorer.segment_scorer(segment_reader)?; let segment_collector = self .collector .for_segment(segment_local_id, segment_reader)?; + let segment_scorer = self.custom_scorer.segment_scorer(segment_reader)?; Ok(CustomScoreTopSegmentCollector { segment_collector, segment_scorer, diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 0ed4db23d..b47118007 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -139,7 +139,7 @@ pub trait Collector: Sync + Send { type Fruit: Fruit; /// Type of the `SegmentCollector` associated to this collector. - type Child: SegmentCollector; + type Child: SegmentCollector; /// `set_segment` is called before beginning to enumerate /// on this segment. @@ -154,7 +154,10 @@ pub trait Collector: Sync + Send { /// Combines the fruit associated to the collection of each segments /// into one fruit. - fn merge_fruits(&self, segment_fruits: Vec) -> crate::Result; + fn merge_fruits( + &self, + segment_fruits: Vec<::Fruit>, + ) -> crate::Result; /// Created a segment collector and fn collect_segment( @@ -224,11 +227,11 @@ where fn merge_fruits( &self, - children: Vec<(Left::Fruit, Right::Fruit)>, + segment_fruits: Vec<::Fruit>, ) -> crate::Result<(Left::Fruit, Right::Fruit)> { let mut left_fruits = vec![]; let mut right_fruits = vec![]; - for (left_fruit, right_fruit) in children { + for (left_fruit, right_fruit) in segment_fruits { left_fruits.push(left_fruit); right_fruits.push(right_fruit); } @@ -282,7 +285,10 @@ where self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring() } - fn merge_fruits(&self, children: Vec) -> crate::Result { + fn merge_fruits( + &self, + children: Vec<::Fruit>, + ) -> crate::Result { let mut one_fruits = vec![]; let mut two_fruits = vec![]; let mut three_fruits = vec![]; @@ -349,7 +355,10 @@ where || self.3.requires_scoring() } - fn merge_fruits(&self, children: Vec) -> crate::Result { + fn merge_fruits( + &self, + children: Vec<::Fruit>, + ) -> crate::Result { let mut one_fruits = vec![]; let mut two_fruits = vec![]; let mut three_fruits = vec![]; diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 7db5ecafa..d3cbd525a 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -34,13 +34,13 @@ impl Collector for CollectorWrapper { fn merge_fruits( &self, - children: Vec<::Fruit>, + children: Vec<::Fruit>, ) -> crate::Result> { - let typed_fruit: Vec = children + let typed_fruit: Vec<::Fruit> = children .into_iter() .map(|untyped_fruit| { untyped_fruit - .downcast::() + .downcast::<::Fruit>() .map(|boxed_but_typed| *boxed_but_typed) .map_err(|_| { TantivyError::InvalidArgument("Failed to cast child fruit.".to_string()) diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 1c00a3543..f1af215ee 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -1,6 +1,4 @@ use super::Collector; -use crate::collector::custom_score_top_collector::CustomScoreTopCollector; -use crate::collector::top_collector::TopSegmentCollector; use crate::collector::top_collector::{ComparableDoc, TopCollector}; use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector; use crate::collector::{ @@ -14,8 +12,71 @@ use crate::DocId; use crate::Score; use crate::SegmentLocalId; use crate::SegmentReader; -use std::collections::BinaryHeap; +use crate::{collector::custom_score_top_collector::CustomScoreTopCollector, fastfield::FastValue}; +use crate::{collector::top_collector::TopSegmentCollector, TantivyError}; use std::fmt; +use std::{collections::BinaryHeap, marker::PhantomData}; + +struct FastFieldConvertCollector< + TCollector: Collector>, + TFastValue: FastValue, +> { + pub collector: TCollector, + pub field: Field, + pub fast_value: std::marker::PhantomData, +} + +impl Collector for FastFieldConvertCollector +where + TCollector: Collector>, + TFastValue: FastValue + 'static, +{ + type Fruit = Vec<(TFastValue, DocAddress)>; + + type Child = TCollector::Child; + + fn for_segment( + &self, + segment_local_id: crate::SegmentLocalId, + segment: &SegmentReader, + ) -> crate::Result { + let schema = segment.schema(); + let field_entry = schema.get_field_entry(self.field); + if !field_entry.is_fast() { + return Err(TantivyError::SchemaError(format!( + "Field {:?} is not a fast field.", + field_entry.name() + ))); + } + let schema_type = TFastValue::to_type(); + let requested_type = field_entry.field_type().value_type(); + if schema_type != requested_type { + return Err(TantivyError::SchemaError(format!( + "Field {:?} is of type {:?}!={:?}", + field_entry.name(), + schema_type, + requested_type + ))); + } + self.collector.for_segment(segment_local_id, segment) + } + + fn requires_scoring(&self) -> bool { + self.collector.requires_scoring() + } + + fn merge_fruits( + &self, + segment_fruits: Vec<::Fruit>, + ) -> crate::Result { + let raw_result = self.collector.merge_fruits(segment_fruits)?; + let transformed_result = raw_result + .into_iter() + .map(|(score, doc_address)| (TFastValue::from_u64(score), doc_address)) + .collect::>(); + Ok(transformed_result) + } +} /// The `TopDocs` collector keeps track of the top `K` documents /// sorted by their score. @@ -73,7 +134,7 @@ struct ScorerByFastFieldReader { impl CustomSegmentScorer for ScorerByFastFieldReader { fn score(&mut self, doc: DocId) -> u64 { - self.ff_reader.get_u64(u64::from(doc)) + self.ff_reader.get(doc) } } @@ -87,10 +148,10 @@ impl CustomScorer for ScorerByField { fn segment_scorer(&self, segment_reader: &SegmentReader) -> crate::Result { let ff_reader = segment_reader .fast_fields() - .u64(self.field) + .u64_lenient(self.field) .ok_or_else(|| { crate::TantivyError::SchemaError(format!( - "Field requested ({:?}) is not a i64/u64 fast field.", + "Field requested ({:?}) is not a fast field.", self.field )) })?; @@ -112,6 +173,8 @@ impl TopDocs { /// This is equivalent to `OFFSET` in MySQL or PostgreSQL and `start` in /// Lucene's TopDocsCollector. /// + /// # Example + /// /// ```rust /// use tantivy::collector::TopDocs; /// use tantivy::query::QueryParser; @@ -148,6 +211,14 @@ impl TopDocs { /// Set top-K to rank documents by a given fast field. /// + /// If the field is not a fast or does not exist, this method returns successfully (it is not aware of any schema). + /// An error will be returned at the moment of search. + /// + /// If the field is a FAST field but not a u64 field, search will return successfully but it will return + /// returns a monotonic u64-representation (ie. the order is still correct) of the requested field type. + /// + /// # Example + /// /// ```rust /// # use tantivy::schema::{Schema, FAST, TEXT}; /// # use tantivy::{doc, Index, DocAddress}; @@ -169,7 +240,7 @@ impl TopDocs { /// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64)); /// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64)); /// # assert!(index_writer.commit().is_ok()); - /// # let reader = index.reader().unwrap(); + /// # let reader = index.reader()?; /// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?; /// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?; /// # assert_eq!(top_docs, @@ -177,25 +248,20 @@ impl TopDocs { /// # (80u64, DocAddress(0u32, 3))]); /// # Ok(()) /// # } - /// - /// /// /// Searches the document matching the given query, and /// /// collects the top 10 documents, order by the u64-`field` /// /// given in argument. - /// /// - /// /// `field` is required to be a FAST field. /// fn docs_sorted_by_rating(searcher: &Searcher, /// query: &dyn Query, - /// sort_by_field: Field) + /// rating_field: Field) /// -> tantivy::Result> { /// /// // This is where we build our topdocs collector /// // - /// // Note the generics parameter that needs to match the - /// // type `sort_by_field`. - /// let top_docs_by_rating = TopDocs + /// // Note the `rating_field` needs to be a FAST field here. + /// let top_books_by_rating = TopDocs /// ::with_limit(10) - /// .order_by_u64_field(sort_by_field); + /// .order_by_u64_field(rating_field); /// /// // ... and here are our documents. Note this is a simple vec. /// // The `u64` in the pair is the value of our fast field for @@ -205,21 +271,105 @@ impl TopDocs { /// // length of 10, or less if not enough documents matched the /// // query. /// let resulting_docs: Vec<(u64, DocAddress)> = - /// searcher.search(query, &top_docs_by_rating)?; + /// searcher.search(query, &top_books_by_rating)?; /// /// Ok(resulting_docs) /// } /// ``` /// - /// # Panics - /// - /// May panic if the field requested is not a fast field. - /// + /// # See also + /// + /// To confortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to + /// [.order_by_fast_field(...)](#method.order_by_fast_field) method. pub fn order_by_u64_field( self, field: Field, ) -> impl Collector> { - self.custom_score(ScorerByField { field }) + CustomScoreTopCollector::new(ScorerByField { field }, self.0.into_tscore()) + } + + /// Set top-K to rank documents by a given fast field. + /// + /// If the field is not a fast field, or its field type does not match the generic type, this method does not panic, + /// but an explicit error will be returned at the moment of collection. + /// + /// Note that this method is a generic. The requested fast field type will be often + /// inferred in your code by the rust compiler. + /// + /// Implementation-wise, for performance reason, tantivy will manipulate the u64 representation of your fast + /// field until the last moment. + /// + /// # Example + /// + /// ```rust + /// # use tantivy::schema::{Schema, FAST, TEXT}; + /// # use tantivy::{doc, Index, DocAddress}; + /// # use tantivy::query::{Query, AllQuery}; + /// use tantivy::Searcher; + /// use tantivy::collector::TopDocs; + /// use tantivy::schema::Field; + /// + /// # fn main() -> tantivy::Result<()> { + /// # let mut schema_builder = Schema::builder(); + /// # let title = schema_builder.add_text_field("company", TEXT); + /// # let rating = schema_builder.add_i64_field("revenue", FAST); + /// # let schema = schema_builder.build(); + /// # + /// # let index = Index::create_in_ram(schema); + /// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; + /// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64)); + /// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64)); + /// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64)); + /// # assert!(index_writer.commit().is_ok()); + /// # let reader = index.reader()?; + /// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?; + /// # assert_eq!(top_docs, + /// # vec![(119_000_000i64, DocAddress(0, 1)), + /// # (92_000_000i64, DocAddress(0, 0))]); + /// # Ok(()) + /// # } + /// /// Searches the document matching the given query, and + /// /// collects the top 10 documents, order by the u64-`field` + /// /// given in argument. + /// fn docs_sorted_by_revenue(searcher: &Searcher, + /// query: &dyn Query, + /// revenue_field: Field) + /// -> tantivy::Result> { + /// + /// // This is where we build our topdocs collector + /// // + /// // Note the generics parameter that needs to match the + /// // type `sort_by_field`. revenue_field here is a FAST i64 field. + /// let top_company_by_revenue = TopDocs + /// ::with_limit(2) + /// .order_by_fast_field(revenue_field); + /// + /// // ... and here are our documents. Note this is a simple vec. + /// // The `i64` in the pair is the value of our fast field for + /// // each documents. + /// // + /// // The vec is sorted decreasingly by `sort_by_field`, and has a + /// // length of 10, or less if not enough documents matched the + /// // query. + /// let resulting_docs: Vec<(i64, DocAddress)> = + /// searcher.search(query, &top_company_by_revenue)?; + /// + /// Ok(resulting_docs) + /// } + /// ``` + pub fn order_by_fast_field( + self, + fast_field: Field, + ) -> impl Collector> + where + TFastValue: FastValue + 'static, + { + let u64_collector = self.order_by_u64_field(fast_field); + FastFieldConvertCollector { + collector: u64_collector, + field: fast_field, + fast_value: PhantomData, + } } /// Ranks the documents using a custom score. @@ -722,6 +872,94 @@ mod tests { ); } + #[test] + fn test_top_field_collector_datetime() -> crate::Result<()> { + use std::str::FromStr; + let mut schema_builder = Schema::builder(); + let name = schema_builder.add_text_field("name", TEXT); + let birthday = schema_builder.add_date_field("birthday", FAST); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests()?; + let pr_birthday = crate::DateTime::from_str("1898-04-09T00:00:00+00:00")?; + index_writer.add_document(doc!( + name => "Paul Robeson", + birthday => pr_birthday + )); + let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?; + index_writer.add_document(doc!( + name => "Minnie Riperton", + birthday => mr_birthday + )); + index_writer.commit()?; + let searcher = index.reader()?.searcher(); + let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday); + let top_docs: Vec<(crate::DateTime, DocAddress)> = + searcher.search(&AllQuery, &top_collector)?; + assert_eq!( + &top_docs[..], + &[ + (mr_birthday, DocAddress(0, 1)), + (pr_birthday, DocAddress(0, 0)), + ] + ); + Ok(()) + } + + #[test] + fn test_top_field_collector_i64() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let city = schema_builder.add_text_field("city", TEXT); + let altitude = schema_builder.add_i64_field("altitude", FAST); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!( + city => "georgetown", + altitude => -1i64, + )); + index_writer.add_document(doc!( + city => "tokyo", + altitude => 40i64, + )); + index_writer.commit()?; + let searcher = index.reader()?.searcher(); + let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude); + let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?; + assert_eq!( + &top_docs[..], + &[(40i64, DocAddress(0, 1)), (-1i64, DocAddress(0, 0)),] + ); + Ok(()) + } + + #[test] + fn test_top_field_collector_f64() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let city = schema_builder.add_text_field("city", TEXT); + let altitude = schema_builder.add_f64_field("altitude", FAST); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!( + city => "georgetown", + altitude => -1.0f64, + )); + index_writer.add_document(doc!( + city => "tokyo", + altitude => 40f64, + )); + index_writer.commit()?; + let searcher = index.reader()?.searcher(); + let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude); + let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?; + assert_eq!( + &top_docs[..], + &[(40f64, DocAddress(0, 1)), (-1.0f64, DocAddress(0, 0)),] + ); + Ok(()) + } + #[test] #[should_panic] fn test_field_does_not_exist() { @@ -744,29 +982,41 @@ mod tests { } #[test] - fn test_field_not_fast_field() { + fn test_field_not_fast_field() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let title = schema_builder.add_text_field(TITLE, TEXT); let size = schema_builder.add_u64_field(SIZE, STORED); let schema = schema_builder.build(); - let (index, _) = index("beer", title, schema, |index_writer| { - index_writer.add_document(doc!( - title => "bottle of beer", - size => 12u64, - )); - }); - let searcher = index.reader().unwrap().searcher(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!(size=>1u64)); + index_writer.commit()?; + let searcher = index.reader()?.searcher(); let segment = searcher.segment_reader(0); let top_collector = TopDocs::with_limit(4).order_by_u64_field(size); - let err = top_collector.for_segment(0, segment); - if let Err(crate::TantivyError::SchemaError(msg)) = err { - assert_eq!( - msg, - "Field requested (Field(1)) is not a i64/u64 fast field." - ); - } else { - assert!(false); - } + let err = top_collector.for_segment(0, segment).err().unwrap(); + assert!( + matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field requested (Field(0)) is not a fast field.") + ); + Ok(()) + } + + #[test] + fn test_field_wrong_type() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let size = schema_builder.add_u64_field(SIZE, STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!(size=>1u64)); + index_writer.commit()?; + let searcher = index.reader()?.searcher(); + let segment = searcher.segment_reader(0); + let top_collector = TopDocs::with_limit(4).order_by_fast_field::(size); + let err = top_collector.for_segment(0, segment).err().unwrap(); + assert!( + matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field \"size\" is not a fast field.") + ); + Ok(()) } #[test] @@ -820,7 +1070,6 @@ mod tests { mut doc_adder: impl FnMut(&mut IndexWriter) -> (), ) -> (Index, Box) { let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); doc_adder(&mut index_writer); index_writer.commit().unwrap(); diff --git a/src/error.rs b/src/error.rs index a90ad3d0f..9502acc44 100644 --- a/src/error.rs +++ b/src/error.rs @@ -117,6 +117,12 @@ impl From> for TantivyError { } } +impl From for TantivyError { + fn from(err: chrono::ParseError) -> TantivyError { + TantivyError::InvalidArgument(err.to_string()) + } +} + impl From for TantivyError { fn from(error: schema::DocParsingError) -> TantivyError { TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error)) diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index ed64c2026..2e7808225 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -33,11 +33,14 @@ pub use self::reader::FastFieldReader; pub use self::readers::FastFieldReaders; pub use self::serializer::FastFieldSerializer; pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; -use crate::chrono::{NaiveDateTime, Utc}; use crate::common; use crate::schema::Cardinality; use crate::schema::FieldType; use crate::schema::Value; +use crate::{ + chrono::{NaiveDateTime, Utc}, + schema::Type, +}; mod bytes; mod delete; @@ -76,6 +79,9 @@ pub trait FastValue: Clone + Copy + Send + Sync + PartialOrd { fn make_zero() -> Self { Self::from_u64(0i64.to_u64()) } + + /// Returns the `schema::Type` for this FastValue. + fn to_type() -> Type; } impl FastValue for u64 { @@ -98,6 +104,10 @@ impl FastValue for u64 { fn as_u64(&self) -> u64 { *self } + + fn to_type() -> Type { + Type::U64 + } } impl FastValue for i64 { @@ -119,6 +129,10 @@ impl FastValue for i64 { fn as_u64(&self) -> u64 { *self as u64 } + + fn to_type() -> Type { + Type::I64 + } } impl FastValue for f64 { @@ -140,6 +154,10 @@ impl FastValue for f64 { fn as_u64(&self) -> u64 { self.to_bits() } + + fn to_type() -> Type { + Type::F64 + } } impl FastValue for crate::DateTime { @@ -162,6 +180,10 @@ impl FastValue for crate::DateTime { fn as_u64(&self) -> u64 { self.timestamp().as_u64() } + + fn to_type() -> Type { + Type::Date + } } fn value_to_u64(value: &Value) -> u64 { diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index d2fe73f26..05d184cda 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -113,10 +113,11 @@ impl FieldEntry { } /// Returns true iff the field is a int (signed or unsigned) fast field - pub fn is_int_fast(&self) -> bool { + pub fn is_fast(&self) -> bool { match self.field_type { FieldType::U64(ref options) | FieldType::I64(ref options) + | FieldType::Date(ref options) | FieldType::F64(ref options) => options.is_fast(), _ => false, }