From 01e5a227592ba1bad95d246d5e8a9d5d14573c4a Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 14 Feb 2023 15:57:32 +0800 Subject: [PATCH] switch to new ff api (#1868) --- examples/custom_collector.rs | 18 +++--- examples/warmer.rs | 5 +- src/collector/tests.rs | 9 +-- src/collector/top_score_collector.rs | 9 +-- src/fastfield/mod.rs | 84 ++++++++++++++++++++----- src/fastfield/readers.rs | 46 ++++++++------ src/indexer/index_writer.rs | 18 ++++-- src/indexer/merger_sorted_index_test.rs | 16 ++--- src/lib.rs | 6 +- 9 files changed, 140 insertions(+), 71 deletions(-) diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs index 4c1d7e455..60cb3dea5 100644 --- a/examples/custom_collector.rs +++ b/examples/custom_collector.rs @@ -7,9 +7,7 @@ // Of course, you can have a look at the tantivy's built-in collectors // such as the `CountCollector` for more examples. -use std::sync::Arc; - -use columnar::column_values::ColumnValues; +use columnar::Column; // --- // Importing tantivy... use tantivy::collector::{Collector, SegmentCollector}; @@ -97,7 +95,7 @@ impl Collector for StatsCollector { } struct StatsSegmentCollector { - fast_field_reader: Arc, + fast_field_reader: Column, stats: Stats, } @@ -105,10 +103,14 @@ impl SegmentCollector for StatsSegmentCollector { type Fruit = Option; fn collect(&mut self, doc: u32, _score: Score) { - let value = self.fast_field_reader.get_val(doc) as f64; - self.stats.count += 1; - self.stats.sum += value; - self.stats.squared_sum += value * value; + // Since we know the values are single value, we could call `first_or_default_col` on the + // column and fetch single values. + for value in self.fast_field_reader.values(doc) { + let value = value as f64; + self.stats.count += 1; + self.stats.sum += value; + self.stats.squared_sum += value * value; + } } fn harvest(self) -> ::Fruit { diff --git a/examples/warmer.rs b/examples/warmer.rs index 1fbdb7f6c..1cf64c028 100644 --- a/examples/warmer.rs +++ b/examples/warmer.rs @@ -48,7 +48,10 @@ impl Warmer for DynamicPriceColumn { fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> { for segment in searcher.segment_readers() { let key = (segment.segment_id(), segment.delete_opstamp()); - let product_id_reader = segment.fast_fields().u64(&self.field)?; + let product_id_reader = segment + .fast_fields() + .u64(&self.field)? + .first_or_default_col(0); let product_ids: Vec = segment .doc_ids_alive() .map(|doc| product_id_reader.get_val(doc)) diff --git a/src/collector/tests.rs b/src/collector/tests.rs index e24d34f09..5efdf2105 100644 --- a/src/collector/tests.rs +++ b/src/collector/tests.rs @@ -1,6 +1,4 @@ -use std::sync::Arc; - -use columnar::{BytesColumn, ColumnValues}; +use columnar::{BytesColumn, Column}; use super::*; use crate::collector::{Count, FilterCollector, TopDocs}; @@ -160,7 +158,7 @@ pub struct FastFieldTestCollector { pub struct FastFieldSegmentCollector { vals: Vec, - reader: Arc, + reader: Column, } impl FastFieldTestCollector { @@ -203,8 +201,7 @@ impl SegmentCollector for FastFieldSegmentCollector { type Fruit = Vec; fn collect(&mut self, doc: DocId, _score: Score) { - let val = self.reader.get_val(doc); - self.vals.push(val); + self.vals.extend(self.reader.values(doc)); } fn harvest(self) -> Vec { diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 243369e81..65bae73b9 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -457,9 +457,10 @@ impl TopDocs { /// // Typically, fast_fields. /// // /// // In our case, we will get a reader for the popularity - /// // fast field. + /// // fast field. For simplicity we read the first or default value in the fast + /// // field. /// let popularity_reader = - /// segment_reader.fast_fields().u64("popularity").unwrap(); + /// segment_reader.fast_fields().u64("popularity").unwrap().first_or_default_col(0); /// /// // We can now define our actual scoring function /// move |doc: DocId, original_score: Score| { @@ -566,9 +567,9 @@ impl TopDocs { /// // Note that this is implemented by using a `(u64, u64)` /// // as a score. /// let popularity_reader = - /// segment_reader.fast_fields().u64("popularity").unwrap(); + /// segment_reader.fast_fields().u64("popularity").unwrap().first_or_default_col(0); /// let boosted_reader = - /// segment_reader.fast_fields().u64("boosted").unwrap(); + /// segment_reader.fast_fields().u64("boosted").unwrap().first_or_default_col(0); /// /// // We can now define our actual scoring function /// move |doc: DocId| { diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index cd3eadc01..f35498caf 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -160,7 +160,10 @@ mod tests { assert_eq!(file.len(), 161); let fast_field_readers = FastFieldReaders::open(file).unwrap(); - let column = fast_field_readers.u64("field").unwrap(); + let column = fast_field_readers + .u64("field") + .unwrap() + .first_or_default_col(0); assert_eq!(column.get_val(0), 13u64); assert_eq!(column.get_val(1), 14u64); assert_eq!(column.get_val(2), 2u64); @@ -207,7 +210,10 @@ mod tests { let file = directory.open_read(path).unwrap(); assert_eq!(file.len(), 189); let fast_field_readers = FastFieldReaders::open(file).unwrap(); - let col = fast_field_readers.u64("field").unwrap(); + let col = fast_field_readers + .u64("field") + .unwrap() + .first_or_default_col(0); assert_eq!(col.get_val(0), 4u64); assert_eq!(col.get_val(1), 14_082_001u64); assert_eq!(col.get_val(2), 3_052u64); @@ -237,7 +243,10 @@ mod tests { let file = directory.open_read(path).unwrap(); assert_eq!(file.len(), 162); let fast_field_readers = FastFieldReaders::open(file).unwrap(); - let fast_field_reader = fast_field_readers.u64("field").unwrap(); + let fast_field_reader = fast_field_readers + .u64("field") + .unwrap() + .first_or_default_col(0); for doc in 0..10_000 { assert_eq!(fast_field_reader.get_val(doc), 100_000u64); } @@ -267,7 +276,10 @@ mod tests { assert_eq!(file.len(), 4557); { let fast_field_readers = FastFieldReaders::open(file).unwrap(); - let col = fast_field_readers.u64("field").unwrap(); + let col = fast_field_readers + .u64("field") + .unwrap() + .first_or_default_col(0); for doc in 1..10_000 { assert_eq!(col.get_val(doc), 5_000_000_000_000_000_000u64 + doc as u64); } @@ -298,7 +310,10 @@ mod tests { { let fast_field_readers = FastFieldReaders::open(file).unwrap(); - let col = fast_field_readers.i64("field").unwrap(); + let col = fast_field_readers + .i64("field") + .unwrap() + .first_or_default_col(0); assert_eq!(col.min_value(), -100i64); assert_eq!(col.max_value(), 9_999i64); for (doc, i) in (-100i64..10_000i64).enumerate() { @@ -333,7 +348,18 @@ mod tests { let file = directory.open_read(path).unwrap(); let fast_field_readers = FastFieldReaders::open(file).unwrap(); let col = fast_field_readers.i64("field").unwrap(); - assert_eq!(col.get_val(0), 0i64); + assert_eq!(col.first(0), None); + + let col = fast_field_readers + .i64("field") + .unwrap() + .first_or_default_col(0); + assert_eq!(col.get_val(0), 0); + let col = fast_field_readers + .i64("field") + .unwrap() + .first_or_default_col(-100); + assert_eq!(col.get_val(0), -100); } #[test] @@ -354,7 +380,10 @@ mod tests { let file = directory.open_read(path).unwrap(); let fast_field_readers = FastFieldReaders::open(file).unwrap(); - let col = fast_field_readers.date("date").unwrap(); + let col = fast_field_readers + .date("date") + .unwrap() + .first_or_default_col(DateTime::default()); assert_eq!(col.get_val(0), DateTime::default()); } @@ -387,7 +416,10 @@ mod tests { } let file = directory.open_read(path).unwrap(); let fast_field_readers = FastFieldReaders::open(file).unwrap(); - let col = fast_field_readers.u64("field").unwrap(); + let col = fast_field_readers + .u64("field") + .unwrap() + .first_or_default_col(0); for a in 0..n { assert_eq!(col.get_val(a as u32), permutation[a]); } @@ -771,10 +803,10 @@ mod tests { assert_eq!(file.len(), 175); let fast_field_readers = FastFieldReaders::open(file).unwrap(); let bool_col = fast_field_readers.bool("field_bool").unwrap(); - assert_eq!(bool_col.get_val(0), true); - assert_eq!(bool_col.get_val(1), false); - assert_eq!(bool_col.get_val(2), true); - assert_eq!(bool_col.get_val(3), false); + assert_eq!(bool_col.first(0), Some(true)); + assert_eq!(bool_col.first(1), Some(false)); + assert_eq!(bool_col.first(2), Some(true)); + assert_eq!(bool_col.first(3), Some(false)); } #[test] @@ -804,8 +836,8 @@ mod tests { let readers = FastFieldReaders::open(file).unwrap(); let bool_col = readers.bool("field_bool").unwrap(); for i in 0..25 { - assert_eq!(bool_col.get_val(i * 2), true); - assert_eq!(bool_col.get_val(i * 2 + 1), false); + assert_eq!(bool_col.first(i * 2), Some(true)); + assert_eq!(bool_col.first(i * 2 + 1), Some(false)); } } @@ -828,7 +860,17 @@ mod tests { assert_eq!(file.len(), 177); let fastfield_readers = FastFieldReaders::open(file).unwrap(); let col = fastfield_readers.bool("field_bool").unwrap(); + assert_eq!(col.first(0), None); + let col = fastfield_readers + .bool("field_bool") + .unwrap() + .first_or_default_col(false); assert_eq!(col.get_val(0), false); + let col = fastfield_readers + .bool("field_bool") + .unwrap() + .first_or_default_col(true); + assert_eq!(col.get_val(0), true); } fn get_index(docs: &[crate::Document], schema: &Schema) -> crate::Result { @@ -882,7 +924,7 @@ mod tests { let col = readers.date("field").unwrap(); for (i, time) in times.iter().enumerate() { - let dt: DateTime = col.get_val(i as u32).into(); + let dt: DateTime = col.first(i as u32).unwrap().into(); assert_eq!(dt, time.truncate(precision)); } readers.column_num_bytes("field").unwrap() @@ -918,7 +960,11 @@ mod tests { let reader = index.reader().unwrap(); let searcher = reader.searcher(); let segment = &searcher.segment_readers()[0]; - let field = segment.fast_fields().u64("url_norm_hash").unwrap(); + let field = segment + .fast_fields() + .u64("url_norm_hash") + .unwrap() + .first_or_default_col(0); let numbers = vec![100, 200, 300]; let test_range = |range: RangeInclusive| { @@ -988,7 +1034,11 @@ mod tests { let reader = index.reader().unwrap(); let searcher = reader.searcher(); let segment = &searcher.segment_readers()[0]; - let field = segment.fast_fields().u64("url_norm_hash").unwrap(); + let field = segment + .fast_fields() + .u64("url_norm_hash") + .unwrap() + .first_or_default_col(0); let numbers = vec![1000, 1001, 1003]; let test_range = |range: RangeInclusive| { diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 8de4406af..d0cd2fcef 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -81,39 +81,49 @@ impl FastFieldReaders { /// - Rows with no value are associated with the default value. /// - Rows with several values are associated with the first value. pub fn column_first_or_default(&self, field: &str) -> crate::Result>> + where + T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + 'static, + DynamicColumn: Into>>, + { + let col: Column = self.column(field)?; + Ok(col.first_or_default_col(T::default_value())) + } + + /// Returns a typed column associated to a given field name. + /// + /// Returns an error if no column associated with that field_name exists. + pub fn column(&self, field: &str) -> crate::Result> where T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + 'static, DynamicColumn: Into>>, { let col_opt: Option> = self.column_opt(field)?; - if let Some(col) = col_opt { - Ok(col.first_or_default_col(T::default_value())) - } else { - Err(crate::TantivyError::SchemaError(format!( + col_opt.ok_or_else(|| { + crate::TantivyError::SchemaError(format!( "Field `{field}` is missing or is not configured as a fast field." - ))) - } + )) + }) } /// Returns the `u64` fast field reader reader associated with `field`. /// /// If `field` is not a u64 fast field, this method returns an Error. - pub fn u64(&self, field: &str) -> crate::Result>> { - self.column_first_or_default(field) + pub fn u64(&self, field: &str) -> crate::Result> { + self.column(field) } /// Returns the `date` fast field reader reader associated with `field`. /// /// If `field` is not a date fast field, this method returns an Error. - pub fn date(&self, field: &str) -> crate::Result>> { - self.column_first_or_default(field) + pub fn date(&self, field: &str) -> crate::Result> { + self.column(field) } /// Returns the `ip` fast field reader reader associated to `field`. /// /// If `field` is not a u128 fast field, this method returns an Error. - pub fn ip_addr(&self, field: &str) -> crate::Result>> { - self.column_first_or_default(field) + pub fn ip_addr(&self, field: &str) -> crate::Result> { + self.column(field) } /// Returns a `str` column. @@ -165,21 +175,21 @@ impl FastFieldReaders { /// Returns the `i64` fast field reader reader associated with `field`. /// /// If `field` is not a i64 fast field, this method returns an Error. - pub fn i64(&self, field_name: &str) -> crate::Result>> { - self.column_first_or_default(field_name) + pub fn i64(&self, field_name: &str) -> crate::Result> { + self.column(field_name) } /// Returns the `f64` fast field reader reader associated with `field`. /// /// If `field` is not a f64 fast field, this method returns an Error. - pub fn f64(&self, field_name: &str) -> crate::Result>> { - self.column_first_or_default(field_name) + pub fn f64(&self, field_name: &str) -> crate::Result> { + self.column(field_name) } /// Returns the `bool` fast field reader reader associated with `field`. /// /// If `field` is not a bool fast field, this method returns an Error. - pub fn bool(&self, field_name: &str) -> crate::Result>> { - self.column_first_or_default(field_name) + pub fn bool(&self, field_name: &str) -> crate::Result> { + self.column(field_name) } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 26874838d..2a4d4f5da 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -802,6 +802,7 @@ mod tests { use std::net::Ipv6Addr; use columnar::{Cardinality, Column, MonotonicallyMappableToU128}; + use itertools::Itertools; use proptest::prop_oneof; use proptest::strategy::Strategy; @@ -1486,9 +1487,10 @@ mod tests { assert_eq!(segment_reader.num_docs(), 8); assert_eq!(segment_reader.max_doc(), 10); let fast_field_reader = segment_reader.fast_fields().u64("id")?; + let in_order_alive_ids: Vec = segment_reader .doc_ids_alive() - .map(|doc| fast_field_reader.get_val(doc)) + .flat_map(|doc| fast_field_reader.values(doc)) .collect(); assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]); Ok(()) @@ -1548,7 +1550,7 @@ mod tests { let fast_field_reader = segment_reader.fast_fields().u64("id")?; let in_order_alive_ids: Vec = segment_reader .doc_ids_alive() - .map(|doc| fast_field_reader.get_val(doc)) + .flat_map(|doc| fast_field_reader.values(doc)) .collect(); assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]); Ok(()) @@ -1793,7 +1795,7 @@ mod tests { let ff_reader = segment_reader.fast_fields().u64("id").unwrap(); segment_reader .doc_ids_alive() - .map(move |doc| ff_reader.get_val(doc)) + .flat_map(move |doc| ff_reader.values(doc).collect_vec().into_iter()) }) .collect(); @@ -1804,7 +1806,7 @@ mod tests { let ff_reader = segment_reader.fast_fields().u64("id").unwrap(); segment_reader .doc_ids_alive() - .map(move |doc| ff_reader.get_val(doc)) + .flat_map(move |doc| ff_reader.values(doc).collect_vec().into_iter()) }) .collect(); @@ -1936,7 +1938,7 @@ mod tests { let vals: Vec = ff_reader.values(doc).collect(); assert_eq!(vals.len(), 2); assert_eq!(vals[0], vals[1]); - assert_eq!(id_reader.get_val(doc), vals[0]); + assert_eq!(id_reader.first(doc), Some(vals[0])); let bool_vals: Vec = bool_ff_reader.values(doc).collect(); assert_eq!(bool_vals.len(), 2); @@ -2123,7 +2125,11 @@ mod tests { // test facets for segment_reader in searcher.segment_readers().iter() { let facet_reader = segment_reader.facet_reader("facet").unwrap(); - let ff_reader = segment_reader.fast_fields().u64("id").unwrap(); + let ff_reader = segment_reader + .fast_fields() + .u64("id") + .unwrap() + .first_or_default_col(0); for doc_id in segment_reader.doc_ids_alive() { let facet_ords: Vec = facet_reader.facet_ords(doc_id).collect(); assert_eq!(facet_ords.len(), 1); diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index ab939b52f..cf8123e2d 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -183,17 +183,17 @@ mod tests { let fast_fields = segment_reader.fast_fields(); let fast_field = fast_fields.u64("intval").unwrap(); - assert_eq!(fast_field.get_val(5), 1u64); - assert_eq!(fast_field.get_val(4), 2u64); - assert_eq!(fast_field.get_val(3), 3u64); + assert_eq!(fast_field.first(5), Some(1u64)); + assert_eq!(fast_field.first(4), Some(2u64)); + assert_eq!(fast_field.first(3), Some(3u64)); if force_disjunct_segment_sort_values { - assert_eq!(fast_field.get_val(2), 20u64); - assert_eq!(fast_field.get_val(1), 100u64); + assert_eq!(fast_field.first(2), Some(20u64)); + assert_eq!(fast_field.first(1), Some(100u64)); } else { - assert_eq!(fast_field.get_val(2), 10u64); - assert_eq!(fast_field.get_val(1), 20u64); + assert_eq!(fast_field.first(2), Some(10u64)); + assert_eq!(fast_field.first(1), Some(20u64)); } - assert_eq!(fast_field.get_val(0), 1_000u64); + assert_eq!(fast_field.first(0), Some(1_000u64)); // test new field norm mapping { diff --git a/src/lib.rs b/src/lib.rs index ed6569442..e62fe80c8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -911,21 +911,21 @@ pub mod tests { let fast_field_reader_opt = segment_reader.fast_fields().u64("unsigned"); assert!(fast_field_reader_opt.is_ok()); let fast_field_reader = fast_field_reader_opt.unwrap(); - assert_eq!(fast_field_reader.get_val(0), 4u64) + assert_eq!(fast_field_reader.first(0), Some(4u64)) } { let fast_field_reader_res = segment_reader.fast_fields().i64("signed"); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); - assert_eq!(fast_field_reader.get_val(0), 4i64) + assert_eq!(fast_field_reader.first(0), Some(4i64)) } { let fast_field_reader_res = segment_reader.fast_fields().f64("float"); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); - assert_eq!(fast_field_reader.get_val(0), 4f64) + assert_eq!(fast_field_reader.first(0), Some(4f64)) } Ok(()) }