switch to new ff api (#1868)

This commit is contained in:
PSeitz
2023-02-14 15:57:32 +08:00
committed by GitHub
parent b60b7d2afe
commit 01e5a22759
9 changed files with 140 additions and 71 deletions

View File

@@ -7,9 +7,7 @@
// Of course, you can have a look at tantivy's built-in collectors
// such as the `CountCollector` for more examples.
use std::sync::Arc;
use columnar::column_values::ColumnValues;
use columnar::Column;
// ---
// Importing tantivy...
use tantivy::collector::{Collector, SegmentCollector};
@@ -97,7 +95,7 @@ impl Collector for StatsCollector {
}
struct StatsSegmentCollector {
fast_field_reader: Arc<dyn ColumnValues>,
fast_field_reader: Column,
stats: Stats,
}
@@ -105,11 +103,15 @@ impl SegmentCollector for StatsSegmentCollector {
type Fruit = Option<Stats>;
fn collect(&mut self, doc: u32, _score: Score) {
let value = self.fast_field_reader.get_val(doc) as f64;
// Since we know the field is single-valued, we could instead call `first_or_default_col` on
// the column and fetch a single value per document.
for value in self.fast_field_reader.values(doc) {
let value = value as f64;
self.stats.count += 1;
self.stats.sum += value;
self.stats.squared_sum += value * value;
}
}
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
self.stats.non_zero_count()

View File

@@ -48,7 +48,10 @@ impl Warmer for DynamicPriceColumn {
fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
for segment in searcher.segment_readers() {
let key = (segment.segment_id(), segment.delete_opstamp());
let product_id_reader = segment.fast_fields().u64(&self.field)?;
let product_id_reader = segment
.fast_fields()
.u64(&self.field)?
.first_or_default_col(0);
let product_ids: Vec<ProductId> = segment
.doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc))

View File

@@ -1,6 +1,4 @@
use std::sync::Arc;
use columnar::{BytesColumn, ColumnValues};
use columnar::{BytesColumn, Column};
use super::*;
use crate::collector::{Count, FilterCollector, TopDocs};
@@ -160,7 +158,7 @@ pub struct FastFieldTestCollector {
pub struct FastFieldSegmentCollector {
vals: Vec<u64>,
reader: Arc<dyn columnar::ColumnValues>,
reader: Column,
}
impl FastFieldTestCollector {
@@ -203,8 +201,7 @@ impl SegmentCollector for FastFieldSegmentCollector {
type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get_val(doc);
self.vals.push(val);
self.vals.extend(self.reader.values(doc));
}
fn harvest(self) -> Vec<u64> {

View File

@@ -457,9 +457,10 @@ impl TopDocs {
/// // Typically, fast_fields.
/// //
/// // In our case, we will get a reader for the popularity
/// // fast field.
/// // fast field. For simplicity we read the first or default value in the fast
/// // field.
/// let popularity_reader =
/// segment_reader.fast_fields().u64("popularity").unwrap();
/// segment_reader.fast_fields().u64("popularity").unwrap().first_or_default_col(0);
///
/// // We can now define our actual scoring function
/// move |doc: DocId, original_score: Score| {
@@ -566,9 +567,9 @@ impl TopDocs {
/// // Note that this is implemented by using a `(u64, u64)`
/// // as a score.
/// let popularity_reader =
/// segment_reader.fast_fields().u64("popularity").unwrap();
/// segment_reader.fast_fields().u64("popularity").unwrap().first_or_default_col(0);
/// let boosted_reader =
/// segment_reader.fast_fields().u64("boosted").unwrap();
/// segment_reader.fast_fields().u64("boosted").unwrap().first_or_default_col(0);
///
/// // We can now define our actual scoring function
/// move |doc: DocId| {

View File

@@ -160,7 +160,10 @@ mod tests {
assert_eq!(file.len(), 161);
let fast_field_readers = FastFieldReaders::open(file).unwrap();
let column = fast_field_readers.u64("field").unwrap();
let column = fast_field_readers
.u64("field")
.unwrap()
.first_or_default_col(0);
assert_eq!(column.get_val(0), 13u64);
assert_eq!(column.get_val(1), 14u64);
assert_eq!(column.get_val(2), 2u64);
@@ -207,7 +210,10 @@ mod tests {
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 189);
let fast_field_readers = FastFieldReaders::open(file).unwrap();
let col = fast_field_readers.u64("field").unwrap();
let col = fast_field_readers
.u64("field")
.unwrap()
.first_or_default_col(0);
assert_eq!(col.get_val(0), 4u64);
assert_eq!(col.get_val(1), 14_082_001u64);
assert_eq!(col.get_val(2), 3_052u64);
@@ -237,7 +243,10 @@ mod tests {
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 162);
let fast_field_readers = FastFieldReaders::open(file).unwrap();
let fast_field_reader = fast_field_readers.u64("field").unwrap();
let fast_field_reader = fast_field_readers
.u64("field")
.unwrap()
.first_or_default_col(0);
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get_val(doc), 100_000u64);
}
@@ -267,7 +276,10 @@ mod tests {
assert_eq!(file.len(), 4557);
{
let fast_field_readers = FastFieldReaders::open(file).unwrap();
let col = fast_field_readers.u64("field").unwrap();
let col = fast_field_readers
.u64("field")
.unwrap()
.first_or_default_col(0);
for doc in 1..10_000 {
assert_eq!(col.get_val(doc), 5_000_000_000_000_000_000u64 + doc as u64);
}
@@ -298,7 +310,10 @@ mod tests {
{
let fast_field_readers = FastFieldReaders::open(file).unwrap();
let col = fast_field_readers.i64("field").unwrap();
let col = fast_field_readers
.i64("field")
.unwrap()
.first_or_default_col(0);
assert_eq!(col.min_value(), -100i64);
assert_eq!(col.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() {
@@ -333,7 +348,18 @@ mod tests {
let file = directory.open_read(path).unwrap();
let fast_field_readers = FastFieldReaders::open(file).unwrap();
let col = fast_field_readers.i64("field").unwrap();
assert_eq!(col.get_val(0), 0i64);
assert_eq!(col.first(0), None);
let col = fast_field_readers
.i64("field")
.unwrap()
.first_or_default_col(0);
assert_eq!(col.get_val(0), 0);
let col = fast_field_readers
.i64("field")
.unwrap()
.first_or_default_col(-100);
assert_eq!(col.get_val(0), -100);
}
#[test]
@@ -354,7 +380,10 @@ mod tests {
let file = directory.open_read(path).unwrap();
let fast_field_readers = FastFieldReaders::open(file).unwrap();
let col = fast_field_readers.date("date").unwrap();
let col = fast_field_readers
.date("date")
.unwrap()
.first_or_default_col(DateTime::default());
assert_eq!(col.get_val(0), DateTime::default());
}
@@ -387,7 +416,10 @@ mod tests {
}
let file = directory.open_read(path).unwrap();
let fast_field_readers = FastFieldReaders::open(file).unwrap();
let col = fast_field_readers.u64("field").unwrap();
let col = fast_field_readers
.u64("field")
.unwrap()
.first_or_default_col(0);
for a in 0..n {
assert_eq!(col.get_val(a as u32), permutation[a]);
}
@@ -771,10 +803,10 @@ mod tests {
assert_eq!(file.len(), 175);
let fast_field_readers = FastFieldReaders::open(file).unwrap();
let bool_col = fast_field_readers.bool("field_bool").unwrap();
assert_eq!(bool_col.get_val(0), true);
assert_eq!(bool_col.get_val(1), false);
assert_eq!(bool_col.get_val(2), true);
assert_eq!(bool_col.get_val(3), false);
assert_eq!(bool_col.first(0), Some(true));
assert_eq!(bool_col.first(1), Some(false));
assert_eq!(bool_col.first(2), Some(true));
assert_eq!(bool_col.first(3), Some(false));
}
#[test]
@@ -804,8 +836,8 @@ mod tests {
let readers = FastFieldReaders::open(file).unwrap();
let bool_col = readers.bool("field_bool").unwrap();
for i in 0..25 {
assert_eq!(bool_col.get_val(i * 2), true);
assert_eq!(bool_col.get_val(i * 2 + 1), false);
assert_eq!(bool_col.first(i * 2), Some(true));
assert_eq!(bool_col.first(i * 2 + 1), Some(false));
}
}
@@ -828,7 +860,17 @@ mod tests {
assert_eq!(file.len(), 177);
let fastfield_readers = FastFieldReaders::open(file).unwrap();
let col = fastfield_readers.bool("field_bool").unwrap();
assert_eq!(col.first(0), None);
let col = fastfield_readers
.bool("field_bool")
.unwrap()
.first_or_default_col(false);
assert_eq!(col.get_val(0), false);
let col = fastfield_readers
.bool("field_bool")
.unwrap()
.first_or_default_col(true);
assert_eq!(col.get_val(0), true);
}
fn get_index(docs: &[crate::Document], schema: &Schema) -> crate::Result<RamDirectory> {
@@ -882,7 +924,7 @@ mod tests {
let col = readers.date("field").unwrap();
for (i, time) in times.iter().enumerate() {
let dt: DateTime = col.get_val(i as u32).into();
let dt: DateTime = col.first(i as u32).unwrap().into();
assert_eq!(dt, time.truncate(precision));
}
readers.column_num_bytes("field").unwrap()
@@ -918,7 +960,11 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment = &searcher.segment_readers()[0];
let field = segment.fast_fields().u64("url_norm_hash").unwrap();
let field = segment
.fast_fields()
.u64("url_norm_hash")
.unwrap()
.first_or_default_col(0);
let numbers = vec![100, 200, 300];
let test_range = |range: RangeInclusive<u64>| {
@@ -988,7 +1034,11 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment = &searcher.segment_readers()[0];
let field = segment.fast_fields().u64("url_norm_hash").unwrap();
let field = segment
.fast_fields()
.u64("url_norm_hash")
.unwrap()
.first_or_default_col(0);
let numbers = vec![1000, 1001, 1003];
let test_range = |range: RangeInclusive<u64>| {

View File

@@ -85,35 +85,45 @@ impl FastFieldReaders {
T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + 'static,
DynamicColumn: Into<Option<Column<T>>>,
{
let col_opt: Option<Column<T>> = self.column_opt(field)?;
if let Some(col) = col_opt {
let col: Column<T> = self.column(field)?;
Ok(col.first_or_default_col(T::default_value()))
} else {
Err(crate::TantivyError::SchemaError(format!(
"Field `{field}` is missing or is not configured as a fast field."
)))
}
/// Looks up the typed column stored under `field`.
///
/// # Errors
///
/// Propagates any error raised by `column_opt`, and returns a
/// `TantivyError::SchemaError` when `column_opt` yields no column for `field`.
pub fn column<T>(&self, field: &str) -> crate::Result<Column<T>>
where
    T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + 'static,
    DynamicColumn: Into<Option<Column<T>>>,
{
    let maybe_column: Option<Column<T>> = self.column_opt(field)?;
    match maybe_column {
        Some(column) => Ok(column),
        // The error message is only built on the failure path.
        None => Err(crate::TantivyError::SchemaError(format!(
            "Field `{field}` is missing or is not configured as a fast field."
        ))),
    }
}
/// Returns the `u64` fast field reader associated with `field`.
///
/// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<u64>>> {
self.column_first_or_default(field)
pub fn u64(&self, field: &str) -> crate::Result<Column<u64>> {
self.column(field)
}
/// Returns the `date` fast field reader associated with `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<common::DateTime>>> {
self.column_first_or_default(field)
pub fn date(&self, field: &str) -> crate::Result<Column<common::DateTime>> {
self.column(field)
}
/// Returns the `ip` fast field reader associated with `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub fn ip_addr(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<Ipv6Addr>>> {
self.column_first_or_default(field)
pub fn ip_addr(&self, field: &str) -> crate::Result<Column<Ipv6Addr>> {
self.column(field)
}
/// Returns a `str` column.
@@ -165,21 +175,21 @@ impl FastFieldReaders {
/// Returns the `i64` fast field reader associated with `field_name`.
///
/// If `field_name` is not an i64 fast field, this method returns an Error.
pub fn i64(&self, field_name: &str) -> crate::Result<Arc<dyn ColumnValues<i64>>> {
self.column_first_or_default(field_name)
pub fn i64(&self, field_name: &str) -> crate::Result<Column<i64>> {
self.column(field_name)
}
/// Returns the `f64` fast field reader associated with `field_name`.
///
/// If `field_name` is not an f64 fast field, this method returns an Error.
pub fn f64(&self, field_name: &str) -> crate::Result<Arc<dyn ColumnValues<f64>>> {
self.column_first_or_default(field_name)
pub fn f64(&self, field_name: &str) -> crate::Result<Column<f64>> {
self.column(field_name)
}
/// Returns the `bool` fast field reader associated with `field_name`.
///
/// If `field_name` is not a bool fast field, this method returns an Error.
pub fn bool(&self, field_name: &str) -> crate::Result<Arc<dyn ColumnValues<bool>>> {
self.column_first_or_default(field_name)
pub fn bool(&self, field_name: &str) -> crate::Result<Column<bool>> {
self.column(field_name)
}
}

View File

@@ -802,6 +802,7 @@ mod tests {
use std::net::Ipv6Addr;
use columnar::{Cardinality, Column, MonotonicallyMappableToU128};
use itertools::Itertools;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
@@ -1486,9 +1487,10 @@ mod tests {
assert_eq!(segment_reader.num_docs(), 8);
assert_eq!(segment_reader.max_doc(), 10);
let fast_field_reader = segment_reader.fast_fields().u64("id")?;
let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc))
.flat_map(|doc| fast_field_reader.values(doc))
.collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
Ok(())
@@ -1548,7 +1550,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64("id")?;
let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc))
.flat_map(|doc| fast_field_reader.values(doc))
.collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]);
Ok(())
@@ -1793,7 +1795,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64("id").unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc))
.flat_map(move |doc| ff_reader.values(doc).collect_vec().into_iter())
})
.collect();
@@ -1804,7 +1806,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64("id").unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc))
.flat_map(move |doc| ff_reader.values(doc).collect_vec().into_iter())
})
.collect();
@@ -1936,7 +1938,7 @@ mod tests {
let vals: Vec<u64> = ff_reader.values(doc).collect();
assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]);
assert_eq!(id_reader.get_val(doc), vals[0]);
assert_eq!(id_reader.first(doc), Some(vals[0]));
let bool_vals: Vec<bool> = bool_ff_reader.values(doc).collect();
assert_eq!(bool_vals.len(), 2);
@@ -2123,7 +2125,11 @@ mod tests {
// test facets
for segment_reader in searcher.segment_readers().iter() {
let facet_reader = segment_reader.facet_reader("facet").unwrap();
let ff_reader = segment_reader.fast_fields().u64("id").unwrap();
let ff_reader = segment_reader
.fast_fields()
.u64("id")
.unwrap()
.first_or_default_col(0);
for doc_id in segment_reader.doc_ids_alive() {
let facet_ords: Vec<u64> = facet_reader.facet_ords(doc_id).collect();
assert_eq!(facet_ords.len(), 1);

View File

@@ -183,17 +183,17 @@ mod tests {
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64("intval").unwrap();
assert_eq!(fast_field.get_val(5), 1u64);
assert_eq!(fast_field.get_val(4), 2u64);
assert_eq!(fast_field.get_val(3), 3u64);
assert_eq!(fast_field.first(5), Some(1u64));
assert_eq!(fast_field.first(4), Some(2u64));
assert_eq!(fast_field.first(3), Some(3u64));
if force_disjunct_segment_sort_values {
assert_eq!(fast_field.get_val(2), 20u64);
assert_eq!(fast_field.get_val(1), 100u64);
assert_eq!(fast_field.first(2), Some(20u64));
assert_eq!(fast_field.first(1), Some(100u64));
} else {
assert_eq!(fast_field.get_val(2), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.first(2), Some(10u64));
assert_eq!(fast_field.first(1), Some(20u64));
}
assert_eq!(fast_field.get_val(0), 1_000u64);
assert_eq!(fast_field.first(0), Some(1_000u64));
// test new field norm mapping
{

View File

@@ -911,21 +911,21 @@ pub mod tests {
let fast_field_reader_opt = segment_reader.fast_fields().u64("unsigned");
assert!(fast_field_reader_opt.is_ok());
let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get_val(0), 4u64)
assert_eq!(fast_field_reader.first(0), Some(4u64))
}
{
let fast_field_reader_res = segment_reader.fast_fields().i64("signed");
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get_val(0), 4i64)
assert_eq!(fast_field_reader.first(0), Some(4i64))
}
{
let fast_field_reader_res = segment_reader.fast_fields().f64("float");
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get_val(0), 4f64)
assert_eq!(fast_field_reader.first(0), Some(4f64))
}
Ok(())
}