migrate to OptionalColumn

This commit is contained in:
Pascal Seitz
2022-11-18 12:29:49 +08:00
parent c4474fef3a
commit 191fe5101b
15 changed files with 113 additions and 85 deletions

View File

@@ -9,7 +9,7 @@
use std::sync::Arc;
use fastfield_codecs::Column;
use fastfield_codecs::OptionalColumn;
// ---
// Importing tantivy...
use tantivy::collector::{Collector, SegmentCollector};
@@ -97,7 +97,7 @@ impl Collector for StatsCollector {
}
struct StatsSegmentCollector {
fast_field_reader: Arc<dyn Column<u64>>,
fast_field_reader: Arc<dyn OptionalColumn<u64>>,
stats: Stats,
}
@@ -105,10 +105,12 @@ impl SegmentCollector for StatsSegmentCollector {
type Fruit = Option<Stats>;
fn collect(&mut self, doc: u32, _score: Score) {
let value = self.fast_field_reader.get_val(doc) as f64;
self.stats.count += 1;
self.stats.sum += value;
self.stats.squared_sum += value * value;
if let Some(value) = self.fast_field_reader.get_val(doc) {
let value = value as f64;
self.stats.count += 1;
self.stats.sum += value;
self.stats.squared_sum += value * value;
}
}
fn harvest(self) -> <Self as SegmentCollector>::Fruit {

View File

@@ -51,7 +51,7 @@ impl Warmer for DynamicPriceColumn {
let product_id_reader = segment.fast_fields().u64(self.field)?;
let product_ids: Vec<ProductId> = segment
.doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc))
.flat_map(|doc| product_id_reader.get_val(doc))
.collect();
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new();

View File

@@ -103,7 +103,7 @@ mod tests {
let iter_gen = || data.iter().cloned();
serialize_u128(iter_gen, data.len() as u32, &mut out).unwrap();
let out = OwnedBytes::new(out);
open_u128::<u128>(out).unwrap()
open_u128::<u128>(out).unwrap().to_full().unwrap()
}
#[bench]

View File

@@ -731,7 +731,10 @@ mod tests {
];
let mut out = Vec::new();
serialize_u128(|| vals.iter().cloned(), vals.len() as u32, &mut out).unwrap();
let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap();
let decomp = open_u128::<u128>(OwnedBytes::new(out))
.unwrap()
.to_full()
.unwrap();
let complete_range = 0..vals.len() as u32;
assert_eq!(

View File

@@ -133,13 +133,16 @@ impl U128FastFieldCodecType {
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128<Item: MonotonicallyMappableToU128>(
mut bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<Item>>> {
) -> io::Result<Arc<dyn OptionalColumn<Item>>> {
let header = U128Header::deserialize(&mut bytes)?;
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
let reader = CompactSpaceDecompressor::open(bytes)?;
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
StrictlyMonotonicMappingToInternal::<Item>::new().into();
Ok(Arc::new(monotonic_map_column(reader, inverted)))
Ok(Arc::new(ToOptionalColumn::new(Arc::new(
monotonic_map_column(reader, inverted),
))))
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.

View File

@@ -113,7 +113,10 @@ fn bench_ip() {
(data.len() * 8) as f32 / dataset.len() as f32
);
let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap();
let decompressor = open_u128::<u128>(OwnedBytes::new(data))
.unwrap()
.to_full()
.unwrap();
// Sample some ranges
let mut doc_values = Vec::new();
for value in dataset.iter().take(1110).skip(1100).cloned() {

View File

@@ -1,6 +1,6 @@
use std::sync::Arc;
use fastfield_codecs::Column;
use fastfield_codecs::OptionalColumn;
use super::*;
use crate::collector::{Count, FilterCollector, TopDocs};
@@ -160,7 +160,7 @@ pub struct FastFieldTestCollector {
pub struct FastFieldSegmentCollector {
vals: Vec<u64>,
reader: Arc<dyn Column<u64>>,
reader: Arc<dyn OptionalColumn<u64>>,
}
impl FastFieldTestCollector {
@@ -202,7 +202,9 @@ impl SegmentCollector for FastFieldSegmentCollector {
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get_val(doc);
self.vals.push(val);
if let Some(val) = val {
self.vals.push(val);
}
}
fn harvest(self) -> Vec<u64> {

View File

@@ -460,7 +460,7 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId, original_score: Score| {
/// let popularity: u64 = popularity_reader.get_val(doc);
/// let popularity: u64 = popularity_reader.get_val(doc).unwrap();
/// // Well.. For the sake of the example we use a simple logarithm
/// // function.
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
@@ -569,8 +569,8 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId| {
/// let popularity: u64 = popularity_reader.get_val(doc);
/// let boosted: u64 = boosted_reader.get_val(doc);
/// let popularity: u64 = popularity_reader.get_val(doc).unwrap();
/// let boosted: u64 = boosted_reader.get_val(doc).unwrap();
/// // Score do not have to be `f64` in tantivy.
/// // Here we return a couple to get lexicographical order
/// // for free.

View File

@@ -151,18 +151,15 @@ impl FastFieldReaders {
/// Returns the `u64` fast field reader reader associated with `field`.
///
/// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> {
pub fn u64(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<u64>>> {
self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
Ok(self
.typed_fast_field_reader(field)?
.to_full()
.expect("temp migration solution"))
self.typed_fast_field_reader(field)
}
/// Returns the `ip` fast field reader reader associated to `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub fn ip_addr(&self, field: Field) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
pub fn ip_addr(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<Ipv6Addr>>> {
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
Ok(open_u128::<Ipv6Addr>(bytes)?)
@@ -182,7 +179,9 @@ impl FastFieldReaders {
.expect("multivalue fast fields are always full");
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<Ipv6Addr>(bytes)?;
let vals_reader = open_u128::<Ipv6Addr>(bytes)?
.to_full()
.expect("multivalue fields are always full");
Ok(MultiValuedU128FastFieldReader::open(
idx_reader,
@@ -192,8 +191,9 @@ impl FastFieldReaders {
/// Returns the `u128` fast field reader reader associated to `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub(crate) fn u128(&self, field: Field) -> crate::Result<Arc<dyn Column<u128>>> {
/// If `field` is not a u128 base type fast field, this method returns an Error.
/// Ip addresses use u128 as base type.
pub(crate) fn u128(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<u128>>> {
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
Ok(open_u128::<u128>(bytes)?)
@@ -210,7 +210,9 @@ impl FastFieldReaders {
.expect("multivalue fast fields are always full");
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<u128>(bytes)?;
let vals_reader = open_u128::<u128>(bytes)?
.to_full()
.expect("multivalue fast fields are always full");
Ok(MultiValuedU128FastFieldReader::open(
idx_reader,
@@ -246,12 +248,9 @@ impl FastFieldReaders {
/// Returns the `f64` fast field reader reader associated with `field`.
///
/// If `field` is not a f64 fast field, this method returns an Error.
pub fn f64(&self, field: Field) -> crate::Result<Arc<dyn Column<f64>>> {
pub fn f64(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<f64>>> {
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
Ok(self
.typed_fast_field_reader(field)?
.to_full()
.expect("temp migration solution"))
Ok(self.typed_fast_field_reader(field)?)
}
/// Returns the `bool` fast field reader reader associated with `field`.

View File

@@ -465,9 +465,9 @@ mod tests_indexsorting {
let my_number = index.schema().get_field("my_number").unwrap();
let fast_field = fast_fields.u64(my_number).unwrap();
assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), 30u64);
assert_eq!(fast_field.get_val(0), Some(10u64));
assert_eq!(fast_field.get_val(1), Some(20u64));
assert_eq!(fast_field.get_val(2), Some(30u64));
let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
let multifield = fast_fields.u64s(multi_numbers).unwrap();

View File

@@ -1467,7 +1467,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc))
.map(|doc| fast_field_reader.get_val(doc).unwrap())
.collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
Ok(())
@@ -1528,7 +1528,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc))
.map(|doc| fast_field_reader.get_val(doc).unwrap())
.collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]);
Ok(())
@@ -1777,7 +1777,12 @@ mod tests {
.segment_readers()
.iter()
.flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
let ff_reader = segment_reader
.fast_fields()
.u64(id_field)
.unwrap()
.to_full()
.unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc))
@@ -1788,7 +1793,12 @@ mod tests {
.segment_readers()
.iter()
.flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
let ff_reader = segment_reader
.fast_fields()
.u64(id_field)
.unwrap()
.to_full()
.unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc))
@@ -1864,7 +1874,7 @@ mod tests {
.flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().ip_addr(ip_field).unwrap();
segment_reader.doc_ids_alive().flat_map(move |doc| {
let val = ff_reader.get_val(doc);
let val = ff_reader.get_val(doc).unwrap(); // TODO handle null
if val == Ipv6Addr::from_u128(0) {
// TODO Fix null handling
None
@@ -1921,7 +1931,7 @@ mod tests {
ff_reader.get_vals(doc, &mut vals);
assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]);
assert_eq!(id_reader.get_val(doc), vals[0]);
assert_eq!(id_reader.get_val(doc).unwrap(), vals[0]);
let mut bool_vals = vec![];
bool_ff_reader.get_vals(doc, &mut bool_vals);
@@ -2117,7 +2127,7 @@ mod tests {
facet_reader
.facet_from_ord(facet_ords[0], &mut facet)
.unwrap();
let id = ff_reader.get_val(doc_id);
let id = ff_reader.get_val(doc_id).unwrap();
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));
assert_eq!(facet, facet_expected);

View File

@@ -401,10 +401,15 @@ impl IndexMerger {
.readers
.iter()
.map(|reader| {
let u128_reader: Arc<dyn Column<u128>> = reader.fast_fields().u128(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and it \
should never happen.",
);
let u128_reader: Arc<dyn Column<u128>> = reader
.fast_fields()
.u128(field)
.expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.",
)
.to_full()
.expect("temp migration solution");
u128_reader
})
.collect::<Vec<_>>();
@@ -1372,16 +1377,16 @@ mod tests {
.fast_fields()
.u64(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 4000);
assert_eq!(score_field_reader.max_value(), 7000);
assert_eq!(score_field_reader.min_value(), Some(4000));
assert_eq!(score_field_reader.max_value(), Some(7000));
let score_field_reader = searcher
.segment_reader(1)
.fast_fields()
.u64(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 1);
assert_eq!(score_field_reader.max_value(), 3);
assert_eq!(score_field_reader.min_value(), Some(1));
assert_eq!(score_field_reader.max_value(), Some(3));
}
{
// merging the segments
@@ -1426,8 +1431,8 @@ mod tests {
.fast_fields()
.u64(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 3);
assert_eq!(score_field_reader.max_value(), 7000);
assert_eq!(score_field_reader.min_value(), Some(3));
assert_eq!(score_field_reader.max_value(), Some(7000));
}
{
// test a commit with only deletes
@@ -1473,8 +1478,8 @@ mod tests {
.fast_fields()
.u64(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 3);
assert_eq!(score_field_reader.max_value(), 7000);
assert_eq!(score_field_reader.min_value(), Some(3));
assert_eq!(score_field_reader.max_value(), Some(7000));
}
{
// Test merging a single segment in order to remove deletes.
@@ -1520,8 +1525,8 @@ mod tests {
.fast_fields()
.u64(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 6000);
assert_eq!(score_field_reader.max_value(), 7000);
assert_eq!(score_field_reader.min_value(), Some(6000));
assert_eq!(score_field_reader.max_value(), Some(7000));
}
{

View File

@@ -186,17 +186,17 @@ mod tests {
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64(int_field).unwrap();
assert_eq!(fast_field.get_val(5), 1u64);
assert_eq!(fast_field.get_val(4), 2u64);
assert_eq!(fast_field.get_val(3), 3u64);
assert_eq!(fast_field.get_val(5), Some(1u64));
assert_eq!(fast_field.get_val(4), Some(2u64));
assert_eq!(fast_field.get_val(3), Some(3u64));
if force_disjunct_segment_sort_values {
assert_eq!(fast_field.get_val(2), 20u64);
assert_eq!(fast_field.get_val(1), 100u64);
assert_eq!(fast_field.get_val(2), Some(20u64));
assert_eq!(fast_field.get_val(1), Some(100u64));
} else {
assert_eq!(fast_field.get_val(2), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), Some(10u64));
assert_eq!(fast_field.get_val(1), Some(20u64));
}
assert_eq!(fast_field.get_val(0), 1_000u64);
assert_eq!(fast_field.get_val(0), Some(1_000u64));
// test new field norm mapping
{
@@ -373,12 +373,12 @@ mod tests {
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64(int_field).unwrap();
assert_eq!(fast_field.get_val(0), 1u64);
assert_eq!(fast_field.get_val(1), 2u64);
assert_eq!(fast_field.get_val(2), 3u64);
assert_eq!(fast_field.get_val(3), 10u64);
assert_eq!(fast_field.get_val(4), 20u64);
assert_eq!(fast_field.get_val(5), 1_000u64);
assert_eq!(fast_field.get_val(0), Some(1u64));
assert_eq!(fast_field.get_val(1), Some(2u64));
assert_eq!(fast_field.get_val(2), Some(3u64));
assert_eq!(fast_field.get_val(3), Some(10u64));
assert_eq!(fast_field.get_val(4), Some(20u64));
assert_eq!(fast_field.get_val(5), Some(1_000u64));
let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
let mut vals = vec![];

View File

@@ -1037,7 +1037,7 @@ pub mod tests {
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_unsigned);
assert!(fast_field_reader_opt.is_ok());
let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get_val(0), 4u64)
assert_eq!(fast_field_reader.get_val(0), Some(4u64))
}
{
@@ -1051,7 +1051,7 @@ pub mod tests {
let fast_field_reader_res = segment_reader.fast_fields().f64(fast_field_float);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get_val(0), 4f64)
assert_eq!(fast_field_reader.get_val(0), Some(4f64))
}
Ok(())
}

View File

@@ -7,7 +7,7 @@ use std::ops::{Bound, RangeInclusive};
use std::sync::Arc;
use common::BinarySerializable;
use fastfield_codecs::{Column, MonotonicallyMappableToU128};
use fastfield_codecs::{MonotonicallyMappableToU128, OptionalColumn};
use super::range_query::map_bound;
use super::{ConstScorer, Explanation, Scorer, Weight};
@@ -45,12 +45,10 @@ impl Weight for IPFastFieldRangeWeight {
match field_type.fastfield_cardinality().unwrap() {
Cardinality::SingleValue => {
let ip_addr_fast_field = reader.fast_fields().ip_addr(self.field)?;
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
ip_addr_fast_field.min_value(),
ip_addr_fast_field.max_value(),
);
let minmax = ip_addr_fast_field
.min_value()
.zip(ip_addr_fast_field.max_value());
let value_range = bound_to_value_range(&self.left_bound, &self.right_bound, minmax);
let docset = IpRangeDocSet::new(
value_range,
IpFastFieldCardinality::SingleValue(ip_addr_fast_field),
@@ -62,8 +60,10 @@ impl Weight for IPFastFieldRangeWeight {
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
ip_addr_fast_field.min_value(),
ip_addr_fast_field.max_value(),
Some((
ip_addr_fast_field.min_value(),
ip_addr_fast_field.max_value(),
)),
);
let docset = IpRangeDocSet::new(
value_range,
@@ -91,9 +91,10 @@ impl Weight for IPFastFieldRangeWeight {
fn bound_to_value_range(
left_bound: &Bound<Ipv6Addr>,
right_bound: &Bound<Ipv6Addr>,
min_value: Ipv6Addr,
max_value: Ipv6Addr,
min_max: Option<(Ipv6Addr, Ipv6Addr)>,
) -> RangeInclusive<Ipv6Addr> {
let (min_value, max_value) =
min_max.unwrap_or((Ipv6Addr::from(u128::MIN), Ipv6Addr::from(u128::MAX)));
let start_value = match left_bound {
Bound::Included(ip_addr) => *ip_addr,
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() + 1),
@@ -142,7 +143,7 @@ impl VecCursor {
}
pub(crate) enum IpFastFieldCardinality {
SingleValue(Arc<dyn Column<Ipv6Addr>>),
SingleValue(Arc<dyn OptionalColumn<Ipv6Addr>>),
MultiValue(MultiValuedU128FastFieldReader<Ipv6Addr>),
}