prepare for multiple fastfield codecs (#1063)

* prepare for multiple fastfield codecs

 prepare for multiple fastfield codecs by wrapping the codecs in an enum #1042

* add FastFieldSerializer trait, add DynamicFastFieldSerializer

add FastFieldSerializer trait
add DynamicFastFieldSerializer enum to wrap all implementors of the FastFieldSerializer trait

* add estimation for fastfield bitpacker
This commit is contained in:
PSeitz
2021-05-31 16:14:14 +02:00
committed by GitHub
parent 8d32c3ba3a
commit dff0ffd38a
19 changed files with 336 additions and 115 deletions

View File

@@ -10,7 +10,7 @@
// ---
// Importing tantivy...
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::fastfield::FastFieldReader;
use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader};
use tantivy::query::QueryParser;
use tantivy::schema::Field;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
@@ -98,7 +98,7 @@ impl Collector for StatsCollector {
}
struct StatsSegmentCollector {
fast_field_reader: FastFieldReader<u64>,
fast_field_reader: DynamicFastFieldReader<u64>,
stats: Stats,
}

View File

@@ -12,7 +12,7 @@
use std::marker::PhantomData;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::{FastFieldReader, FastValue};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
@@ -155,7 +155,7 @@ where
TPredicate: 'static,
TPredicateValue: FastValue,
{
fast_field_reader: FastFieldReader<TPredicateValue>,
fast_field_reader: DynamicFastFieldReader<TPredicateValue>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,

View File

@@ -1,5 +1,5 @@
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::{FastFieldReader, FastValue};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
use crate::schema::{Field, Type};
use crate::{DocId, Score};
use fastdivide::DividerU64;
@@ -84,7 +84,7 @@ impl HistogramComputer {
}
pub struct SegmentHistogramCollector {
histogram_computer: HistogramComputer,
ff_reader: FastFieldReader<u64>,
ff_reader: DynamicFastFieldReader<u64>,
}
impl SegmentCollector for SegmentHistogramCollector {

View File

@@ -1,6 +1,7 @@
use super::*;
use crate::core::SegmentReader;
use crate::fastfield::BytesFastFieldReader;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::FastFieldReader;
use crate::schema::Field;
use crate::DocId;
@@ -162,7 +163,7 @@ pub struct FastFieldTestCollector {
pub struct FastFieldSegmentCollector {
vals: Vec<u64>,
reader: FastFieldReader<u64>,
reader: DynamicFastFieldReader<u64>,
}
impl FastFieldTestCollector {

View File

@@ -4,7 +4,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
};
use crate::fastfield::FastFieldReader;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::query::Weight;
use crate::schema::Field;
use crate::DocAddress;
@@ -129,7 +129,7 @@ impl fmt::Debug for TopDocs {
}
struct ScorerByFastFieldReader {
ff_reader: FastFieldReader<u64>,
ff_reader: DynamicFastFieldReader<u64>,
}
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
@@ -151,7 +151,7 @@ impl CustomScorer<u64> for ScorerByField {
// mapping is monotonic, so it is sufficient to compute our top-K docs.
//
// The conversion will then happen only on the top-K docs.
let ff_reader: FastFieldReader<u64> = segment_reader
let ff_reader = segment_reader
.fast_fields()
.typed_fast_field_reader(self.field)?;
Ok(ScorerByFastFieldReader { ff_reader })
@@ -401,6 +401,7 @@ impl TopDocs {
/// # use tantivy::query::QueryParser;
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::fastfield::FastFieldReader;
/// use tantivy::schema::Field;
///
/// fn create_schema() -> Schema {
@@ -508,6 +509,7 @@ impl TopDocs {
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::schema::Field;
/// use tantivy::fastfield::FastFieldReader;
///
/// # fn create_schema() -> Schema {
/// # let mut schema_builder = Schema::builder();

View File

@@ -1,7 +1,7 @@
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::fastfield::FastFieldReader;
use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, MultiValueLength};
use crate::DocId;
use crate::{directory::FileSlice, fastfield::MultiValueLength};
/// Reader for byte array fast fields
///
@@ -15,13 +15,13 @@ use crate::{directory::FileSlice, fastfield::MultiValueLength};
/// and the start index for the next document, and keeping the bytes in between.
#[derive(Clone)]
pub struct BytesFastFieldReader {
idx_reader: FastFieldReader<u64>,
idx_reader: BitpackedFastFieldReader<u64>,
values: OwnedBytes,
}
impl BytesFastFieldReader {
pub(crate) fn open(
idx_reader: FastFieldReader<u64>,
idx_reader: BitpackedFastFieldReader<u64>,
values_file: FileSlice,
) -> crate::Result<BytesFastFieldReader> {
let values = values_file.read_bytes()?;

View File

@@ -1,8 +1,11 @@
use std::io;
use crate::fastfield::serializer::FastFieldSerializer;
use crate::schema::{Document, Field, Value};
use crate::DocId;
use crate::{fastfield::serializer::FastFieldSerializer, indexer::doc_id_mapping::DocIdMapping};
use crate::{
fastfield::serializer::CompositeFastFieldSerializer, indexer::doc_id_mapping::DocIdMapping,
};
/// Writer for byte array (as in, any number of bytes per document) fast fields
///
@@ -104,7 +107,7 @@ impl BytesFastFieldWriter {
/// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
// writing the offset index

View File

@@ -29,8 +29,11 @@ pub use self::delete::DeleteBitSet;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
pub use self::reader::BitpackedFastFieldReader;
pub use self::reader::DynamicFastFieldReader;
pub use self::reader::FastFieldReader;
pub use self::readers::FastFieldReaders;
pub use self::serializer::CompositeFastFieldSerializer;
pub use self::serializer::FastFieldSerializer;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::Cardinality;
@@ -211,7 +214,7 @@ mod tests {
use super::*;
use crate::common::CompositeFile;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::FastFieldReader;
use crate::fastfield::BitpackedFastFieldReader;
use crate::merge_policy::NoMergePolicy;
use crate::schema::Field;
use crate::schema::Schema;
@@ -236,7 +239,7 @@ mod tests {
#[test]
pub fn test_fastfield() {
let test_fastfield = FastFieldReader::<u64>::from(vec![100, 200, 300]);
let test_fastfield = BitpackedFastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);
@@ -254,7 +257,7 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers.add_document(&doc!(*FIELD=>13u64));
fast_field_writers.add_document(&doc!(*FIELD=>14u64));
@@ -268,7 +271,7 @@ mod tests {
assert_eq!(file.len(), 36 as usize);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(file)?;
let fast_field_reader = BitpackedFastFieldReader::<u64>::open(file)?;
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
@@ -281,7 +284,7 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = FastFieldSerializer::from_write(write)?;
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers.add_document(&doc!(*FIELD=>4u64));
fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
@@ -300,7 +303,7 @@ mod tests {
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
@@ -321,7 +324,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for _ in 0..10_000 {
fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
@@ -336,7 +339,7 @@ mod tests {
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u64);
}
@@ -351,7 +354,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
// forcing the amplitude to be high
fast_field_writers.add_document(&doc!(*FIELD=>0u64));
@@ -368,7 +371,7 @@ mod tests {
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(
@@ -390,7 +393,7 @@ mod tests {
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for i in -100i64..10_000i64 {
let mut doc = Document::default();
@@ -407,7 +410,7 @@ mod tests {
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = FastFieldReader::<i64>::open(data)?;
let fast_field_reader = BitpackedFastFieldReader::<i64>::open(data)?;
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
@@ -433,7 +436,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let doc = Document::default();
fast_field_writers.add_document(&doc);
@@ -447,7 +450,7 @@ mod tests {
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = FastFieldReader::<i64>::open(data)?;
let fast_field_reader = BitpackedFastFieldReader::<i64>::open(data)?;
assert_eq!(fast_field_reader.get(0u32), 0i64);
}
Ok(())
@@ -468,7 +471,7 @@ mod tests {
let directory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = FastFieldSerializer::from_write(write)?;
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
@@ -480,7 +483,7 @@ mod tests {
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
let mut a = 0u64;
for _ in 0..n {

View File

@@ -1,6 +1,6 @@
use std::ops::Range;
use crate::fastfield::{FastFieldReader, FastValue, MultiValueLength};
use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
use crate::DocId;
/// Reader for a multivalued `u64` fast field.
@@ -13,14 +13,14 @@ use crate::DocId;
///
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: FastFieldReader<u64>,
vals_reader: FastFieldReader<Item>,
idx_reader: BitpackedFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
}
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: FastFieldReader<u64>,
vals_reader: FastFieldReader<Item>,
idx_reader: BitpackedFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,

View File

@@ -1,5 +1,6 @@
use crate::fastfield::serializer::FastSingleFieldSerializer;
use crate::fastfield::FastFieldSerializer;
use crate::fastfield::serializer::DynamicFastFieldSerializer;
use crate::fastfield::serializer::FastFieldSerializer;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field};
use crate::termdict::TermOrdinal;
@@ -134,7 +135,7 @@ impl MultiValuedFastFieldWriter {
///
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
@@ -154,7 +155,7 @@ impl MultiValuedFastFieldWriter {
}
{
// writing the values themselves.
let mut value_serializer: FastSingleFieldSerializer<'_, _>;
let mut value_serializer: DynamicFastFieldSerializer<'_, _>;
match mapping_opt {
Some(mapping) => {
value_serializer = serializer.new_u64_fast_field_with_idx(

View File

@@ -4,7 +4,7 @@ use crate::common::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
use crate::schema::FAST;
use crate::DocId;
@@ -14,12 +14,94 @@ use std::path::Path;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitUnpacker;
/// FastFieldReader is the trait to access fast field data.
pub trait FastFieldReader<Item: FastValue>: Clone {
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: DocId) -> Item;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// Regardless of the type of `Item`, this method works
/// - transmuting the output array
/// - extracting the `Item`s as if they were `u64`
/// - possibly converting the `u64` value to the right type.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]);
/// Returns the minimum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn min_value(&self) -> Item;
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn max_value(&self) -> Item;
}
#[derive(Clone)]
/// DynamicFastFieldReader wraps different readers to access
/// the various encoded fastfield data
///
pub enum DynamicFastFieldReader<Item: FastValue> {
/// Bitpacked compressed fastfield data.
Bitpacked(BitpackedFastFieldReader<Item>),
}
impl<Item: FastValue> DynamicFastFieldReader<Item> {
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
Ok(DynamicFastFieldReader::Bitpacked(
BitpackedFastFieldReader::open(file)?,
))
}
}
impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
fn get(&self, doc: DocId) -> Item {
match self {
Self::Bitpacked(reader) => reader.get(doc),
}
}
fn get_range(&self, start: DocId, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
}
}
fn min_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.min_value(),
}
}
fn max_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.max_value(),
}
}
}
/// Trait for accessing a fastfield.
///
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct FastFieldReader<Item: FastValue> {
pub struct BitpackedFastFieldReader<Item: FastValue> {
bytes: OwnedBytes,
bit_unpacker: BitUnpacker,
min_value_u64: u64,
@@ -27,7 +109,7 @@ pub struct FastFieldReader<Item: FastValue> {
_phantom: PhantomData<Item>,
}
impl<Item: FastValue> FastFieldReader<Item> {
impl<Item: FastValue> BitpackedFastFieldReader<Item> {
/// Opens a fast field given a file.
pub fn open(file: FileSlice) -> crate::Result<Self> {
let mut bytes = file.read_bytes()?;
@@ -36,7 +118,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(FastFieldReader {
Ok(BitpackedFastFieldReader {
bytes,
min_value_u64: min_value,
max_value_u64: max_value,
@@ -44,19 +126,6 @@ impl<Item: FastValue> FastFieldReader<Item> {
_phantom: PhantomData,
})
}
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
// `maxdoc`.
pub fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc))
}
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes))
}
@@ -78,6 +147,20 @@ impl<Item: FastValue> FastFieldReader<Item> {
*out = self.get_u64(start + (i as u64));
}
}
}
impl<Item: FastValue> FastFieldReader<Item> for BitpackedFastFieldReader<Item> {
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
// `maxdoc`.
fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc))
}
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
@@ -92,7 +175,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
pub fn get_range(&self, start: DocId, output: &mut [Item]) {
fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(u64::from(start), output);
}
@@ -101,7 +184,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
pub fn min_value(&self) -> Item {
fn min_value(&self) -> Item {
Item::from_u64(self.min_value_u64)
}
@@ -110,13 +193,13 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
pub fn max_value(&self) -> Item {
fn max_value(&self) -> Item {
Item::from_u64(self.max_value_u64)
}
}
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
impl<Item: FastValue> From<Vec<Item>> for BitpackedFastFieldReader<Item> {
fn from(vals: Vec<Item>) -> BitpackedFastFieldReader<Item> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
@@ -126,7 +209,7 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
let write: WritePtr = directory
.open_write(path)
.expect("With a RamDirectory, this should never fail.");
let mut serializer = FastFieldSerializer::from_write(write)
let mut serializer = CompositeFastFieldSerializer::from_write(write)
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
@@ -148,6 +231,6 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
let field_file = composite_file
.open_read(field)
.expect("File component not found");
FastFieldReader::open(field_file).unwrap()
BitpackedFastFieldReader::open(field_file).unwrap()
}
}

View File

@@ -1,13 +1,16 @@
use crate::common::CompositeFile;
use crate::directory::FileSlice;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};
use crate::fastfield::{BytesFastFieldReader, FastValue};
use crate::fastfield::{FastFieldNotAvailableError, FastFieldReader};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage;
use crate::TantivyError;
/// Provides access to all of the FastFieldReader.
use super::reader::DynamicFastFieldReader;
use super::FastFieldReader;
/// Provides access to all of the BitpackedFastFieldReader.
///
/// Internally, `FastFieldReaders` have preloaded fast field readers,
/// and just wraps several `HashMap`.
@@ -100,9 +103,9 @@ impl FastFieldReaders {
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<FastFieldReader<TFastValue>> {
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, 0)?;
FastFieldReader::open(fast_field_slice)
DynamicFastFieldReader::open(fast_field_slice)
}
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
@@ -111,16 +114,16 @@ impl FastFieldReaders {
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let fast_field_slice_idx = self.fast_field_data(field, 0)?;
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
let idx_reader = FastFieldReader::open(fast_field_slice_idx)?;
let vals_reader: FastFieldReader<TFastValue> =
FastFieldReader::open(fast_field_slice_vals)?;
let idx_reader = BitpackedFastFieldReader::open(fast_field_slice_idx)?;
let vals_reader: BitpackedFastFieldReader<TFastValue> =
BitpackedFastFieldReader::open(fast_field_slice_vals)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}
/// Returns the `u64` fast field reader reader associated to `field`.
///
/// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
pub fn u64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -129,14 +132,14 @@ impl FastFieldReaders {
/// field is effectively of type `u64` or not.
///
/// If not, the fastfield reader will returns the u64-value associated to the original FastValue.
pub fn u64_lenient(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
self.typed_fast_field_reader(field)
}
/// Returns the `i64` fast field reader reader associated to `field`.
///
/// If `field` is not a i64 fast field, this method returns an Error.
pub fn i64(&self, field: Field) -> crate::Result<FastFieldReader<i64>> {
pub fn i64(&self, field: Field) -> crate::Result<impl FastFieldReader<i64>> {
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -144,7 +147,7 @@ impl FastFieldReaders {
/// Returns the `i64` fast field reader reader associated to `field`.
///
/// If `field` is not a i64 fast field, this method returns an Error.
pub fn date(&self, field: Field) -> crate::Result<FastFieldReader<crate::DateTime>> {
pub fn date(&self, field: Field) -> crate::Result<impl FastFieldReader<crate::DateTime>> {
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -152,7 +155,7 @@ impl FastFieldReaders {
/// Returns the `f64` fast field reader reader associated to `field`.
///
/// If `field` is not a f64 fast field, this method returns an Error.
pub fn f64(&self, field: Field) -> crate::Result<FastFieldReader<f64>> {
pub fn f64(&self, field: Field) -> crate::Result<impl FastFieldReader<f64>> {
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -213,7 +216,7 @@ impl FastFieldReaders {
)));
}
let fast_field_idx_file = self.fast_field_data(field, 0)?;
let idx_reader = FastFieldReader::open(fast_field_idx_file)?;
let idx_reader = BitpackedFastFieldReader::open(fast_field_idx_file)?;
let data = self.fast_field_data(field, 1)?;
BytesFastFieldReader::open(idx_reader, data)
} else {

View File

@@ -7,10 +7,10 @@ use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
/// `FastFieldSerializer` is in charge of serializing
/// `CompositeFastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
/// Fast fields are encoded using bit-packing.
/// Fast fields have differnt encodings like bit-packing.
///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
@@ -27,16 +27,16 @@ use tantivy_bitpacker::BitPacker;
/// * ...
/// * `close_field()`
/// * `close()`
pub struct FastFieldSerializer {
pub struct CompositeFastFieldSerializer {
composite_write: CompositeWrite<WritePtr>,
}
impl FastFieldSerializer {
impl CompositeFastFieldSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
// just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write);
Ok(FastFieldSerializer { composite_write })
Ok(CompositeFastFieldSerializer { composite_write })
}
/// Start serializing a new u64 fast field
@@ -45,7 +45,7 @@ impl FastFieldSerializer {
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
) -> io::Result<DynamicFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
}
@@ -56,9 +56,9 @@ impl FastFieldSerializer {
min_value: u64,
max_value: u64,
idx: usize,
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
) -> io::Result<DynamicFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
FastSingleFieldSerializer::open(field_write, min_value, max_value)
DynamicFastFieldSerializer::open(field_write, min_value, max_value)
}
/// Start serializing a new [u8] fast field
@@ -79,14 +79,111 @@ impl FastFieldSerializer {
}
}
pub struct FastSingleFieldSerializer<'a, W: Write> {
#[derive(Debug, Clone)]
pub struct EstimationStats {
min_value: u64,
max_value: u64,
}
/// The FastFieldSerializer trait is the common interface
/// implemented by every fastfield serializer variant.
///
/// `DynamicFastFieldSerializer` is the enum wrapping all variants.
/// It is used to create an serializer instance.
pub trait FastFieldSerializer {
/// add value to serializer
fn add_val(&mut self, val: u64) -> io::Result<()>;
/// finish serializing a field.
fn close_field(self) -> io::Result<()>;
}
/// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldSerializerEstimate {
/// returns an estimate of the compression ratio.
fn estimate(
/*fastfield_accessor: impl FastFieldReader<u64>,*/ stats: EstimationStats,
) -> (f32, &'static str);
/// the unique name of the compressor
fn name() -> &'static str;
}
pub enum DynamicFastFieldSerializer<'a, W: Write> {
Bitpacked(BitpackedFastFieldSerializer<'a, W>),
}
impl<'a, W: Write> DynamicFastFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encode the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// compute the minimum number of bits required to encode
/// values.
pub fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<DynamicFastFieldSerializer<'a, W>> {
let stats = EstimationStats {
min_value,
max_value,
};
let (_ratio, name) = (
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(stats),
BitpackedFastFieldSerializer::<Vec<u8>>::name(),
);
Self::open_from_name(write, min_value, max_value, name)
}
/// Creates a new fast field serializer.
///
/// The serializer in fact encode the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// compute the minimum number of bits required to encode
/// values.
pub fn open_from_name(
write: &'a mut W,
min_value: u64,
max_value: u64,
name: &str,
) -> io::Result<DynamicFastFieldSerializer<'a, W>> {
// Weirdly the W generic on BitpackedFastFieldSerializer needs to be set,
// although name() doesn't use it
let variant = if name == BitpackedFastFieldSerializer::<Vec<u8>>::name() {
DynamicFastFieldSerializer::Bitpacked(BitpackedFastFieldSerializer::open(
write, min_value, max_value,
)?)
} else {
panic!("unknown fastfield serializer {}", name);
};
Ok(variant)
}
}
impl<'a, W: Write> FastFieldSerializer for DynamicFastFieldSerializer<'a, W> {
fn add_val(&mut self, val: u64) -> io::Result<()> {
match self {
Self::Bitpacked(serializer) => serializer.add_val(val),
}
}
fn close_field(self) -> io::Result<()> {
match self {
Self::Bitpacked(serializer) => serializer.close_field(),
}
}
}
pub struct BitpackedFastFieldSerializer<'a, W: Write> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
num_bits: u8,
}
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encode the values by bitpacking
@@ -99,34 +196,51 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
assert!(min_value <= max_value);
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
let num_bits = compute_num_bits(amplitude);
let bit_packer = BitPacker::new();
Ok(FastSingleFieldSerializer {
Ok(BitpackedFastFieldSerializer {
bit_packer,
write,
min_value,
num_bits,
})
}
}
impl<'a, W: 'a + Write> FastFieldSerializer for BitpackedFastFieldSerializer<'a, W> {
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
Ok(())
}
pub fn close_field(mut self) -> io::Result<()> {
fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)
}
}
impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
fn estimate(
/*_fastfield_accessor: impl FastFieldReader<u64>, */ stats: EstimationStats,
) -> (f32, &'static str) {
let amplitude = stats.max_value - stats.min_value;
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
let name = Self::name();
(ratio, name)
}
fn name() -> &'static str {
"Bitpacked"
}
}
pub struct FastBytesFieldSerializer<'a, W: Write> {
write: &'a mut W,
}

View File

@@ -1,6 +1,7 @@
use super::multivalued::MultiValuedFastFieldWriter;
use crate::common;
use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
use crate::fastfield::serializer::FastFieldSerializer;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
@@ -148,7 +149,7 @@ impl FastFieldsWriter {
/// order to the fast field serializer.
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
@@ -272,7 +273,7 @@ impl IntFastFieldWriter {
/// Push the fast fields value to the `FastFieldWriter`.
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
let (min, max) = if self.val_min > self.val_max {

View File

@@ -8,7 +8,6 @@ use crate::{
DocId, IndexSortByField, Order, TantivyError,
};
use std::cmp::Reverse;
/// Struct to provide mapping from old doc_id to new doc_id and vice versa
pub struct DocIdMapping {
new_doc_id_to_old: Vec<DocId>,
@@ -92,6 +91,7 @@ pub(crate) fn get_doc_id_mapping_from_field(
#[cfg(test)]
mod tests_indexsorting {
use crate::fastfield::FastFieldReader;
use crate::{collector::TopDocs, query::QueryParser, schema::*};
use crate::{schema::Schema, DocAddress};
use crate::{Index, IndexSettings, IndexSortByField, Order};

View File

@@ -1,6 +1,8 @@
use super::doc_id_mapping::DocIdMapping;
use crate::error::DataCorruption;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::FastFieldReader;
use crate::fastfield::FastFieldSerializer;
use crate::fastfield::MultiValuedFastFieldReader;
@@ -87,7 +89,7 @@ pub struct IndexMerger {
}
fn compute_min_max_val(
u64_reader: &FastFieldReader<u64>,
u64_reader: &impl FastFieldReader<u64>,
max_doc: DocId,
delete_bitset_opt: Option<&DeleteBitSet>,
) -> Option<(u64, u64)> {
@@ -265,7 +267,7 @@ impl IndexMerger {
fn write_fast_fields(
&self,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
@@ -315,11 +317,11 @@ impl IndexMerger {
fn write_single_fast_field(
&self,
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
let (min_value, max_value) = self.readers.iter().map(|reader|{
let u64_reader: FastFieldReader<u64> = reader
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -334,7 +336,7 @@ impl IndexMerger {
.readers
.iter()
.map(|reader| {
let u64_reader: FastFieldReader<u64> = reader
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -362,7 +364,7 @@ impl IndexMerger {
let u64_readers = self.readers.iter()
.filter(|reader|reader.max_doc() != reader.delete_bitset().map(|bit_set|bit_set.len() as u32).unwrap_or(0))
.map(|reader|{
let u64_reader: FastFieldReader<u64> = reader
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -413,7 +415,7 @@ impl IndexMerger {
pub(crate) fn get_sort_field_accessor<'b>(
reader: &SegmentReader,
sort_by_field: &'b IndexSortByField,
) -> crate::Result<FastFieldReader<u64>> {
) -> crate::Result<impl FastFieldReader<u64>> {
let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
Ok(value_accessor)
@@ -422,7 +424,12 @@ impl IndexMerger {
pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>(
&'a self,
sort_by_field: &'b IndexSortByField,
) -> crate::Result<Vec<(SegmentReaderWithOrdinal<'a>, FastFieldReader<u64>)>> {
) -> crate::Result<
Vec<(
SegmentReaderWithOrdinal<'a>,
impl FastFieldReader<u64> + Clone,
)>,
> {
let reader_and_field_accessors = self
.readers
.iter()
@@ -491,7 +498,7 @@ impl IndexMerger {
// is used to index the reader_and_field_accessors vec.
fn write_1_n_fast_field_idx_generic(
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
reader_and_field_accessors: &[(&SegmentReader, impl MultiValueLength)],
) -> crate::Result<()> {
@@ -546,7 +553,7 @@ impl IndexMerger {
fn write_multi_value_fast_field_idx(
&self,
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
let reader_and_field_accessors = self.readers.iter().map(|reader|{
@@ -568,7 +575,7 @@ impl IndexMerger {
&self,
field: Field,
term_ordinal_mappings: &TermOrdinalMapping,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
// Multifastfield consists in 2 fastfields.
@@ -631,7 +638,7 @@ impl IndexMerger {
fn write_multi_fast_field(
&self,
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
// Multifastfield consists in 2 fastfields.
@@ -718,7 +725,7 @@ impl IndexMerger {
fn write_bytes_fast_field(
&self,
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
let reader_and_field_accessors = self
@@ -1088,6 +1095,7 @@ mod tests {
use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
use crate::collector::{Count, FacetCollector};
use crate::core::Index;
use crate::fastfield::FastFieldReader;
use crate::query::AllQuery;
use crate::query::BooleanQuery;
use crate::query::Scorer;

View File

@@ -1,5 +1,6 @@
#[cfg(test)]
mod tests {
use crate::fastfield::FastFieldReader;
use crate::{
collector::TopDocs,
schema::{Cardinality, TextFieldIndexing},

View File

@@ -1,6 +1,6 @@
use crate::core::Segment;
use crate::core::SegmentComponent;
use crate::fastfield::FastFieldSerializer;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fieldnorm::FieldNormsSerializer;
use crate::postings::InvertedIndexSerializer;
use crate::store::StoreWriter;
@@ -10,7 +10,7 @@ use crate::store::StoreWriter;
pub struct SegmentSerializer {
segment: Segment,
pub(crate) store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer,
fast_field_serializer: CompositeFastFieldSerializer,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
@@ -33,7 +33,7 @@ impl SegmentSerializer {
let store_write = segment.open_write(store_component)?;
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
let fast_field_serializer = FastFieldSerializer::from_write(fast_field_write)?;
let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?;
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
@@ -68,7 +68,7 @@ impl SegmentSerializer {
}
/// Accessor to the `FastFieldSerializer`.
pub fn get_fast_field_serializer(&mut self) -> &mut FastFieldSerializer {
pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer {
&mut self.fast_field_serializer
}

View File

@@ -291,6 +291,7 @@ mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::FastFieldReader;
use crate::query::BooleanQuery;
use crate::schema::*;
use crate::DocAddress;