mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 04:52:55 +00:00
Compare commits
4 Commits
experiment
...
columnar
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8828b6d310 | ||
|
|
2b89bf9050 | ||
|
|
3580198447 | ||
|
|
d96a716d20 |
@@ -60,9 +60,9 @@ sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optiona
|
||||
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
|
||||
tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
|
||||
common = { version= "0.4", path = "./common/", package = "tantivy-common" }
|
||||
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
|
||||
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
|
||||
ownedbytes = { version= "0.4", path="./ownedbytes" }
|
||||
ownedbytes = { version= "0.5", path="./ownedbytes" }
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.3.9"
|
||||
@@ -108,7 +108,7 @@ unstable = [] # useful for benches.
|
||||
quickwit = ["sstable"]
|
||||
|
||||
[workspace]
|
||||
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable"]
|
||||
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "columnar"]
|
||||
|
||||
# Following the "fail" crate best practises, we isolate
|
||||
# tests that define specific behavior in fail check points
|
||||
|
||||
@@ -101,7 +101,7 @@ impl BitUnpacker {
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift);
|
||||
let val_shifted: u64 = val_unshifted_unmasked >> bit_shift;
|
||||
val_shifted & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
26
columnar/Cargo.toml
Normal file
26
columnar/Cargo.toml
Normal file
@@ -0,0 +1,26 @@
|
||||
[package]
|
||||
name = "tantivy-columnar"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
stacker = { path = "../stacker", package="tantivy-stacker"}
|
||||
serde_json = "1"
|
||||
thiserror = "1"
|
||||
fnv = "1"
|
||||
tantivy-fst = "0.4.0"
|
||||
sstable = { path = "../sstable", package = "tantivy-sstable" }
|
||||
common = { path = "../common", package = "tantivy-common" }
|
||||
fastfield_codecs = { path = "../fastfield_codecs"}
|
||||
ordered-float = "3.4"
|
||||
itertools = "0.10"
|
||||
|
||||
[features]
|
||||
# default = ["quickwit"]
|
||||
# quickwit = ["common/quickwit"]
|
||||
|
||||
|
||||
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1"
|
||||
33
columnar/README.md
Normal file
33
columnar/README.md
Normal file
@@ -0,0 +1,33 @@
|
||||
# Columnar format
|
||||
|
||||
This crate describes the columnar format used in tantivy.
|
||||
|
||||
|
||||
## Goals
|
||||
|
||||
This format is special in the following way.
|
||||
- it needs to be compact
|
||||
- it is not required to be loaded in memory.
|
||||
- it is designed to fit well with quickwit's strange constraint:
|
||||
we need to be able to load columns rapidly.
|
||||
- columns of several types can be associated with the same column name.
|
||||
- it needs to support columns with different types `(str, u64, i64, f64)`
|
||||
and different cardinality `(required, optional, multivalued)`.
|
||||
- columns, once loaded, offer cheap random access.
|
||||
|
||||
# Format
|
||||
|
||||
A quickwit/tantivy style sstable associating
|
||||
`(column_name, column_cardinality, column_type)` to a range of bytes.
|
||||
|
||||
The format of the key is:
|
||||
`[column_name][ZERO_BYTE][column_type_header: u8]`
|
||||
|
||||
Column name may not contain the zero byte.
|
||||
|
||||
Listing all columns associated to `column_name` can therefore
|
||||
be done by listing all keys prefixed by
|
||||
`[column_name][ZERO_BYTE]`
|
||||
|
||||
The associated range of bytes refers to a range of bytes
|
||||
|
||||
154
columnar/src/column_type_header.rs
Normal file
154
columnar/src/column_type_header.rs
Normal file
@@ -0,0 +1,154 @@
|
||||
use crate::value::NumericalType;
|
||||
|
||||
/// Cardinality of a column: required, optional or multivalued.
///
/// Each variant has a fixed one-byte code (its discriminant); `Required`
/// is the default.
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
#[repr(u8)]
pub enum Cardinality {
    #[default]
    Required = 0,
    Optional = 1,
    Multivalued = 2,
}

impl Cardinality {
    /// Returns the one-byte code of this cardinality.
    pub fn to_code(self) -> u8 {
        match self {
            Cardinality::Required => 0u8,
            Cardinality::Optional => 1u8,
            Cardinality::Multivalued => 2u8,
        }
    }

    /// Decodes a cardinality from its code, or returns `None` if the code
    /// is not associated with any variant.
    pub fn try_from_code(code: u8) -> Option<Cardinality> {
        const ALL: [Cardinality; 3] = [
            Cardinality::Required,
            Cardinality::Optional,
            Cardinality::Multivalued,
        ];
        ALL.iter()
            .copied()
            .find(|cardinality| cardinality.to_code() == code)
    }
}
|
||||
|
||||
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
pub enum ColumnType {
|
||||
Bytes,
|
||||
Numerical(NumericalType),
|
||||
}
|
||||
|
||||
impl ColumnType {
|
||||
pub fn to_code(self) -> u8 {
|
||||
match self {
|
||||
ColumnType::Bytes => 0u8,
|
||||
ColumnType::Numerical(numerical_type) => 1u8 | (numerical_type.to_code() << 1),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_from_code(code: u8) -> Option<ColumnType> {
|
||||
if code == 0u8 {
|
||||
return Some(ColumnType::Bytes);
|
||||
}
|
||||
if code & 1u8 == 0u8 {
|
||||
return None;
|
||||
}
|
||||
let numerical_type = NumericalType::try_from_code(code >> 1)?;
|
||||
Some(ColumnType::Numerical(numerical_type))
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents the type and cardinality of a column.
|
||||
/// This is encoded over one-byte and added to a column key in the
|
||||
/// columnar sstable.
|
||||
///
|
||||
/// Cardinality is encoded as the first two highest two bits.
|
||||
/// The low 6 bits encode the column type.
|
||||
#[derive(Eq, Hash, PartialEq, Debug, Copy, Clone)]
|
||||
pub struct ColumnTypeAndCardinality {
|
||||
pub cardinality: Cardinality,
|
||||
pub typ: ColumnType,
|
||||
}
|
||||
|
||||
/// Returns a mask with the `num_bits` lowest bits set.
#[inline]
const fn compute_mask(num_bits: u8) -> u8 {
    match num_bits {
        // `1u8 << 8` would overflow, so the full-width mask is special-cased.
        8 => u8::MAX,
        partial => (1u8 << partial) - 1,
    }
}
|
||||
|
||||
#[inline]
|
||||
fn select_bits<const START: u8, const END: u8>(code: u8) -> u8 {
|
||||
assert!(START <= END);
|
||||
assert!(END <= 8);
|
||||
let num_bits: u8 = END - START;
|
||||
let mask: u8 = compute_mask(num_bits);
|
||||
(code >> START) & mask
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn place_bits<const START: u8, const END: u8>(code: u8) -> u8 {
|
||||
assert!(START <= END);
|
||||
assert!(END <= 8);
|
||||
let num_bits: u8 = END - START;
|
||||
let mask: u8 = compute_mask(num_bits);
|
||||
assert!(code <= mask);
|
||||
code << START
|
||||
}
|
||||
|
||||
impl ColumnTypeAndCardinality {
|
||||
pub fn to_code(self) -> u8 {
|
||||
place_bits::<6, 8>(self.cardinality.to_code()) | place_bits::<0, 6>(self.typ.to_code())
|
||||
}
|
||||
|
||||
pub fn try_from_code(code: u8) -> Option<ColumnTypeAndCardinality> {
|
||||
let typ_code = select_bits::<0, 6>(code);
|
||||
let cardinality_code = select_bits::<6, 8>(code);
|
||||
let cardinality = Cardinality::try_from_code(cardinality_code)?;
|
||||
let typ = ColumnType::try_from_code(typ_code)?;
|
||||
assert_eq!(typ.to_code(), typ_code);
|
||||
Some(ColumnTypeAndCardinality { cardinality, typ })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::ColumnTypeAndCardinality;
    use crate::column_type_header::{Cardinality, ColumnType};

    /// Every byte that decodes must re-encode to itself, and decoded headers
    /// must all be distinct.
    #[test]
    fn test_column_type_header_to_code() {
        let mut seen_headers: HashSet<ColumnTypeAndCardinality> = HashSet::new();
        let decodable = (u8::MIN..=u8::MAX).filter_map(|code| {
            ColumnTypeAndCardinality::try_from_code(code).map(|header| (code, header))
        });
        for (code, header) in decodable {
            assert_eq!(header.to_code(), code);
            assert!(seen_headers.insert(header));
        }
        assert_eq!(
            seen_headers.len(),
            3 /* cardinality */ * (1 + 3) // column_types
        );
    }

    /// Same round-trip property for the column type alone.
    #[test]
    fn test_column_type_to_code() {
        let mut seen_types: HashSet<ColumnType> = HashSet::new();
        let decodable = (u8::MIN..=u8::MAX)
            .filter_map(|code| ColumnType::try_from_code(code).map(|typ| (code, typ)));
        for (code, typ) in decodable {
            assert_eq!(typ.to_code(), code);
            assert!(seen_types.insert(typ));
        }
        assert_eq!(seen_types.len(), 1 + 3);
    }

    /// Same round-trip property for the cardinality alone.
    #[test]
    fn test_cardinality_to_code() {
        let mut num_cardinality = 0;
        for code in u8::MIN..=u8::MAX {
            if let Some(cardinality) = Cardinality::try_from_code(code) {
                assert_eq!(cardinality.to_code(), code);
                num_cardinality += 1;
            }
        }
        assert_eq!(num_cardinality, 3);
    }
}
|
||||
78
columnar/src/dictionary.rs
Normal file
78
columnar/src/dictionary.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
use std::io;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
|
||||
fn fst_err_into_io_err(fst_err: tantivy_fst::Error) -> io::Error {
|
||||
match fst_err {
|
||||
tantivy_fst::Error::Fst(fst_err) => {
|
||||
io::Error::new(io::ErrorKind::Other, format!("FST Error: {:?}", fst_err))
|
||||
}
|
||||
tantivy_fst::Error::Io(io_err) => io_err,
|
||||
}
|
||||
}
|
||||
|
||||
/// `DictionaryBuilder` for dictionary encoding.
|
||||
///
|
||||
/// It stores the different terms encounterred and assigns them a temporary value
|
||||
/// we call unordered id.
|
||||
///
|
||||
/// Upon serialization, we will sort the ids and hence build a `UnorderedId -> Term ordinal`
|
||||
/// mapping.
|
||||
#[derive(Default)]
|
||||
pub struct DictionaryBuilder {
|
||||
dict: FnvHashMap<Vec<u8>, UnorderedId>,
|
||||
}
|
||||
|
||||
pub struct IdMapping {
|
||||
unordered_to_ord: Vec<OrderedId>,
|
||||
}
|
||||
|
||||
impl IdMapping {
|
||||
pub fn to_ord(&self, unordered: UnorderedId) -> OrderedId {
|
||||
self.unordered_to_ord[unordered.0 as usize]
|
||||
}
|
||||
}
|
||||
|
||||
impl DictionaryBuilder {
|
||||
/// Get or allocate an unordered id.
|
||||
/// (This ID is simply an auto-incremented id.)
|
||||
pub fn get_or_allocate_id(&mut self, term: &[u8]) -> UnorderedId {
|
||||
if let Some(term_id) = self.dict.get(term) {
|
||||
return *term_id;
|
||||
}
|
||||
let new_id = UnorderedId(self.dict.len() as u32);
|
||||
self.dict.insert(term.to_vec(), new_id);
|
||||
new_id
|
||||
}
|
||||
|
||||
/// Serialize the dictionary into an fst, and returns the
|
||||
/// `UnorderedId -> TermOrdinal` map.
|
||||
pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<IdMapping> {
|
||||
serialize_inner(&self.dict, wrt).map_err(fst_err_into_io_err)
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function just there for error conversion.
|
||||
fn serialize_inner<'a, W: io::Write + 'a>(
|
||||
dict: &FnvHashMap<Vec<u8>, UnorderedId>,
|
||||
wrt: &mut W,
|
||||
) -> tantivy_fst::Result<IdMapping> {
|
||||
let mut terms: Vec<(&[u8], UnorderedId)> =
|
||||
dict.iter().map(|(k, v)| (k.as_slice(), *v)).collect();
|
||||
terms.sort_unstable_by_key(|(key, _)| *key);
|
||||
let mut unordered_to_ord: Vec<OrderedId> = vec![OrderedId(0u32); terms.len()];
|
||||
let mut fst_builder = tantivy_fst::MapBuilder::new(wrt)?;
|
||||
for (ord, (key, unordered_id)) in terms.into_iter().enumerate() {
|
||||
let ordered_id = OrderedId(ord as u32);
|
||||
fst_builder.insert(key, ord as u64)?;
|
||||
unordered_to_ord[unordered_id.0 as usize] = ordered_id;
|
||||
}
|
||||
fst_builder.finish()?;
|
||||
Ok(IdMapping { unordered_to_ord })
|
||||
}
|
||||
|
||||
/// Temporary id given to a term, in order of first appearance.
#[derive(Clone, Copy, Debug)]
pub struct UnorderedId(pub u32);
|
||||
|
||||
/// Ordinal of a term once the terms have been sorted.
#[derive(Clone, Copy)]
pub struct OrderedId(pub u32);
|
||||
69
columnar/src/lib.rs
Normal file
69
columnar/src/lib.rs
Normal file
@@ -0,0 +1,69 @@
|
||||
// Copyright (C) 2022 Quickwit, Inc.
|
||||
//
|
||||
// Quickwit is offered under the AGPL v3.0 and as commercial software.
|
||||
// For commercial licensing, contact us at hello@quickwit.io.
|
||||
//
|
||||
// AGPL:
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
mod column_type_header;
|
||||
mod dictionary;
|
||||
mod reader;
|
||||
mod serializer;
|
||||
mod value;
|
||||
mod writer;
|
||||
|
||||
pub use column_type_header::Cardinality;
|
||||
pub use reader::ColumnarReader;
|
||||
pub use serializer::ColumnarSerializer;
|
||||
pub use writer::ColumnarWriter;
|
||||
|
||||
pub type DocId = u32;
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::ops::Range;

    use common::file_slice::FileSlice;

    use crate::column_type_header::ColumnTypeAndCardinality;
    use crate::reader::ColumnarReader;
    use crate::serializer::ColumnarSerializer;
    use crate::value::NumericalValue;
    use crate::ColumnarWriter;

    /// Writes a single numerical column, serializes it, and reads the
    /// column listing back.
    #[test]
    fn test_dataframe_writer() {
        let mut writer = ColumnarWriter::default();
        writer.record_numerical(1u32, b"srical.value", NumericalValue::U64(1u64));
        writer.record_numerical(2u32, b"srical.value", NumericalValue::U64(2u64));
        writer.record_numerical(4u32, b"srical.value", NumericalValue::I64(2i64));

        let mut output: Vec<u8> = Vec::new();
        writer
            .serialize(5, ColumnarSerializer::new(&mut output))
            .unwrap();

        let reader = ColumnarReader::open(FileSlice::from(output)).unwrap();
        assert_eq!(reader.num_columns(), 1);

        let columns: Vec<(ColumnTypeAndCardinality, Range<u64>)> =
            reader.read_columns("srical.value").unwrap();
        assert_eq!(columns.len(), 1);
        // Right now these 31 bytes are spent as follows
        //
        // - header 14 bytes
        // - vals 8 //< due to padding? could have been 1 byte?
        // - null footer 6 bytes
        // - version footer 3 bytes // Should be file-wide
        let (_, byte_range) = &columns[0];
        assert_eq!(*byte_range, 0..31);
    }
}
|
||||
66
columnar/src/reader/mod.rs
Normal file
66
columnar/src/reader/mod.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
use std::ops::Range;
|
||||
use std::{io, mem};
|
||||
|
||||
use common::file_slice::FileSlice;
|
||||
use common::BinarySerializable;
|
||||
use sstable::{Dictionary, SSTableRange};
|
||||
|
||||
use crate::column_type_header::ColumnTypeAndCardinality;
|
||||
|
||||
/// Builds an `io::Error` of kind `InvalidData` carrying `msg`.
fn io_invalid_data(msg: String) -> io::Error {
    let kind = io::ErrorKind::InvalidData;
    io::Error::new(kind, msg)
}
|
||||
pub struct ColumnarReader {
|
||||
column_dictionary: Dictionary<SSTableRange>,
|
||||
column_data: FileSlice,
|
||||
}
|
||||
|
||||
impl ColumnarReader {
|
||||
pub fn num_columns(&self) -> usize {
|
||||
self.column_dictionary.num_terms()
|
||||
}
|
||||
|
||||
pub fn open(file_slice: FileSlice) -> io::Result<ColumnarReader> {
|
||||
let (file_slice_without_sstable_len, sstable_len_bytes) =
|
||||
file_slice.split_from_end(mem::size_of::<u64>());
|
||||
let mut sstable_len_bytes = sstable_len_bytes.read_bytes()?;
|
||||
let sstable_len = u64::deserialize(&mut sstable_len_bytes)?;
|
||||
let (column_data, sstable) =
|
||||
file_slice_without_sstable_len.split_from_end(sstable_len as usize);
|
||||
let column_dictionary = Dictionary::open(sstable)?;
|
||||
Ok(ColumnarReader {
|
||||
column_dictionary,
|
||||
column_data,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn read_columns(
|
||||
&self,
|
||||
field_name: &str,
|
||||
) -> io::Result<Vec<(ColumnTypeAndCardinality, Range<u64>)>> {
|
||||
let mut start_key = field_name.to_string();
|
||||
start_key.push('\0');
|
||||
let mut end_key = field_name.to_string();
|
||||
end_key.push(1u8 as char);
|
||||
let mut stream = self
|
||||
.column_dictionary
|
||||
.range()
|
||||
.ge(start_key.as_bytes())
|
||||
.lt(end_key.as_bytes())
|
||||
.into_stream()?;
|
||||
let mut results = Vec::new();
|
||||
while stream.advance() {
|
||||
let key_bytes: &[u8] = stream.key();
|
||||
if !key_bytes.starts_with(start_key.as_bytes()) {
|
||||
return Err(io_invalid_data(format!("Invalid key found. {key_bytes:?}")));
|
||||
}
|
||||
let column_code: u8 = key_bytes.last().cloned().unwrap();
|
||||
let column_type_and_cardinality = ColumnTypeAndCardinality::try_from_code(column_code)
|
||||
.ok_or_else(|| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
|
||||
let range = stream.value().clone();
|
||||
results.push((column_type_and_cardinality, range));
|
||||
}
|
||||
Ok(results)
|
||||
}
|
||||
}
|
||||
39
columnar/src/serializer.rs
Normal file
39
columnar/src/serializer.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::ops::Range;
|
||||
|
||||
use common::CountingWriter;
|
||||
use sstable::value::RangeWriter;
|
||||
use sstable::SSTableRange;
|
||||
|
||||
pub struct ColumnarSerializer<W: io::Write> {
|
||||
wrt: CountingWriter<W>,
|
||||
sstable_range: sstable::Writer<Vec<u8>, RangeWriter>,
|
||||
}
|
||||
|
||||
impl<W: io::Write> ColumnarSerializer<W> {
|
||||
pub fn new(wrt: W) -> ColumnarSerializer<W> {
|
||||
let sstable_range: sstable::Writer<Vec<u8>, RangeWriter> =
|
||||
sstable::Dictionary::<SSTableRange>::builder(Vec::with_capacity(100_000)).unwrap();
|
||||
ColumnarSerializer {
|
||||
wrt: CountingWriter::wrap(wrt),
|
||||
sstable_range,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record_column_offsets(&mut self, key: &[u8], byte_range: Range<u64>) -> io::Result<()> {
|
||||
self.sstable_range.insert(key, &byte_range)
|
||||
}
|
||||
|
||||
pub fn wrt(&mut self) -> &mut CountingWriter<W> {
|
||||
&mut self.wrt
|
||||
}
|
||||
|
||||
pub fn finalize(mut self) -> io::Result<()> {
|
||||
let sstable_bytes: Vec<u8> = self.sstable_range.finish()?;
|
||||
let sstable_num_bytes: u64 = sstable_bytes.len() as u64;
|
||||
self.wrt.write_all(&sstable_bytes)?;
|
||||
self.wrt.write_all(&sstable_num_bytes.to_le_bytes()[..])?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
123
columnar/src/value.rs
Normal file
123
columnar/src/value.rs
Normal file
@@ -0,0 +1,123 @@
|
||||
use ordered_float::NotNan;
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq)]
|
||||
pub enum NumericalValue {
|
||||
I64(i64),
|
||||
U64(u64),
|
||||
F64(NotNan<f64>),
|
||||
}
|
||||
|
||||
impl From<u64> for NumericalValue {
|
||||
fn from(val: u64) -> NumericalValue {
|
||||
NumericalValue::U64(val)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps an `i64` into `NumericalValue::I64`.
impl From<i64> for NumericalValue {
    fn from(val: i64) -> NumericalValue {
        Self::I64(val)
    }
}
|
||||
|
||||
/// Wraps a non-NaN float into `NumericalValue::F64`.
impl From<NotNan<f64>> for NumericalValue {
    fn from(val: NotNan<f64>) -> NumericalValue {
        Self::F64(val)
    }
}
|
||||
|
||||
impl NumericalValue {
|
||||
pub fn numerical_type(&self) -> NumericalType {
|
||||
match self {
|
||||
NumericalValue::F64(_) => NumericalType::F64,
|
||||
NumericalValue::I64(_) => NumericalType::I64,
|
||||
NumericalValue::U64(_) => NumericalType::U64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// `PartialEq` is derived on the enum; this marker impl additionally declares
// that equality on `NumericalValue` is a full equivalence relation.
impl Eq for NumericalValue {}
|
||||
|
||||
/// The type of a numerical value, without the value itself.
///
/// Each variant has a fixed one-byte code (its discriminant); `I64` is
/// the default.
#[derive(Clone, Copy, Debug, Default, Hash, Eq, PartialEq)]
#[repr(u8)]
pub enum NumericalType {
    #[default]
    I64 = 0,
    U64 = 1,
    F64 = 2,
}

impl NumericalType {
    /// Returns the one-byte code of this numerical type.
    pub fn to_code(self) -> u8 {
        match self {
            NumericalType::I64 => 0u8,
            NumericalType::U64 => 1u8,
            NumericalType::F64 => 2u8,
        }
    }

    /// Decodes a numerical type from its code, or returns `None` if the
    /// code is not associated with any variant.
    pub fn try_from_code(code: u8) -> Option<NumericalType> {
        const ALL: [NumericalType; 3] =
            [NumericalType::I64, NumericalType::U64, NumericalType::F64];
        ALL.iter()
            .copied()
            .find(|numerical_type| numerical_type.to_code() == code)
    }
}
|
||||
|
||||
/// We voluntarily avoid using `Into` here to keep this
|
||||
/// implementation quirk as private as possible.
|
||||
///
|
||||
/// This coercion trait actually panics if it is used
|
||||
/// to convert a loose types to a stricter type.
|
||||
///
|
||||
/// The level is strictness is somewhat arbitrary.
|
||||
/// - i64
|
||||
/// - u64
|
||||
/// - f64.
|
||||
pub(crate) trait Coerce {
|
||||
fn coerce(numerical_value: NumericalValue) -> Self;
|
||||
}
|
||||
|
||||
impl Coerce for i64 {
|
||||
fn coerce(value: NumericalValue) -> Self {
|
||||
match value {
|
||||
NumericalValue::I64(val) => val,
|
||||
NumericalValue::U64(val) => val as i64,
|
||||
NumericalValue::F64(_) => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Coerce for u64 {
|
||||
fn coerce(value: NumericalValue) -> Self {
|
||||
match value {
|
||||
NumericalValue::I64(val) => val as u64,
|
||||
NumericalValue::U64(val) => val,
|
||||
NumericalValue::F64(_) => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Coerce for NotNan<f64> {
|
||||
fn coerce(value: NumericalValue) -> Self {
|
||||
match value {
|
||||
NumericalValue::I64(val) => unsafe { NotNan::new_unchecked(val as f64) },
|
||||
NumericalValue::U64(val) => unsafe { NotNan::new_unchecked(val as f64) },
|
||||
NumericalValue::F64(val) => val,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::NumericalType;

    /// Every byte that decodes must re-encode to itself, and exactly three
    /// codes are valid.
    #[test]
    fn test_numerical_type_code() {
        let valid_codes: Vec<u8> = (u8::MIN..=u8::MAX)
            .filter(|&code| match NumericalType::try_from_code(code) {
                Some(numerical_type) => {
                    assert_eq!(numerical_type.to_code(), code);
                    true
                }
                None => false,
            })
            .collect();
        assert_eq!(valid_codes.len(), 3);
    }
}
|
||||
321
columnar/src/writer/column_operation.rs
Normal file
321
columnar/src/writer/column_operation.rs
Normal file
@@ -0,0 +1,321 @@
|
||||
use std::fmt;
|
||||
use std::num::NonZeroU8;
|
||||
|
||||
use ordered_float::NotNan;
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::dictionary::UnorderedId;
|
||||
use crate::value::NumericalValue;
|
||||
use crate::DocId;
|
||||
|
||||
/// When we build a columnar dataframe, we first just group
|
||||
/// all mutations per column, and append them in append-only object.
|
||||
///
|
||||
/// We represents all of these operations as `ColumnOperation`.
|
||||
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
|
||||
pub(crate) enum ColumnOperation<T> {
|
||||
NewDoc(DocId),
|
||||
Value(T),
|
||||
}
|
||||
|
||||
/// Any value converts into a `ColumnOperation::Value` wrapping it.
impl<T> From<T> for ColumnOperation<T> {
    fn from(value: T) -> ColumnOperation<T> {
        Self::Value(value)
    }
}
|
||||
|
||||
#[allow(clippy::from_over_into)]
|
||||
pub(crate) trait SymbolValue: Into<MiniBuffer> + Clone + Copy + fmt::Debug {
|
||||
fn deserialize(header: NonZeroU8, bytes: &mut &[u8]) -> Result<Self, ParseError>;
|
||||
}
|
||||
|
||||
/// Small fixed-capacity buffer holding one serialized operation.
pub(crate) struct MiniBuffer {
    /// Backing storage; only the first `len` bytes are meaningful.
    pub bytes: [u8; 9],
    /// Number of meaningful bytes in `bytes`.
    pub len: usize,
}

impl MiniBuffer {
    /// Returns the first `len` bytes of the buffer.
    ///
    /// Panics if `len` exceeds the capacity of the buffer.
    pub fn as_slice(&self) -> &[u8] {
        let (used, _unused) = self.bytes.split_at(self.len);
        used
    }
}
|
||||
|
||||
fn compute_header_byte(typ: SymbolType, len: usize) -> u8 {
|
||||
assert!(len <= 9);
|
||||
(len << 4) as u8 | typ as u8
|
||||
}
|
||||
|
||||
impl SymbolValue for NumericalValue {
|
||||
fn deserialize(header_byte: NonZeroU8, bytes: &mut &[u8]) -> Result<Self, ParseError> {
|
||||
let (typ, len) = parse_header_byte(header_byte)?;
|
||||
let value_bytes: &[u8];
|
||||
(value_bytes, *bytes) = bytes.split_at(len);
|
||||
let symbol: NumericalValue = match typ {
|
||||
SymbolType::U64 => {
|
||||
let mut octet: [u8; 8] = [0u8; 8];
|
||||
octet[..value_bytes.len()].copy_from_slice(value_bytes);
|
||||
let val: u64 = u64::from_le_bytes(octet);
|
||||
NumericalValue::U64(val)
|
||||
}
|
||||
SymbolType::I64 => {
|
||||
let mut octet: [u8; 8] = [0u8; 8];
|
||||
octet[..value_bytes.len()].copy_from_slice(value_bytes);
|
||||
let encoded: u64 = u64::from_le_bytes(octet);
|
||||
let val: i64 = decode_zig_zag(encoded);
|
||||
NumericalValue::I64(val)
|
||||
}
|
||||
SymbolType::Float => {
|
||||
let octet: [u8; 8] =
|
||||
value_bytes.try_into().map_err(|_| ParseError::InvalidLen {
|
||||
typ: SymbolType::Float,
|
||||
len,
|
||||
})?;
|
||||
let val_possibly_nan = f64::from_le_bytes(octet);
|
||||
let val_not_nan = NotNan::new(val_possibly_nan)
|
||||
.map_err(|_| ParseError::NaN)?;
|
||||
NumericalValue::F64(val_not_nan)
|
||||
}
|
||||
};
|
||||
Ok(symbol)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::from_over_into)]
|
||||
impl Into<MiniBuffer> for NumericalValue {
|
||||
fn into(self) -> MiniBuffer {
|
||||
let mut bytes = [0u8; 9];
|
||||
match self {
|
||||
NumericalValue::F64(val) => {
|
||||
let len = 8;
|
||||
let header_byte = compute_header_byte(SymbolType::Float, len);
|
||||
bytes[0] = header_byte;
|
||||
bytes[1..].copy_from_slice(&val.to_le_bytes());
|
||||
MiniBuffer {
|
||||
bytes,
|
||||
len: len + 1,
|
||||
}
|
||||
}
|
||||
NumericalValue::U64(val) => {
|
||||
let len = compute_num_bytes_for_u64(val);
|
||||
let header_byte = compute_header_byte(SymbolType::U64, len);
|
||||
bytes[0] = header_byte;
|
||||
bytes[1..].copy_from_slice(&val.to_le_bytes());
|
||||
MiniBuffer {
|
||||
bytes,
|
||||
len: len + 1,
|
||||
}
|
||||
}
|
||||
NumericalValue::I64(val) => {
|
||||
let encoded = encode_zig_zag(val);
|
||||
let len = compute_num_bytes_for_u64(encoded);
|
||||
let header_byte = compute_header_byte(SymbolType::I64, len);
|
||||
bytes[0] = header_byte;
|
||||
bytes[1..].copy_from_slice(&encoded.to_le_bytes());
|
||||
MiniBuffer {
|
||||
bytes,
|
||||
len: len + 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::from_over_into)]
|
||||
impl Into<MiniBuffer> for UnorderedId {
|
||||
fn into(self) -> MiniBuffer {
|
||||
let mut bytes = [0u8; 9];
|
||||
let val = self.0 as u64;
|
||||
let len = compute_num_bytes_for_u64(val) + 1;
|
||||
bytes[0] = len as u8;
|
||||
bytes[1..].copy_from_slice(&val.to_le_bytes());
|
||||
MiniBuffer { bytes, len }
|
||||
}
|
||||
}
|
||||
|
||||
impl SymbolValue for UnorderedId {
|
||||
fn deserialize(header: NonZeroU8, bytes: &mut &[u8]) -> Result<UnorderedId, ParseError> {
|
||||
let len = header.get() as usize;
|
||||
let symbol_bytes: &[u8];
|
||||
(symbol_bytes, *bytes) = bytes.split_at(len);
|
||||
let mut value_bytes = [0u8; 4];
|
||||
value_bytes[..len - 1].copy_from_slice(&symbol_bytes[1..]);
|
||||
let value = u32::from_le_bytes(value_bytes);
|
||||
Ok(UnorderedId(value))
|
||||
}
|
||||
}
|
||||
|
||||
// Mask selecting the four low bits of a byte.
const HEADER_MASK: u8 = (1u8 << 4) - 1u8;
|
||||
|
||||
/// Returns the number of bytes needed to hold `val` once its high zero
/// bytes are dropped. Zero needs no byte at all.
fn compute_num_bytes_for_u64(val: u64) -> usize {
    let num_significant_bits: u32 = u64::BITS - val.leading_zeros();
    // Round up to a whole number of bytes.
    ((num_significant_bits + 7) / 8) as usize
}
|
||||
|
||||
fn parse_header_byte(byte: NonZeroU8) -> Result<(SymbolType, usize), ParseError> {
|
||||
let len = (byte.get() as usize) >> 4;
|
||||
let typ_code = byte.get() & HEADER_MASK;
|
||||
let typ = SymbolType::try_from(typ_code)?;
|
||||
Ok((typ, len))
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum ParseError {
|
||||
#[error("Type byte unknown `{0}`")]
|
||||
UnknownType(u8),
|
||||
#[error("Invalid len for type `{len}` for type `{typ:?}`.")]
|
||||
InvalidLen { typ: SymbolType, len: usize },
|
||||
#[error("Missing bytes.")]
|
||||
MissingBytes,
|
||||
#[error("Not a number value.")]
|
||||
NaN,
|
||||
}
|
||||
|
||||
impl<V: SymbolValue> ColumnOperation<V> {
|
||||
pub fn serialize(self) -> MiniBuffer {
|
||||
match self {
|
||||
ColumnOperation::NewDoc(doc) => {
|
||||
let mut minibuf: [u8; 9] = [0u8; 9];
|
||||
minibuf[0] = 0u8;
|
||||
minibuf[1..5].copy_from_slice(&doc.to_le_bytes());
|
||||
MiniBuffer {
|
||||
bytes: minibuf,
|
||||
len: 5,
|
||||
}
|
||||
}
|
||||
ColumnOperation::Value(val) => val.into(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn deserialize(bytes: &mut &[u8]) -> Result<Self, ParseError> {
|
||||
if bytes.is_empty() {
|
||||
return Err(ParseError::MissingBytes);
|
||||
}
|
||||
let header_byte = bytes[0];
|
||||
*bytes = &bytes[1..];
|
||||
if let Some(header_byte) = NonZeroU8::new(header_byte) {
|
||||
let value = V::deserialize(header_byte, bytes)?;
|
||||
Ok(ColumnOperation::Value(value))
|
||||
} else {
|
||||
let doc_bytes: &[u8];
|
||||
(doc_bytes, *bytes) = bytes.split_at(4);
|
||||
let doc: u32 =
|
||||
u32::from_le_bytes(doc_bytes.try_into().map_err(|_| ParseError::MissingBytes)?);
|
||||
Ok(ColumnOperation::NewDoc(doc))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Type code of a serialized numerical value.
///
/// No variant uses the code `0`.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[repr(u8)]
pub enum SymbolType {
    U64 = 1u8,
    I64 = 2u8,
    Float = 3u8,
}
|
||||
|
||||
impl TryFrom<u8> for SymbolType {
|
||||
type Error = ParseError;
|
||||
|
||||
fn try_from(byte: u8) -> Result<Self, ParseError> {
|
||||
match byte {
|
||||
1u8 => Ok(SymbolType::U64),
|
||||
2u8 => Ok(SymbolType::I64),
|
||||
3u8 => Ok(SymbolType::Float),
|
||||
_ => Err(ParseError::UnknownType(byte)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Zig-zag encodes `n`: values of small magnitude, positive or negative,
/// map to small unsigned integers (0 -> 0, -1 -> 1, 1 -> 2, ...).
fn encode_zig_zag(n: i64) -> u64 {
    // Arithmetic shift: all ones if `n` is negative, all zeros otherwise.
    let sign_mask: i64 = n >> 63;
    let doubled: i64 = n << 1;
    (doubled ^ sign_mask) as u64
}
|
||||
|
||||
/// Reverses `encode_zig_zag`.
fn decode_zig_zag(n: u64) -> i64 {
    let magnitude = (n >> 1) as i64;
    // The lowest bit carries the sign: when set, every bit is flipped.
    if n & 1 == 0 {
        magnitude
    } else {
        !magnitude
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use proptest::prelude::any;
    use proptest::proptest;

    use super::{SymbolType, *};

    /// Checks that `val` round-trips through zig-zag encoding, and that the
    /// encoded value is at most twice the absolute value of `val`.
    #[track_caller]
    fn test_zig_zag_aux(val: i64) {
        let encoded = super::encode_zig_zag(val);
        assert_eq!(decode_zig_zag(encoded), val);
        // `checked_abs` is `None` when the absolute value overflows.
        if let Some(abs_val) = val.checked_abs() {
            assert!(encoded <= (abs_val as u64) * 2);
        }
    }

    #[test]
    fn test_zig_zag() {
        assert_eq!(encode_zig_zag(0i64), 0u64);
        assert_eq!(encode_zig_zag(-1i64), 1u64);
        assert_eq!(encode_zig_zag(1i64), 2u64);
        test_zig_zag_aux(0i64);
        test_zig_zag_aux(i64::MIN);
        test_zig_zag_aux(i64::MAX);
    }

    proptest! {
        #[test]
        fn test_proptest_zig_zag(val in any::<i64>()) {
            test_zig_zag_aux(val);
        }
    }

    /// Checks that a header byte decodes back to the type and length it
    /// was built from.
    #[track_caller]
    fn ser_deser_header_byte_aux(symbol_type: SymbolType, len: usize) {
        let header_byte = NonZeroU8::new(compute_header_byte(symbol_type, len)).unwrap();
        let (decoded_type, decoded_len) = parse_header_byte(header_byte).unwrap();
        assert_eq!(symbol_type, decoded_type);
        assert_eq!(len, decoded_len);
    }

    #[test]
    fn test_header_byte_serialization() {
        for len in 1usize..=8 {
            ser_deser_header_byte_aux(SymbolType::Float, len);
            ser_deser_header_byte_aux(SymbolType::I64, len);
            ser_deser_header_byte_aux(SymbolType::U64, len);
        }
    }

    /// Checks that an operation round-trips, and that deserialization
    /// consumes exactly the bytes serialization produced.
    #[track_caller]
    fn ser_deser_symbol(symbol: ColumnOperation<NumericalValue>) {
        let buf = symbol.serialize();
        let mut remaining: &[u8] = &buf.bytes[..];
        let decoded = ColumnOperation::deserialize(&mut remaining).unwrap();
        assert_eq!(remaining.len() + buf.len, buf.bytes.len());
        assert_eq!(symbol, decoded);
    }

    #[test]
    fn test_compute_num_bytes_for_u64() {
        assert_eq!(compute_num_bytes_for_u64(0), 0);
        assert_eq!(compute_num_bytes_for_u64(1), 1);
        assert_eq!(compute_num_bytes_for_u64(255), 1);
        assert_eq!(compute_num_bytes_for_u64(256), 2);
        assert_eq!(compute_num_bytes_for_u64((1 << 16) - 1), 2);
        assert_eq!(compute_num_bytes_for_u64(1 << 16), 3);
    }

    #[test]
    fn test_symbol_serialization() {
        ser_deser_symbol(ColumnOperation::NewDoc(0));
        ser_deser_symbol(ColumnOperation::NewDoc(3));
        let values = [
            NumericalValue::I64(0i64),
            NumericalValue::I64(1i64),
            NumericalValue::U64(257u64),
            NumericalValue::I64(-257i64),
            NumericalValue::I64(i64::MIN),
            NumericalValue::U64(0u64),
            NumericalValue::U64(u64::MIN),
            NumericalValue::U64(u64::MAX),
        ];
        for value in values {
            ser_deser_symbol(ColumnOperation::Value(value));
        }
    }
}
|
||||
675
columnar/src/writer/mod.rs
Normal file
675
columnar/src/writer/mod.rs
Normal file
@@ -0,0 +1,675 @@
|
||||
mod column_operation;
|
||||
mod value_index;
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use column_operation::ColumnOperation;
|
||||
use common::CountingWriter;
|
||||
use fastfield_codecs::serialize::ValueIndexInfo;
|
||||
use fastfield_codecs::{Column, MonotonicallyMappableToU64, VecColumn};
|
||||
use ordered_float::NotNan;
|
||||
use stacker::{Addr, ArenaHashMap, ExpUnrolledLinkedList, MemoryArena};
|
||||
|
||||
use crate::column_type_header::{ColumnType, ColumnTypeAndCardinality};
|
||||
use crate::dictionary::{DictionaryBuilder, IdMapping, UnorderedId};
|
||||
use crate::value::{Coerce, NumericalType, NumericalValue};
|
||||
use crate::writer::column_operation::SymbolValue;
|
||||
use crate::writer::value_index::{IndexBuilder, SpareIndexBuilders};
|
||||
use crate::{Cardinality, ColumnarSerializer, DocId};
|
||||
|
||||
#[derive(Copy, Clone, Default)]
|
||||
struct ColumnWriter {
|
||||
// Detected cardinality of the column so far.
|
||||
cardinality: Cardinality,
|
||||
// Last document inserted.
|
||||
// None if no doc has been added yet.
|
||||
last_doc_opt: Option<u32>,
|
||||
// Buffer containing the serialized values.
|
||||
values: ExpUnrolledLinkedList,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Default)]
|
||||
pub struct NumericalColumnWriter {
|
||||
compatible_numerical_types: CompatibleNumericalTypes,
|
||||
column_writer: ColumnWriter,
|
||||
}
|
||||
|
||||
/// Tracks whether all of the values accepted so far can be stored as `i64`
/// and/or as `u64`.
#[derive(Copy, Clone)]
struct CompatibleNumericalTypes {
    /// `true` as long as no accepted value ruled out `i64`.
    all_values_within_i64_range: bool,
    /// `true` as long as no accepted value ruled out `u64`.
    all_values_within_u64_range: bool,
}
|
||||
|
||||
impl Default for CompatibleNumericalTypes {
|
||||
fn default() -> CompatibleNumericalTypes {
|
||||
CompatibleNumericalTypes {
|
||||
all_values_within_i64_range: true,
|
||||
all_values_within_u64_range: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CompatibleNumericalTypes {
|
||||
pub fn accept_value(&mut self, numerical_value: NumericalValue) {
|
||||
match numerical_value {
|
||||
NumericalValue::I64(val_i64) => {
|
||||
let value_within_u64_range = val_i64 >= 0i64;
|
||||
self.all_values_within_u64_range &= value_within_u64_range;
|
||||
}
|
||||
NumericalValue::U64(val_u64) => {
|
||||
let value_within_i64_range = val_u64 < i64::MAX as u64;
|
||||
self.all_values_within_i64_range &= value_within_i64_range;
|
||||
}
|
||||
NumericalValue::F64(_) => {
|
||||
self.all_values_within_i64_range = false;
|
||||
self.all_values_within_u64_range = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_numerical_type(self) -> NumericalType {
|
||||
if self.all_values_within_i64_range {
|
||||
NumericalType::I64
|
||||
} else if self.all_values_within_u64_range {
|
||||
NumericalType::U64
|
||||
} else {
|
||||
NumericalType::F64
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl NumericalColumnWriter {
|
||||
pub fn record_numerical_value(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
value: NumericalValue,
|
||||
arena: &mut MemoryArena,
|
||||
) {
|
||||
self.compatible_numerical_types.accept_value(value);
|
||||
self.column_writer.record(doc, value, arena);
|
||||
}
|
||||
}
|
||||
|
||||
impl ColumnWriter {
|
||||
fn symbol_iterator<'a, V: SymbolValue>(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
|
||||
buffer.clear();
|
||||
self.values.read_to_end(arena, buffer);
|
||||
let mut cursor: &[u8] = &buffer[..];
|
||||
std::iter::from_fn(move || {
|
||||
if cursor.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let symbol = ColumnOperation::deserialize(&mut cursor)
|
||||
.expect("Failed to deserialize symbol from in-memory. This should never happen.");
|
||||
Some(symbol)
|
||||
})
|
||||
}
|
||||
|
||||
fn delta_with_last_doc(&self, doc: DocId) -> u32 {
|
||||
self.last_doc_opt
|
||||
.map(|last_doc| doc - last_doc)
|
||||
.unwrap_or(doc + 1u32)
|
||||
}
|
||||
|
||||
/// Records a change of the document being recorded.
|
||||
///
|
||||
/// This function will also update the cardinality of the column
|
||||
/// if necessary.
|
||||
fn record(&mut self, doc: DocId, value: NumericalValue, arena: &mut MemoryArena) {
|
||||
// Difference between `doc` and the last doc.
|
||||
match self.delta_with_last_doc(doc) {
|
||||
0 => {
|
||||
// This is the last encounterred document.
|
||||
self.cardinality = Cardinality::Multivalued;
|
||||
}
|
||||
1 => {
|
||||
self.last_doc_opt = Some(doc);
|
||||
self.write_symbol::<NumericalValue>(ColumnOperation::NewDoc(doc), arena);
|
||||
}
|
||||
_ => {
|
||||
self.cardinality = self.cardinality.max(Cardinality::Optional);
|
||||
self.last_doc_opt = Some(doc);
|
||||
self.write_symbol::<NumericalValue>(ColumnOperation::NewDoc(doc), arena);
|
||||
}
|
||||
}
|
||||
self.write_symbol(ColumnOperation::Value(value), arena);
|
||||
}
|
||||
|
||||
// Get the cardinality.
|
||||
// The overall number of docs in the column is necessary to
|
||||
// deal with the case where the all docs contain 1 value, except some documents
|
||||
// at the end of the column.
|
||||
fn get_cardinality(&self, num_docs: DocId) -> Cardinality {
|
||||
if self.delta_with_last_doc(num_docs) > 1 {
|
||||
self.cardinality.max(Cardinality::Optional)
|
||||
} else {
|
||||
self.cardinality
|
||||
}
|
||||
}
|
||||
|
||||
fn write_symbol<V: SymbolValue>(
|
||||
&mut self,
|
||||
symbol: ColumnOperation<V>,
|
||||
arena: &mut MemoryArena,
|
||||
) {
|
||||
self.values
|
||||
.writer(arena)
|
||||
.extend_from_slice(symbol.serialize().as_slice());
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Default)]
|
||||
pub struct BytesColumnWriter {
|
||||
dictionary_id: u32,
|
||||
column_writer: ColumnWriter,
|
||||
}
|
||||
|
||||
impl BytesColumnWriter {
|
||||
pub fn with_dictionary_id(dictionary_id: u32) -> BytesColumnWriter {
|
||||
BytesColumnWriter {
|
||||
dictionary_id,
|
||||
column_writer: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record_bytes(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
bytes: &[u8],
|
||||
dictionaries: &mut [DictionaryBuilder],
|
||||
arena: &mut MemoryArena,
|
||||
) {
|
||||
let unordered_id = dictionaries[self.dictionary_id as usize].get_or_allocate_id(bytes);
|
||||
let numerical_value = NumericalValue::U64(unordered_id.0 as u64);
|
||||
self.column_writer.record(doc, numerical_value, arena);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ColumnarWriter {
|
||||
numerical_field_hash_map: ArenaHashMap,
|
||||
bytes_field_hash_map: ArenaHashMap,
|
||||
arena: MemoryArena,
|
||||
// Dictionaries used to store dictionary-encoded values.
|
||||
dictionaries: Vec<DictionaryBuilder>,
|
||||
buffers: SpareBuffers,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct SpareBuffers {
|
||||
byte_buffer: Vec<u8>,
|
||||
value_index_builders: SpareIndexBuilders,
|
||||
i64_values: Vec<i64>,
|
||||
u64_values: Vec<u64>,
|
||||
f64_values: Vec<ordered_float::NotNan<f64>>,
|
||||
}
|
||||
|
||||
impl Default for ColumnarWriter {
|
||||
fn default() -> Self {
|
||||
ColumnarWriter {
|
||||
numerical_field_hash_map: ArenaHashMap::new(10_000),
|
||||
bytes_field_hash_map: ArenaHashMap::new(10_000),
|
||||
dictionaries: Vec::new(),
|
||||
arena: MemoryArena::default(),
|
||||
buffers: SpareBuffers::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tells which of the two column hash maps a column comes from.
///
/// The derived ordering (`Bytes < Numerical`) is used as a tie-breaker when
/// sorting columns that share the same key, so the variant order matters.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum BytesOrNumerical {
    Bytes,
    Numerical,
}
|
||||
|
||||
impl ColumnarWriter {
|
||||
pub fn record_numerical(&mut self, doc: DocId, key: &[u8], numerical_value: NumericalValue) {
|
||||
let (hash_map, arena) = (&mut self.numerical_field_hash_map, &mut self.arena);
|
||||
hash_map.mutate_or_create(key, |column_opt: Option<NumericalColumnWriter>| {
|
||||
let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
|
||||
column.record_numerical_value(doc, numerical_value, arena);
|
||||
column
|
||||
});
|
||||
}
|
||||
|
||||
pub fn record_bytes(&mut self, doc: DocId, key: &[u8], value: &[u8]) {
|
||||
let (hash_map, arena, dictionaries) = (
|
||||
&mut self.bytes_field_hash_map,
|
||||
&mut self.arena,
|
||||
&mut self.dictionaries,
|
||||
);
|
||||
hash_map.mutate_or_create(key, |column_opt: Option<BytesColumnWriter>| {
|
||||
let mut column: BytesColumnWriter = column_opt.unwrap_or_else(|| {
|
||||
let dictionary_id = dictionaries.len() as u32;
|
||||
dictionaries.push(DictionaryBuilder::default());
|
||||
BytesColumnWriter::with_dictionary_id(dictionary_id)
|
||||
});
|
||||
column.record_bytes(doc, value, dictionaries, arena);
|
||||
column
|
||||
});
|
||||
}
|
||||
|
||||
pub fn serialize<W: io::Write>(
|
||||
&mut self,
|
||||
num_docs: DocId,
|
||||
mut serializer: ColumnarSerializer<W>,
|
||||
) -> io::Result<()> {
|
||||
let mut field_columns: Vec<(&[u8], BytesOrNumerical, Addr)> = self
|
||||
.numerical_field_hash_map
|
||||
.iter()
|
||||
.map(|(term, addr, _)| (term, BytesOrNumerical::Numerical, addr))
|
||||
.collect();
|
||||
field_columns.extend(
|
||||
self.bytes_field_hash_map
|
||||
.iter()
|
||||
.map(|(term, addr, _)| (term, BytesOrNumerical::Bytes, addr)),
|
||||
);
|
||||
let mut key_buffer = Vec::new();
|
||||
field_columns.sort_unstable_by_key(|(key, col_type, _)| (*key, *col_type));
|
||||
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
|
||||
for (key, bytes_or_numerical, addr) in field_columns {
|
||||
let wrt = serializer.wrt();
|
||||
let start_offset = wrt.written_bytes();
|
||||
let column_type_and_cardinality: ColumnTypeAndCardinality =
|
||||
match bytes_or_numerical {
|
||||
BytesOrNumerical::Bytes => {
|
||||
let BytesColumnWriter { dictionary_id, column_writer } =
|
||||
self.bytes_field_hash_map.read(addr);
|
||||
let dictionary_builder =
|
||||
&dictionaries[dictionary_id as usize];
|
||||
serialize_bytes_column(
|
||||
&column_writer,
|
||||
num_docs,
|
||||
dictionary_builder,
|
||||
arena,
|
||||
buffers,
|
||||
wrt,
|
||||
)?;
|
||||
ColumnTypeAndCardinality {
|
||||
cardinality: column_writer.get_cardinality(num_docs),
|
||||
typ: ColumnType::Bytes,
|
||||
}
|
||||
}
|
||||
BytesOrNumerical::Numerical => {
|
||||
let NumericalColumnWriter { compatible_numerical_types, column_writer } =
|
||||
self.numerical_field_hash_map.read(addr);
|
||||
let cardinality = column_writer.get_cardinality(num_docs);
|
||||
let numerical_type = compatible_numerical_types.to_numerical_type();
|
||||
serialize_numerical_column(
|
||||
cardinality,
|
||||
numerical_type,
|
||||
&column_writer,
|
||||
num_docs,
|
||||
arena,
|
||||
buffers,
|
||||
wrt,
|
||||
)?;
|
||||
ColumnTypeAndCardinality {
|
||||
cardinality,
|
||||
typ: ColumnType::Numerical(numerical_type),
|
||||
}
|
||||
}
|
||||
};
|
||||
let end_offset = wrt.written_bytes();
|
||||
let key_with_type = prepare_key(key, column_type_and_cardinality, &mut key_buffer);
|
||||
serializer.record_column_offsets(key_with_type, start_offset..end_offset)?;
|
||||
}
|
||||
serializer.finalize()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a key consisting of the concatenation of the key and the column_type_and_cardinality
|
||||
/// code.
|
||||
fn prepare_key<'a>(
|
||||
key: &[u8],
|
||||
column_type_cardinality: ColumnTypeAndCardinality,
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> &'a [u8] {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(key);
|
||||
buffer.push(0u8);
|
||||
buffer.push(column_type_cardinality.to_code());
|
||||
&buffer[..]
|
||||
}
|
||||
|
||||
fn serialize_bytes_column<W: io::Write>(
|
||||
column_writer: &ColumnWriter,
|
||||
num_docs: DocId,
|
||||
dictionary_builder: &DictionaryBuilder,
|
||||
arena: &MemoryArena,
|
||||
buffers: &mut SpareBuffers,
|
||||
wrt: &mut CountingWriter<W>,
|
||||
) -> io::Result<()> {
|
||||
let start_offset = wrt.written_bytes();
|
||||
let id_mapping: IdMapping = dictionary_builder.serialize(wrt)?;
|
||||
let dictionary_num_bytes: u32 = (wrt.written_bytes() - start_offset) as u32;
|
||||
let cardinality = column_writer.get_cardinality(num_docs);
|
||||
let SpareBuffers {
|
||||
byte_buffer,
|
||||
value_index_builders,
|
||||
u64_values,
|
||||
..
|
||||
} = buffers;
|
||||
let symbol_iterator = column_writer
|
||||
.symbol_iterator(arena, byte_buffer)
|
||||
.map(|symbol: ColumnOperation<UnorderedId>| {
|
||||
// We map unordered ids to ordered ids.
|
||||
match symbol {
|
||||
ColumnOperation::Value(unordered_id) => {
|
||||
let ordered_id = id_mapping.to_ord(unordered_id);
|
||||
ColumnOperation::Value(ordered_id.0 as u64)
|
||||
}
|
||||
ColumnOperation::NewDoc(doc) => ColumnOperation::NewDoc(doc),
|
||||
}
|
||||
});
|
||||
serialize_column(
|
||||
symbol_iterator,
|
||||
cardinality,
|
||||
num_docs,
|
||||
value_index_builders,
|
||||
u64_values,
|
||||
wrt,
|
||||
)?;
|
||||
wrt.write_all(&dictionary_num_bytes.to_le_bytes()[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_numerical_column<W: io::Write>(
|
||||
cardinality: Cardinality,
|
||||
numerical_type: NumericalType,
|
||||
column_writer: &ColumnWriter,
|
||||
num_docs: DocId,
|
||||
arena: &MemoryArena,
|
||||
buffers: &mut SpareBuffers,
|
||||
wrt: &mut W,
|
||||
) -> io::Result<()> {
|
||||
let SpareBuffers {
|
||||
byte_buffer,
|
||||
value_index_builders,
|
||||
u64_values,
|
||||
i64_values,
|
||||
f64_values,
|
||||
} = buffers;
|
||||
let symbol_iterator = column_writer.symbol_iterator(arena, byte_buffer);
|
||||
match numerical_type {
|
||||
NumericalType::I64 => {
|
||||
serialize_column(
|
||||
coerce_numerical_symbol::<i64>(symbol_iterator),
|
||||
cardinality,
|
||||
num_docs,
|
||||
value_index_builders,
|
||||
i64_values,
|
||||
wrt,
|
||||
)?;
|
||||
}
|
||||
NumericalType::U64 => {
|
||||
serialize_column(
|
||||
coerce_numerical_symbol::<u64>(symbol_iterator),
|
||||
cardinality,
|
||||
num_docs,
|
||||
value_index_builders,
|
||||
u64_values,
|
||||
wrt,
|
||||
)?;
|
||||
}
|
||||
NumericalType::F64 => {
|
||||
serialize_column(
|
||||
coerce_numerical_symbol::<NotNan<f64>>(symbol_iterator),
|
||||
cardinality,
|
||||
num_docs,
|
||||
value_index_builders,
|
||||
f64_values,
|
||||
wrt,
|
||||
)?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_column<
|
||||
T: Copy + Ord + Default + Send + Sync + MonotonicallyMappableToU64,
|
||||
W: io::Write,
|
||||
>(
|
||||
symbol_iterator: impl Iterator<Item = ColumnOperation<T>>,
|
||||
cardinality: Cardinality,
|
||||
num_docs: DocId,
|
||||
value_index_builders: &mut SpareIndexBuilders,
|
||||
values: &mut Vec<T>,
|
||||
wrt: &mut W,
|
||||
) -> io::Result<()>
|
||||
where
|
||||
for<'a> VecColumn<'a, T>: Column<T>,
|
||||
{
|
||||
match cardinality {
|
||||
Cardinality::Required => {
|
||||
consume_symbol_iterator(
|
||||
symbol_iterator,
|
||||
value_index_builders.borrow_required_index_builder(),
|
||||
values,
|
||||
);
|
||||
fastfield_codecs::serialize(
|
||||
VecColumn::from(&values[..]),
|
||||
wrt,
|
||||
&fastfield_codecs::ALL_CODEC_TYPES[..],
|
||||
)?;
|
||||
}
|
||||
Cardinality::Optional => {
|
||||
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
|
||||
consume_symbol_iterator(symbol_iterator, optional_index_builder, values);
|
||||
let optional_index = optional_index_builder.finish(num_docs);
|
||||
fastfield_codecs::serialize::serialize_new(
|
||||
ValueIndexInfo::SingleValue(Box::new(optional_index)),
|
||||
VecColumn::from(&values[..]),
|
||||
wrt,
|
||||
&fastfield_codecs::ALL_CODEC_TYPES[..],
|
||||
)?;
|
||||
}
|
||||
Cardinality::Multivalued => {
|
||||
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
||||
consume_symbol_iterator(symbol_iterator, multivalued_index_builder, values);
|
||||
let multivalued_index = multivalued_index_builder.finish(num_docs);
|
||||
fastfield_codecs::serialize::serialize_new(
|
||||
ValueIndexInfo::MultiValue(Box::new(multivalued_index)),
|
||||
VecColumn::from(&values[..]),
|
||||
wrt,
|
||||
&fastfield_codecs::ALL_CODEC_TYPES[..],
|
||||
)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn coerce_numerical_symbol<T>(
|
||||
symbol_iterator: impl Iterator<Item = ColumnOperation<NumericalValue>>,
|
||||
) -> impl Iterator<Item = ColumnOperation<T>>
|
||||
where T: Coerce {
|
||||
symbol_iterator.map(|symbol| match symbol {
|
||||
ColumnOperation::NewDoc(doc) => ColumnOperation::NewDoc(doc),
|
||||
ColumnOperation::Value(numerical_value) => {
|
||||
ColumnOperation::Value(Coerce::coerce(numerical_value))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn consume_symbol_iterator<T, TIndexBuilder: IndexBuilder>(
|
||||
symbol_iterator: impl Iterator<Item = ColumnOperation<T>>,
|
||||
index_builder: &mut TIndexBuilder,
|
||||
values: &mut Vec<T>,
|
||||
) {
|
||||
for symbol in symbol_iterator {
|
||||
match symbol {
|
||||
ColumnOperation::NewDoc(doc) => {
|
||||
index_builder.record_doc(doc);
|
||||
}
|
||||
ColumnOperation::Value(value) => {
|
||||
index_builder.record_value();
|
||||
values.push(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use ordered_float::NotNan;
|
||||
use stacker::MemoryArena;
|
||||
|
||||
use super::prepare_key;
|
||||
use crate::column_type_header::{ColumnType, ColumnTypeAndCardinality};
|
||||
use crate::value::{NumericalType, NumericalValue};
|
||||
use crate::writer::column_operation::ColumnOperation;
|
||||
use crate::writer::CompatibleNumericalTypes;
|
||||
use crate::Cardinality;
|
||||
|
||||
/// `prepare_key` must overwrite the buffer with `key`, a `0` byte and the type code.
#[test]
fn test_prepare_key_bytes() {
    let key: &[u8] = b"root\0child";
    // The buffer starts with garbage on purpose: it has to be discarded.
    let mut buffer: Vec<u8> = b"somegarbage".to_vec();
    let column_type_and_cardinality = ColumnTypeAndCardinality {
        typ: ColumnType::Bytes,
        cardinality: Cardinality::Optional,
    };
    let prepared_key = prepare_key(key, column_type_and_cardinality, &mut buffer);
    assert_eq!(prepared_key.len(), 12);
    let (key_part, suffix) = prepared_key.split_at(10);
    assert_eq!(key_part, key);
    assert_eq!(suffix[0], 0u8);
    assert_eq!(suffix[1], column_type_and_cardinality.to_code());
}
|
||||
|
||||
/// One value for each of the docs 0, 1 and 2, out of 3 docs: the column is `Required`.
#[test]
fn test_column_writer_required_simple() {
    let mut arena = MemoryArena::default();
    let mut column_writer = super::ColumnWriter::default();
    let recorded: [(u32, i64); 3] = [(0, 14), (1, 15), (2, -16)];
    for (doc, value) in recorded {
        column_writer.record(doc, value.into(), &mut arena);
    }
    assert_eq!(column_writer.get_cardinality(3), Cardinality::Required);
    let mut buffer = Vec::new();
    let symbols: Vec<ColumnOperation<NumericalValue>> =
        column_writer.symbol_iterator(&arena, &mut buffer).collect();
    assert_eq!(symbols.len(), 6);
    assert!(matches!(
        symbols.as_slice(),
        [
            ColumnOperation::NewDoc(0u32),
            ColumnOperation::Value(NumericalValue::I64(14i64)),
            ColumnOperation::NewDoc(1u32),
            ColumnOperation::Value(NumericalValue::I64(15i64)),
            ColumnOperation::NewDoc(2u32),
            ColumnOperation::Value(NumericalValue::I64(-16i64)),
        ]
    ));
}
|
||||
|
||||
/// Doc 0 has no value: the column is `Optional`.
#[test]
fn test_column_writer_optional_cardinality_missing_first() {
    let mut arena = MemoryArena::default();
    let mut column_writer = super::ColumnWriter::default();
    let recorded: [(u32, i64); 2] = [(1, 15), (2, -16)];
    for (doc, value) in recorded {
        column_writer.record(doc, value.into(), &mut arena);
    }
    assert_eq!(column_writer.get_cardinality(3), Cardinality::Optional);
    let mut buffer = Vec::new();
    let symbols: Vec<ColumnOperation<NumericalValue>> =
        column_writer.symbol_iterator(&arena, &mut buffer).collect();
    assert_eq!(symbols.len(), 4);
    assert!(matches!(
        symbols.as_slice(),
        [
            ColumnOperation::NewDoc(1u32),
            ColumnOperation::Value(NumericalValue::I64(15i64)),
            ColumnOperation::NewDoc(2u32),
            ColumnOperation::Value(NumericalValue::I64(-16i64)),
        ]
    ));
}
|
||||
|
||||
/// The last doc (doc 1 out of 2 docs) has no value: the column is `Optional`.
#[test]
fn test_column_writer_optional_cardinality_missing_last() {
    let mut arena = MemoryArena::default();
    let mut column_writer = super::ColumnWriter::default();
    column_writer.record(0u32, 15i64.into(), &mut arena);
    assert_eq!(column_writer.get_cardinality(2), Cardinality::Optional);
    let mut buffer = Vec::new();
    let symbols: Vec<ColumnOperation<NumericalValue>> =
        column_writer.symbol_iterator(&arena, &mut buffer).collect();
    assert_eq!(symbols.len(), 2);
    assert!(matches!(
        symbols.as_slice(),
        [
            ColumnOperation::NewDoc(0u32),
            ColumnOperation::Value(NumericalValue::I64(15i64)),
        ]
    ));
}
|
||||
|
||||
/// Two values for the same doc: the column is `Multivalued`, and a single
/// `NewDoc` operation is emitted for that doc.
#[test]
fn test_column_writer_multivalued() {
    let mut arena = MemoryArena::default();
    let mut column_writer = super::ColumnWriter::default();
    for value in [16i64, 17i64] {
        column_writer.record(0u32, value.into(), &mut arena);
    }
    assert_eq!(column_writer.get_cardinality(1), Cardinality::Multivalued);
    let mut buffer = Vec::new();
    let symbols: Vec<ColumnOperation<NumericalValue>> =
        column_writer.symbol_iterator(&arena, &mut buffer).collect();
    assert_eq!(symbols.len(), 3);
    assert!(matches!(
        symbols.as_slice(),
        [
            ColumnOperation::NewDoc(0u32),
            ColumnOperation::Value(NumericalValue::I64(16i64)),
            ColumnOperation::Value(NumericalValue::I64(17i64)),
        ]
    ));
}
|
||||
|
||||
#[track_caller]
|
||||
fn test_column_writer_coercion_iter_aux(
|
||||
values: impl Iterator<Item = NumericalValue>,
|
||||
expected_numerical_type: NumericalType,
|
||||
) {
|
||||
let mut compatible_numerical_types = CompatibleNumericalTypes::default();
|
||||
for value in values {
|
||||
compatible_numerical_types.accept_value(value);
|
||||
}
|
||||
assert_eq!(
|
||||
compatible_numerical_types.to_numerical_type(),
|
||||
expected_numerical_type
|
||||
);
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
fn test_column_writer_coercion_aux(
|
||||
values: &[NumericalValue],
|
||||
expected_numerical_type: NumericalType,
|
||||
) {
|
||||
test_column_writer_coercion_iter_aux(values.iter().copied(), expected_numerical_type);
|
||||
test_column_writer_coercion_iter_aux(values.iter().rev().copied(), expected_numerical_type);
|
||||
}
|
||||
|
||||
/// Table-driven check of the numerical type selected for various sets of values.
#[test]
fn test_column_writer_coercion() {
    let cases: [(&[NumericalValue], NumericalType); 9] = [
        (&[], NumericalType::I64),
        (&[1i64.into()], NumericalType::I64),
        (&[1u64.into()], NumericalType::I64),
        // We don't detect exact integer at the moment. We could!
        (&[NotNan::new(1f64).unwrap().into()], NumericalType::F64),
        (&[u64::MAX.into()], NumericalType::U64),
        (&[(i64::MAX as u64).into()], NumericalType::U64),
        (&[(1u64 << 63).into()], NumericalType::U64),
        (&[1i64.into(), 1u64.into()], NumericalType::I64),
        (&[u64::MAX.into(), (-1i64).into()], NumericalType::F64),
    ];
    for (values, expected_numerical_type) in cases {
        test_column_writer_coercion_aux(values, expected_numerical_type);
    }
}
|
||||
}
|
||||
218
columnar/src/writer/value_index.rs
Normal file
218
columnar/src/writer/value_index.rs
Normal file
@@ -0,0 +1,218 @@
|
||||
use fastfield_codecs::serialize::{MultiValueIndexInfo, SingleValueIndexInfo};
|
||||
|
||||
use crate::DocId;
|
||||
|
||||
/// The `IndexBuilder` interprets a sequence of
|
||||
/// calls of the form:
|
||||
/// (record_doc,record_value+)*
|
||||
/// and can then serialize the results into an index.
|
||||
///
|
||||
/// It has different implementation depending on whether the
|
||||
/// cardinality is required, optional, or multivalued.
|
||||
pub(crate) trait IndexBuilder {
|
||||
fn record_doc(&mut self, doc: DocId);
|
||||
#[inline]
|
||||
fn record_value(&mut self) {}
|
||||
}
|
||||
|
||||
/// Index builder used for `Required` columns. It holds no state and ignores
/// everything it is told.
#[derive(Default)]
pub struct RequiredIndexBuilder;
|
||||
|
||||
impl IndexBuilder for RequiredIndexBuilder {
|
||||
#[inline(always)]
|
||||
fn record_doc(&mut self, _doc: DocId) {}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct OptionalIndexBuilder {
|
||||
docs: Vec<DocId>,
|
||||
}
|
||||
|
||||
struct SingleValueArrayIndex<'a> {
|
||||
docs: &'a [DocId],
|
||||
num_docs: DocId,
|
||||
}
|
||||
|
||||
impl SingleValueIndexInfo for SingleValueArrayIndex<'_> {
    /// Returns the overall number of documents.
    fn num_vals(&self) -> u32 {
        self.num_docs as u32
    }

    /// Returns the number of recorded documents.
    fn num_non_nulls(&self) -> u32 {
        self.docs.len() as u32
    }

    /// Iterates over the recorded documents.
    fn iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
        let recorded_docs = self.docs.iter().copied();
        Box::new(recorded_docs)
    }
}
|
||||
|
||||
impl OptionalIndexBuilder {
|
||||
pub fn finish(&mut self, num_docs: DocId) -> impl SingleValueIndexInfo + '_ {
|
||||
debug_assert!(self
|
||||
.docs
|
||||
.last()
|
||||
.copied()
|
||||
.map(|last_doc| last_doc < num_docs)
|
||||
.unwrap_or(true));
|
||||
SingleValueArrayIndex {
|
||||
docs: &self.docs[..],
|
||||
num_docs,
|
||||
}
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
self.docs.clear();
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexBuilder for OptionalIndexBuilder {
|
||||
#[inline(always)]
|
||||
fn record_doc(&mut self, doc: DocId) {
|
||||
debug_assert!(self
|
||||
.docs
|
||||
.last()
|
||||
.copied()
|
||||
.map(|prev_doc| doc > prev_doc)
|
||||
.unwrap_or(true));
|
||||
self.docs.push(doc);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct MultivaluedIndexBuilder {
|
||||
// TODO should we switch to `start_offset`?
|
||||
end_values: Vec<DocId>,
|
||||
total_num_vals_seen: u32,
|
||||
}
|
||||
|
||||
pub struct MultivaluedValueArrayIndex<'a> {
|
||||
end_offsets: &'a [DocId],
|
||||
}
|
||||
|
||||
impl MultiValueIndexInfo for MultivaluedValueArrayIndex<'_> {
    /// Returns the number of documents, i.e. the number of end offsets.
    fn num_docs(&self) -> u32 {
        self.end_offsets.len() as u32
    }

    /// Returns the total number of values, i.e. the last end offset (0 if there is none).
    fn num_vals(&self) -> u32 {
        match self.end_offsets.last() {
            Some(&last_end_offset) => last_end_offset,
            None => 0u32,
        }
    }

    /// Iterates over the start offset of each document: `0`, followed by
    /// every end offset but the last one. Empty if there is no document.
    fn iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
        match self.end_offsets.split_last() {
            None => Box::new(std::iter::empty()),
            Some((_last_end_offset, preceding_end_offsets)) => {
                Box::new(std::iter::once(0u32).chain(preceding_end_offsets.iter().copied()))
            }
        }
    }
}
|
||||
|
||||
impl MultivaluedIndexBuilder {
|
||||
pub fn finish(&mut self, num_docs: DocId) -> impl MultiValueIndexInfo + '_ {
|
||||
self.end_values
|
||||
.resize(num_docs as usize, self.total_num_vals_seen);
|
||||
MultivaluedValueArrayIndex {
|
||||
end_offsets: &self.end_values[..],
|
||||
}
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
self.end_values.clear();
|
||||
self.total_num_vals_seen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexBuilder for MultivaluedIndexBuilder {
|
||||
fn record_doc(&mut self, doc: DocId) {
|
||||
self.end_values
|
||||
.resize(doc as usize, self.total_num_vals_seen);
|
||||
}
|
||||
|
||||
fn record_value(&mut self) {
|
||||
self.total_num_vals_seen += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// The `SpareIndexBuilders` is there to avoid allocating a
|
||||
/// new index builder for every single column.
|
||||
#[derive(Default)]
|
||||
pub struct SpareIndexBuilders {
|
||||
required_index_builder: RequiredIndexBuilder,
|
||||
optional_index_builder: OptionalIndexBuilder,
|
||||
multivalued_index_builder: MultivaluedIndexBuilder,
|
||||
}
|
||||
|
||||
impl SpareIndexBuilders {
|
||||
pub fn borrow_required_index_builder(&mut self) -> &mut RequiredIndexBuilder {
|
||||
&mut self.required_index_builder
|
||||
}
|
||||
|
||||
pub fn borrow_optional_index_builder(&mut self) -> &mut OptionalIndexBuilder {
|
||||
self.optional_index_builder.reset();
|
||||
&mut self.optional_index_builder
|
||||
}
|
||||
|
||||
pub fn borrow_multivalued_index_builder(&mut self) -> &mut MultivaluedIndexBuilder {
|
||||
self.multivalued_index_builder.reset();
|
||||
&mut self.multivalued_index_builder
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Checks the docs reported by the optional index builder, before and after a reset.
#[test]
fn test_optional_value_index_builder() {
    let mut opt_value_index_builder = OptionalIndexBuilder::default();

    opt_value_index_builder.record_doc(0u32);
    opt_value_index_builder.record_value();
    let docs_with_value: Vec<u32> = opt_value_index_builder.finish(1u32).iter().collect();
    assert_eq!(docs_with_value, [0]);

    // After a reset, previously recorded docs must be gone.
    opt_value_index_builder.reset();
    opt_value_index_builder.record_doc(1u32);
    opt_value_index_builder.record_value();
    let docs_with_value: Vec<u32> = opt_value_index_builder.finish(2u32).iter().collect();
    assert_eq!(docs_with_value, [1]);
}
|
||||
|
||||
/// Checks the start offsets reported by the multivalued index builder,
/// before and after a reset.
#[test]
fn test_multivalued_value_index_builder() {
    let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();

    // Doc 1 gets two values, doc 2 gets one value.
    multivalued_value_index_builder.record_doc(1u32);
    multivalued_value_index_builder.record_value();
    multivalued_value_index_builder.record_value();
    multivalued_value_index_builder.record_doc(2u32);
    multivalued_value_index_builder.record_value();
    let start_offsets: Vec<u32> = multivalued_value_index_builder
        .finish(4u32)
        .iter()
        .collect();
    assert_eq!(start_offsets, vec![0, 0, 2, 3]);

    // After a reset, only doc 2 gets values (two of them).
    multivalued_value_index_builder.reset();
    multivalued_value_index_builder.record_doc(2u32);
    multivalued_value_index_builder.record_value();
    multivalued_value_index_builder.record_value();
    let start_offsets: Vec<u32> = multivalued_value_index_builder
        .finish(4u32)
        .iter()
        .collect();
    assert_eq!(start_offsets, vec![0, 0, 0, 2]);
}
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-common"
|
||||
version = "0.4.0"
|
||||
version = "0.5.0"
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2021"
|
||||
@@ -14,7 +14,8 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.4.3"
|
||||
ownedbytes = { version= "0.4", path="../ownedbytes" }
|
||||
ownedbytes = { version= "0.5", path="../ownedbytes" }
|
||||
async-trait = "0.1"
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1.0.0"
|
||||
|
||||
@@ -1,19 +1,18 @@
|
||||
use std::ops::{Deref, Range};
|
||||
use std::ops::{Deref, Range, RangeBounds};
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, io};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common::HasLen;
|
||||
use stable_deref_trait::StableDeref;
|
||||
use ownedbytes::{OwnedBytes, StableDeref};
|
||||
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::HasLen;
|
||||
|
||||
/// Objects that represents files sections in tantivy.
|
||||
///
|
||||
/// By contract, whatever happens to the directory file, as long as a FileHandle
|
||||
/// is alive, the data associated with it cannot be altered or destroyed.
|
||||
///
|
||||
/// The underlying behavior is therefore specific to the [`Directory`](crate::Directory) that
|
||||
/// The underlying behavior is therefore specific to the `Directory` that
|
||||
/// created it. Despite its name, a [`FileSlice`] may or may not directly map to an actual file
|
||||
/// on the filesystem.
|
||||
|
||||
@@ -24,13 +23,9 @@ pub trait FileHandle: 'static + Send + Sync + HasLen + fmt::Debug {
|
||||
/// This method may panic if the range requested is invalid.
|
||||
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes>;
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
#[doc(hidden)]
|
||||
async fn read_bytes_async(
|
||||
&self,
|
||||
_byte_range: Range<usize>,
|
||||
) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
Err(crate::error::AsyncIoError::AsyncUnsupported)
|
||||
async fn read_bytes_async(&self, byte_range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
self.read_bytes(byte_range)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,7 +37,7 @@ impl FileHandle for &'static [u8] {
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
async fn read_bytes_async(&self, byte_range: Range<usize>) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
async fn read_bytes_async(&self, byte_range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
Ok(self.read_bytes(byte_range)?)
|
||||
}
|
||||
}
|
||||
@@ -70,6 +65,25 @@ impl fmt::Debug for FileSlice {
|
||||
}
|
||||
}
|
||||
|
||||
/// Translates `rel_range`, which is expressed relative to the start of
/// `orig_range`, into an absolute range.
///
/// An unbounded start maps to `orig_range.start` and an unbounded end maps to
/// `orig_range.end`.
///
/// # Panics
///
/// Panics if the resulting range is inverted, if it reaches past
/// `orig_range.end`, or if computing one of its bounds overflows `usize`.
#[inline]
fn combine_ranges<R: RangeBounds<usize>>(orig_range: Range<usize>, rel_range: R) -> Range<usize> {
    use std::ops::Bound;

    let base: usize = orig_range.start;
    // Checked arithmetic: with plain `+` a huge relative bound wraps around in
    // release builds and could slip past the bounds assertions below.
    let shift = |rel: usize, extra: usize| -> usize {
        base.checked_add(rel)
            .and_then(|abs| abs.checked_add(extra))
            .expect("range bound overflows usize")
    };

    let start: usize = match rel_range.start_bound() {
        Bound::Included(&rel_start) => shift(rel_start, 0),
        // An excluded start means the range begins one position later.
        Bound::Excluded(&rel_start) => shift(rel_start, 1),
        Bound::Unbounded => base,
    };
    assert!(start <= orig_range.end);

    let end: usize = match rel_range.end_bound() {
        // An included end means the exclusive end is one position later.
        Bound::Included(&rel_end) => shift(rel_end, 1),
        Bound::Excluded(&rel_end) => shift(rel_end, 0),
        Bound::Unbounded => orig_range.end,
    };
    assert!(end >= start);
    assert!(end <= orig_range.end);

    start..end
}
|
||||
|
||||
impl FileSlice {
|
||||
/// Wraps a FileHandle.
|
||||
pub fn new(file_handle: Arc<dyn FileHandle>) -> Self {
|
||||
@@ -93,11 +107,11 @@ impl FileSlice {
|
||||
///
|
||||
/// Panics if `byte_range.end` exceeds the filesize.
|
||||
#[must_use]
|
||||
pub fn slice(&self, byte_range: Range<usize>) -> FileSlice {
|
||||
assert!(byte_range.end <= self.len());
|
||||
#[inline]
|
||||
pub fn slice<R: RangeBounds<usize>>(&self, byte_range: R) -> FileSlice {
|
||||
FileSlice {
|
||||
data: self.data.clone(),
|
||||
range: self.range.start + byte_range.start..self.range.start + byte_range.end,
|
||||
range: combine_ranges(self.range.clone(), byte_range),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,9 +131,8 @@ impl FileSlice {
|
||||
self.data.read_bytes(self.range.clone())
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
#[doc(hidden)]
|
||||
pub async fn read_bytes_async(&self) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
pub async fn read_bytes_async(&self) -> io::Result<OwnedBytes> {
|
||||
self.data.read_bytes_async(self.range.clone()).await
|
||||
}
|
||||
|
||||
@@ -137,12 +150,8 @@ impl FileSlice {
|
||||
.read_bytes(self.range.start + range.start..self.range.start + range.end)
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
#[doc(hidden)]
|
||||
pub async fn read_bytes_slice_async(
|
||||
&self,
|
||||
byte_range: Range<usize>,
|
||||
) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
pub async fn read_bytes_slice_async(&self, byte_range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
assert!(
|
||||
self.range.start + byte_range.end <= self.range.end,
|
||||
"`to` exceeds the fileslice length"
|
||||
@@ -205,7 +214,7 @@ impl FileHandle for FileSlice {
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
async fn read_bytes_async(&self, byte_range: Range<usize>) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
async fn read_bytes_async(&self, byte_range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
self.read_bytes_slice_async(byte_range).await
|
||||
}
|
||||
}
|
||||
@@ -223,7 +232,7 @@ impl FileHandle for OwnedBytes {
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
async fn read_bytes_async(&self, range: Range<usize>) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
async fn read_bytes_async(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
let bytes = self.read_bytes(range)?;
|
||||
Ok(bytes)
|
||||
}
|
||||
@@ -234,9 +243,9 @@ mod tests {
|
||||
use std::io;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::HasLen;
|
||||
|
||||
use super::{FileHandle, FileSlice};
|
||||
use crate::file_slice::combine_ranges;
|
||||
use crate::HasLen;
|
||||
|
||||
#[test]
|
||||
fn test_file_slice() -> io::Result<()> {
|
||||
@@ -307,4 +316,18 @@ mod tests {
|
||||
b"bcd"
|
||||
);
|
||||
}
|
||||
|
||||
/// Exercises `combine_ranges` with every kind of relative bound: half-open,
/// open-ended, open-started, inclusive and fully unbounded.
#[test]
fn test_combine_range() {
    assert_eq!(combine_ranges(1..3, 0..1), 1..2);
    assert_eq!(combine_ranges(1..3, 1..), 2..3);
    assert_eq!(combine_ranges(1..4, ..2), 1..3);
    assert_eq!(combine_ranges(3..10, 2..5), 5..8);
    // An inclusive end bound resolves to the same range as its exclusive twin.
    assert_eq!(combine_ranges(3..10, 2..=4), 5..8);
    // A fully unbounded relative range yields the original range.
    assert_eq!(combine_ranges(1..3, ..), 1..3);
    // An empty range sitting exactly at the end of the original is accepted.
    assert_eq!(combine_ranges(1..3, 2..), 3..3);
}
|
||||
|
||||
/// A relative range that reaches past the end of the original range must panic.
#[test]
#[should_panic]
fn test_combine_range_panics() {
    let orig_range = 3..5;
    // Resolves to 4..7, which exceeds the end of `orig_range`.
    let rel_range = 1..4;
    let _ = combine_ranges(orig_range, rel_range);
}
|
||||
}
|
||||
@@ -5,11 +5,12 @@ use std::ops::Deref;
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
mod bitset;
|
||||
pub mod file_slice;
|
||||
mod serialize;
|
||||
mod vint;
|
||||
mod writer;
|
||||
|
||||
pub use bitset::*;
|
||||
pub use ownedbytes::OwnedBytes;
|
||||
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
pub use vint::{
|
||||
deserialize_vint_u128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u128,
|
||||
|
||||
@@ -12,15 +12,16 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
common = { version = "0.4", path = "../common/", package = "tantivy-common" }
|
||||
common = { version = "0.5", path = "../common/", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
|
||||
ownedbytes = { version = "0.4.0", path = "../ownedbytes" }
|
||||
ownedbytes = { version = "0.5", path = "../ownedbytes" }
|
||||
prettytable-rs = {version="0.9.0", optional= true}
|
||||
rand = {version="0.8.3", optional= true}
|
||||
fastdivide = "0.4"
|
||||
log = "0.4"
|
||||
itertools = { version = "0.10.3" }
|
||||
measure_time = { version="0.8.2", optional=true}
|
||||
ordered-float = "3.4"
|
||||
|
||||
[dev-dependencies]
|
||||
more-asserts = "0.3.0"
|
||||
|
||||
@@ -43,7 +43,9 @@ mod null_index_footer;
|
||||
|
||||
mod column;
|
||||
mod gcd;
|
||||
mod serialize;
|
||||
pub mod serialize;
|
||||
|
||||
pub use ordered_float;
|
||||
|
||||
use self::bitpacked::BitpackedCodec;
|
||||
use self::blockwise_linear::BlockwiseLinearCodec;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use fastdivide::DividerU64;
|
||||
use ordered_float::NotNan;
|
||||
|
||||
use crate::MonotonicallyMappableToU128;
|
||||
|
||||
@@ -192,6 +193,8 @@ impl MonotonicallyMappableToU64 for bool {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO remove me.
|
||||
// Tantivy should refuse NaN values and work with NotNaN internally.
|
||||
impl MonotonicallyMappableToU64 for f64 {
|
||||
fn to_u64(self) -> u64 {
|
||||
common::f64_to_u64(self)
|
||||
@@ -202,11 +205,42 @@ impl MonotonicallyMappableToU64 for f64 {
|
||||
}
|
||||
}
|
||||
|
||||
impl MonotonicallyMappableToU64 for ordered_float::NotNan<f64> {
|
||||
fn to_u64(self) -> u64 {
|
||||
common::f64_to_u64(self.into_inner())
|
||||
}
|
||||
|
||||
fn from_u64(val: u64) -> Self {
|
||||
NotNan::new(common::u64_to_f64(val)).expect("Invalid NotNaN f64 value.")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
/// Positive infinity is not NaN, so it must survive the u64 round trip.
#[test]
fn test_from_u64_pos_inf() {
    let expected: NotNan<f64> = NotNan::new(f64::INFINITY).unwrap();
    let encoded: u64 = common::f64_to_u64(f64::INFINITY);
    let decoded: NotNan<f64> = NotNan::from_u64(encoded);
    assert_eq!(decoded, expected);
}
|
||||
|
||||
/// Negative infinity is not NaN, so it must survive the u64 round trip.
#[test]
fn test_from_u64_neg_inf() {
    let expected: NotNan<f64> = NotNan::new(-f64::INFINITY).unwrap();
    let encoded: u64 = common::f64_to_u64(-f64::INFINITY);
    let decoded: NotNan<f64> = NotNan::from_u64(encoded);
    assert_eq!(decoded, expected);
}
|
||||
|
||||
/// Decoding the u64 encoding of NaN into a `NotNan` must panic.
#[test]
#[should_panic(expected = "Invalid NotNaN")]
fn test_from_u64_nan_panics() {
    let encoded_nan: u64 = common::f64_to_u64(f64::NAN);
    let _: NotNan<f64> = NotNan::from_u64(encoded_nan);
}
|
||||
|
||||
#[test]
|
||||
fn strictly_monotonic_test() {
|
||||
// identity mapping
|
||||
|
||||
@@ -160,7 +160,7 @@ fn deserialize_sparse_codec_block(data: &OwnedBytes) -> Vec<SparseCodecBlockVari
|
||||
// The number of vals so far
|
||||
let mut offset = 0;
|
||||
let mut sparse_codec_blocks = Vec::new();
|
||||
let num_blocks = get_u16(&data, data.len() - 2);
|
||||
let num_blocks = get_u16(data, data.len() - 2);
|
||||
let block_data_index_start =
|
||||
data.len() - 2 - num_blocks as usize * SERIALIZED_BLOCK_METADATA_SIZE;
|
||||
let mut byte_start = 0;
|
||||
|
||||
@@ -1,22 +1,3 @@
|
||||
// Copyright (C) 2022 Quickwit, Inc.
|
||||
//
|
||||
// Quickwit is offered under the AGPL v3.0 and as commercial software.
|
||||
// For commercial licensing, contact us at hello@quickwit.io.
|
||||
//
|
||||
// AGPL:
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use std::io;
|
||||
use std::num::NonZeroU64;
|
||||
use std::sync::Arc;
|
||||
@@ -198,12 +179,12 @@ pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub enum ValueIndexInfo {
|
||||
MultiValue(Box<dyn MultiValueIndexInfo>),
|
||||
SingleValue(Box<dyn SingleValueIndexInfo>),
|
||||
pub enum ValueIndexInfo<'a> {
|
||||
MultiValue(Box<dyn MultiValueIndexInfo + 'a>),
|
||||
SingleValue(Box<dyn SingleValueIndexInfo + 'a>),
|
||||
}
|
||||
|
||||
impl Default for ValueIndexInfo {
|
||||
impl Default for ValueIndexInfo<'static> {
|
||||
fn default() -> Self {
|
||||
struct Dummy {}
|
||||
impl SingleValueIndexInfo for Dummy {
|
||||
@@ -222,7 +203,7 @@ impl Default for ValueIndexInfo {
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueIndexInfo {
|
||||
impl<'a> ValueIndexInfo<'a> {
|
||||
fn get_cardinality(&self) -> FastFieldCardinality {
|
||||
match self {
|
||||
ValueIndexInfo::MultiValue(_) => FastFieldCardinality::Multi,
|
||||
@@ -237,7 +218,7 @@ pub trait MultiValueIndexInfo {
|
||||
/// The number of values in the column.
|
||||
fn num_vals(&self) -> u32;
|
||||
/// Return the start index of the values for each doc
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = u32>>;
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = u32> + '_>;
|
||||
}
|
||||
|
||||
pub trait SingleValueIndexInfo {
|
||||
@@ -246,7 +227,7 @@ pub trait SingleValueIndexInfo {
|
||||
/// The number of non-null values in the column.
|
||||
fn num_non_nulls(&self) -> u32;
|
||||
/// Return a iterator of the positions of docs with a value
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = u32>>;
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = u32> + '_>;
|
||||
}
|
||||
|
||||
/// Serializes u128 values with the compact space codec.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
name = "ownedbytes"
|
||||
version = "0.4.0"
|
||||
version = "0.5.0"
|
||||
edition = "2021"
|
||||
description = "Expose data as static slice"
|
||||
license = "MIT"
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, io, mem};
|
||||
|
||||
use stable_deref_trait::StableDeref;
|
||||
pub use stable_deref_trait::StableDeref;
|
||||
|
||||
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
|
||||
/// this data as a slice.
|
||||
|
||||
@@ -200,10 +200,7 @@ impl InvertedIndexReader {
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) async fn get_term_info_async(
|
||||
&self,
|
||||
term: &Term,
|
||||
) -> crate::AsyncIoResult<Option<TermInfo>> {
|
||||
pub(crate) async fn get_term_info_async(&self, term: &Term) -> io::Result<Option<TermInfo>> {
|
||||
self.termdict.get_async(term.value_bytes()).await
|
||||
}
|
||||
|
||||
@@ -211,12 +208,8 @@ impl InvertedIndexReader {
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most users should prefer using [`Self::read_postings()`] instead.
|
||||
pub async fn warm_postings(
|
||||
&self,
|
||||
term: &Term,
|
||||
with_positions: bool,
|
||||
) -> crate::AsyncIoResult<()> {
|
||||
let term_info_opt = self.get_term_info_async(term).await?;
|
||||
pub async fn warm_postings(&self, term: &Term, with_positions: bool) -> io::Result<()> {
|
||||
let term_info_opt: Option<TermInfo> = self.get_term_info_async(term).await?;
|
||||
if let Some(term_info) = term_info_opt {
|
||||
self.postings_file_slice
|
||||
.read_bytes_slice_async(term_info.postings_range.clone())
|
||||
@@ -234,7 +227,7 @@ impl InvertedIndexReader {
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// If you know which terms to pre-load, prefer using [`Self::warm_postings`] instead.
|
||||
pub async fn warm_postings_full(&self, with_positions: bool) -> crate::AsyncIoResult<()> {
|
||||
pub async fn warm_postings_full(&self, with_positions: bool) -> io::Result<()> {
|
||||
self.postings_file_slice.read_bytes_async().await?;
|
||||
if with_positions {
|
||||
self.positions_file_slice.read_bytes_async().await?;
|
||||
@@ -243,7 +236,7 @@ impl InvertedIndexReader {
|
||||
}
|
||||
|
||||
/// Returns the number of documents containing the term asynchronously.
|
||||
pub async fn doc_freq_async(&self, term: &Term) -> crate::AsyncIoResult<u32> {
|
||||
pub async fn doc_freq_async(&self, term: &Term) -> io::Result<u32> {
|
||||
Ok(self
|
||||
.get_term_info_async(term)
|
||||
.await?
|
||||
|
||||
@@ -5,7 +5,6 @@ mod mmap_directory;
|
||||
|
||||
mod directory;
|
||||
mod directory_lock;
|
||||
mod file_slice;
|
||||
mod file_watcher;
|
||||
mod footer;
|
||||
mod managed_directory;
|
||||
@@ -20,13 +19,13 @@ mod composite_file;
|
||||
use std::io::BufWriter;
|
||||
use std::path::PathBuf;
|
||||
|
||||
pub use common::file_slice::{FileHandle, FileSlice};
|
||||
pub use common::{AntiCallToken, TerminatingWrite};
|
||||
pub use ownedbytes::OwnedBytes;
|
||||
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::directory::{Directory, DirectoryClone, DirectoryLock};
|
||||
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
|
||||
pub use self::file_slice::{FileHandle, FileSlice};
|
||||
pub use self::ram_directory::RamDirectory;
|
||||
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
|
||||
|
||||
|
||||
22
src/error.rs
22
src/error.rs
@@ -104,28 +104,6 @@ pub enum TantivyError {
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
#[derive(Error, Debug)]
|
||||
#[doc(hidden)]
|
||||
pub enum AsyncIoError {
|
||||
#[error("io::Error `{0}`")]
|
||||
Io(#[from] io::Error),
|
||||
#[error("Asynchronous API is unsupported by this directory")]
|
||||
AsyncUnsupported,
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
impl From<AsyncIoError> for TantivyError {
|
||||
fn from(async_io_err: AsyncIoError) -> Self {
|
||||
match async_io_err {
|
||||
AsyncIoError::Io(io_err) => TantivyError::from(io_err),
|
||||
AsyncIoError::AsyncUnsupported => {
|
||||
TantivyError::SystemError(format!("{:?}", async_io_err))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<io::Error> for TantivyError {
|
||||
fn from(io_err: io::Error) -> TantivyError {
|
||||
TantivyError::IoError(Arc::new(io_err))
|
||||
|
||||
@@ -259,10 +259,6 @@ pub use crate::future_result::FutureResult;
|
||||
/// and instead, refer to this as `crate::Result<T>`.
|
||||
pub type Result<T> = std::result::Result<T, TantivyError>;
|
||||
|
||||
/// Result for an Async io operation.
|
||||
#[cfg(feature = "quickwit")]
|
||||
pub type AsyncIoResult<T> = std::result::Result<T, crate::error::AsyncIoError>;
|
||||
|
||||
mod core;
|
||||
mod indexer;
|
||||
|
||||
|
||||
@@ -213,21 +213,21 @@ impl<'a> FieldSerializer<'a> {
|
||||
fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
|
||||
Err(io::Error::new(io::ErrorKind::Other, format!("{:?}", msg)))
|
||||
});
|
||||
if self.term_open {
|
||||
self.postings_serializer
|
||||
.close_term(self.current_term_info.doc_freq)?;
|
||||
self.current_term_info.postings_range.end =
|
||||
self.postings_serializer.written_bytes() as usize;
|
||||
|
||||
if let Some(positions_serializer) = self.positions_serializer_opt.as_mut() {
|
||||
positions_serializer.close_term()?;
|
||||
self.current_term_info.positions_range.end =
|
||||
positions_serializer.written_bytes() as usize;
|
||||
}
|
||||
self.term_dictionary_builder
|
||||
.insert_value(&self.current_term_info)?;
|
||||
self.term_open = false;
|
||||
if !self.term_open {
|
||||
return Ok(());
|
||||
}
|
||||
self.postings_serializer
|
||||
.close_term(self.current_term_info.doc_freq)?;
|
||||
self.current_term_info.postings_range.end =
|
||||
self.postings_serializer.written_bytes() as usize;
|
||||
if let Some(positions_serializer) = self.positions_serializer_opt.as_mut() {
|
||||
positions_serializer.close_term()?;
|
||||
self.current_term_info.positions_range.end =
|
||||
positions_serializer.written_bytes() as usize;
|
||||
}
|
||||
self.term_dictionary_builder
|
||||
.insert_value(&self.current_term_info)?;
|
||||
self.term_open = false;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ impl VecCursor {
|
||||
}
|
||||
#[inline]
|
||||
fn current(&self) -> Option<u32> {
|
||||
self.docs.get(self.current_pos).map(|el| *el)
|
||||
self.docs.get(self.current_pos).copied()
|
||||
}
|
||||
fn get_cleared_data(&mut self) -> &mut Vec<u32> {
|
||||
self.docs.clear();
|
||||
@@ -282,7 +282,6 @@ impl DocSet for IpRangeDocSet {
|
||||
fn doc(&self) -> DocId {
|
||||
self.loaded_docs
|
||||
.current()
|
||||
.map(|el| el)
|
||||
.unwrap_or(TERMINATED)
|
||||
}
|
||||
|
||||
|
||||
@@ -319,7 +319,7 @@ impl StoreReader {
|
||||
/// In most cases use [`get_async`](Self::get_async)
|
||||
///
|
||||
/// Loads and decompresses a block asynchronously.
|
||||
async fn read_block_async(&self, checkpoint: &Checkpoint) -> crate::AsyncIoResult<Block> {
|
||||
async fn read_block_async(&self, checkpoint: &Checkpoint) -> io::Result<Block> {
|
||||
let cache_key = checkpoint.byte_range.start;
|
||||
if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) {
|
||||
return Ok(block);
|
||||
|
||||
@@ -121,7 +121,7 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
|
||||
}
|
||||
|
||||
impl TermInfoStore {
|
||||
pub fn open(term_info_store_file: FileSlice) -> crate::Result<TermInfoStore> {
|
||||
pub fn open(term_info_store_file: FileSlice) -> io::Result<TermInfoStore> {
|
||||
let (len_slice, main_slice) = term_info_store_file.split(16);
|
||||
let mut bytes = len_slice.read_bytes()?;
|
||||
let len = u64::deserialize(&mut bytes)? as usize;
|
||||
|
||||
@@ -8,7 +8,6 @@ use tantivy_fst::Automaton;
|
||||
use super::term_info_store::{TermInfoStore, TermInfoStoreWriter};
|
||||
use super::{TermStreamer, TermStreamerBuilder};
|
||||
use crate::directory::{FileSlice, OwnedBytes};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::termdict::TermOrdinal;
|
||||
|
||||
@@ -55,7 +54,7 @@ where W: Write
|
||||
/// to insert_key and insert_value.
|
||||
///
|
||||
/// Prefer using `.insert(key, value)`
|
||||
pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
|
||||
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
|
||||
self.fst_builder
|
||||
.insert(key, self.term_ord)
|
||||
.map_err(convert_fst_error)?;
|
||||
@@ -66,7 +65,7 @@ where W: Write
|
||||
/// # Warning
|
||||
///
|
||||
/// Horribly dangerous internal API. See `.insert_key(...)`.
|
||||
pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> {
|
||||
pub fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> {
|
||||
self.term_info_store_writer.write_term_info(term_info)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -86,10 +85,14 @@ where W: Write
|
||||
}
|
||||
}
|
||||
|
||||
fn open_fst_index(fst_file: FileSlice) -> crate::Result<tantivy_fst::Map<OwnedBytes>> {
|
||||
fn open_fst_index(fst_file: FileSlice) -> io::Result<tantivy_fst::Map<OwnedBytes>> {
|
||||
let bytes = fst_file.read_bytes()?;
|
||||
let fst = Fst::new(bytes)
|
||||
.map_err(|err| DataCorruption::comment_only(format!("Fst data is corrupted: {:?}", err)))?;
|
||||
let fst = Fst::new(bytes).map_err(|err| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!("Fst data is corrupted: {:?}", err),
|
||||
)
|
||||
})?;
|
||||
Ok(tantivy_fst::Map::from(fst))
|
||||
}
|
||||
|
||||
@@ -114,7 +117,7 @@ pub struct TermDictionary {
|
||||
|
||||
impl TermDictionary {
|
||||
/// Opens a `TermDictionary`.
|
||||
pub fn open(file: FileSlice) -> crate::Result<Self> {
|
||||
pub fn open(file: FileSlice) -> io::Result<Self> {
|
||||
let (main_slice, footer_len_slice) = file.split_from_end(8);
|
||||
let mut footer_len_bytes = footer_len_slice.read_bytes()?;
|
||||
let footer_size = u64::deserialize(&mut footer_len_bytes)?;
|
||||
|
||||
@@ -1,26 +1,28 @@
|
||||
use std::io;
|
||||
|
||||
mod merger;
|
||||
mod streamer;
|
||||
mod termdict;
|
||||
|
||||
use std::iter::ExactSizeIterator;
|
||||
|
||||
use common::VInt;
|
||||
use sstable::value::{ValueReader, ValueWriter};
|
||||
use sstable::{BlockReader, SSTable};
|
||||
use sstable::SSTable;
|
||||
use tantivy_fst::automaton::AlwaysMatch;
|
||||
|
||||
pub use self::merger::TermMerger;
|
||||
pub use self::streamer::{TermStreamer, TermStreamerBuilder};
|
||||
pub use self::termdict::{TermDictionary, TermDictionaryBuilder};
|
||||
use crate::postings::TermInfo;
|
||||
|
||||
pub type TermDictionary = sstable::Dictionary<TermSSTable>;
|
||||
pub type TermDictionaryBuilder<W> = sstable::Writer<W, TermInfoWriter>;
|
||||
pub type TermStreamer<'a, A = AlwaysMatch> = sstable::Streamer<'a, TermSSTable, A>;
|
||||
|
||||
pub struct TermSSTable;
|
||||
|
||||
impl SSTable for TermSSTable {
|
||||
type Value = TermInfo;
|
||||
type Reader = TermInfoReader;
|
||||
type Writer = TermInfoWriter;
|
||||
type ValueReader = TermInfoReader;
|
||||
type ValueWriter = TermInfoWriter;
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -35,15 +37,16 @@ impl ValueReader for TermInfoReader {
|
||||
&self.term_infos[idx]
|
||||
}
|
||||
|
||||
fn read(&mut self, reader: &mut BlockReader) -> io::Result<()> {
|
||||
fn load(&mut self, mut data: &[u8]) -> io::Result<usize> {
|
||||
let len_before = data.len();
|
||||
self.term_infos.clear();
|
||||
let num_els = VInt::deserialize_u64(reader)?;
|
||||
let mut postings_start = VInt::deserialize_u64(reader)? as usize;
|
||||
let mut positions_start = VInt::deserialize_u64(reader)? as usize;
|
||||
let num_els = VInt::deserialize_u64(&mut data)?;
|
||||
let mut postings_start = VInt::deserialize_u64(&mut data)? as usize;
|
||||
let mut positions_start = VInt::deserialize_u64(&mut data)? as usize;
|
||||
for _ in 0..num_els {
|
||||
let doc_freq = VInt::deserialize_u64(reader)? as u32;
|
||||
let postings_num_bytes = VInt::deserialize_u64(reader)?;
|
||||
let positions_num_bytes = VInt::deserialize_u64(reader)?;
|
||||
let doc_freq = VInt::deserialize_u64(&mut data)? as u32;
|
||||
let postings_num_bytes = VInt::deserialize_u64(&mut data)?;
|
||||
let positions_num_bytes = VInt::deserialize_u64(&mut data)?;
|
||||
let postings_end = postings_start + postings_num_bytes as usize;
|
||||
let positions_end = positions_start + positions_num_bytes as usize;
|
||||
let term_info = TermInfo {
|
||||
@@ -55,7 +58,8 @@ impl ValueReader for TermInfoReader {
|
||||
postings_start = postings_end;
|
||||
positions_start = positions_end;
|
||||
}
|
||||
Ok(())
|
||||
let consumed_len = len_before - data.len();
|
||||
Ok(consumed_len)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,7 +75,7 @@ impl ValueWriter for TermInfoWriter {
|
||||
self.term_infos.push(term_info.clone());
|
||||
}
|
||||
|
||||
fn write_block(&mut self, buffer: &mut Vec<u8>) {
|
||||
fn serialize_block(&mut self, buffer: &mut Vec<u8>) {
|
||||
VInt(self.term_infos.len() as u64).serialize_into_vec(buffer);
|
||||
if self.term_infos.is_empty() {
|
||||
return;
|
||||
@@ -89,17 +93,13 @@ impl ValueWriter for TermInfoWriter {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io;
|
||||
|
||||
use sstable::value::{ValueReader, ValueWriter};
|
||||
|
||||
use super::BlockReader;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::termdict::sstable_termdict::TermInfoReader;
|
||||
|
||||
#[test]
|
||||
fn test_block_terminfos() -> io::Result<()> {
|
||||
fn test_block_terminfos() {
|
||||
let mut term_info_writer = super::TermInfoWriter::default();
|
||||
term_info_writer.write(&TermInfo {
|
||||
doc_freq: 120u32,
|
||||
@@ -117,10 +117,10 @@ mod tests {
|
||||
positions_range: 1100..1302,
|
||||
});
|
||||
let mut buffer = Vec::new();
|
||||
term_info_writer.write_block(&mut buffer);
|
||||
let mut block_reader = make_block_reader(&buffer[..]);
|
||||
term_info_writer.serialize_block(&mut buffer);
|
||||
// let mut block_reader = make_block_reader(&buffer[..]);
|
||||
let mut term_info_reader = TermInfoReader::default();
|
||||
term_info_reader.read(&mut block_reader)?;
|
||||
let num_bytes: usize = term_info_reader.load(&buffer[..]).unwrap();
|
||||
assert_eq!(
|
||||
term_info_reader.value(0),
|
||||
&TermInfo {
|
||||
@@ -129,16 +129,6 @@ mod tests {
|
||||
positions_range: 10..122
|
||||
}
|
||||
);
|
||||
assert!(block_reader.buffer().is_empty());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn make_block_reader(data: &[u8]) -> BlockReader {
|
||||
let mut buffer = (data.len() as u32).to_le_bytes().to_vec();
|
||||
buffer.extend_from_slice(data);
|
||||
let owned_bytes = OwnedBytes::new(buffer);
|
||||
let mut block_reader = BlockReader::new(Box::new(owned_bytes));
|
||||
block_reader.read_block().unwrap();
|
||||
block_reader
|
||||
assert_eq!(buffer.len(), num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,256 +1,11 @@
|
||||
use std::io;
|
||||
use std::sync::Arc;
|
||||
use sstable::SSTable;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use once_cell::sync::Lazy;
|
||||
use sstable::{BlockAddr, DeltaReader, Reader, SSTable, SSTableIndex, Writer};
|
||||
use tantivy_fst::automaton::AlwaysMatch;
|
||||
use tantivy_fst::Automaton;
|
||||
|
||||
use crate::directory::{FileSlice, OwnedBytes};
|
||||
use crate::postings::TermInfo;
|
||||
use crate::termdict::sstable_termdict::{
|
||||
TermInfoReader, TermInfoWriter, TermSSTable, TermStreamer, TermStreamerBuilder,
|
||||
};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::AsyncIoResult;
|
||||
use crate::termdict::sstable_termdict::{TermInfoReader, TermInfoWriter};
|
||||
|
||||
pub struct TermInfoSSTable;
|
||||
impl SSTable for TermInfoSSTable {
|
||||
type Value = TermInfo;
|
||||
type Reader = TermInfoReader;
|
||||
type Writer = TermInfoWriter;
|
||||
}
|
||||
|
||||
/// Builder for the new term dictionary.
|
||||
pub struct TermDictionaryBuilder<W: io::Write> {
|
||||
sstable_writer: Writer<W, TermInfoWriter>,
|
||||
}
|
||||
|
||||
impl<W: io::Write> TermDictionaryBuilder<W> {
|
||||
/// Creates a new `TermDictionaryBuilder`
|
||||
pub fn create(w: W) -> io::Result<Self> {
|
||||
let sstable_writer = TermSSTable::writer(w);
|
||||
Ok(TermDictionaryBuilder { sstable_writer })
|
||||
}
|
||||
|
||||
/// Inserts a `(key, value)` pair in the term dictionary.
|
||||
///
|
||||
/// *Keys have to be inserted in order.*
|
||||
pub fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> {
|
||||
let key = key_ref.as_ref();
|
||||
self.insert_key(key)?;
|
||||
self.insert_value(value)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// # Warning
|
||||
/// Horribly dangerous internal API
|
||||
///
|
||||
/// If used, it must be used by systematically alternating calls
|
||||
/// to insert_key and insert_value.
|
||||
///
|
||||
/// Prefer using `.insert(key, value)`
|
||||
#[allow(clippy::unnecessary_wraps)]
|
||||
pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
|
||||
self.sstable_writer.write_key(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// # Warning
|
||||
///
|
||||
/// Horribly dangerous internal API. See `.insert_key(...)`.
|
||||
pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> {
|
||||
self.sstable_writer.write_value(term_info)
|
||||
}
|
||||
|
||||
/// Finalize writing the builder, and returns the underlying
|
||||
/// `Write` object.
|
||||
pub fn finish(self) -> io::Result<W> {
|
||||
self.sstable_writer.finalize()
|
||||
}
|
||||
}
|
||||
|
||||
static EMPTY_TERM_DICT_FILE: Lazy<FileSlice> = Lazy::new(|| {
|
||||
let term_dictionary_data: Vec<u8> = TermDictionaryBuilder::create(Vec::<u8>::new())
|
||||
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
|
||||
.finish()
|
||||
.expect("Writing in a Vec<u8> should never fail");
|
||||
FileSlice::from(term_dictionary_data)
|
||||
});
|
||||
|
||||
/// The term dictionary contains all of the terms in
|
||||
/// `tantivy index` in a sorted manner.
|
||||
///
|
||||
/// The `Fst` crate is used to associate terms to their
|
||||
/// respective `TermOrdinal`. The `TermInfoStore` then makes it
|
||||
/// possible to fetch the associated `TermInfo`.
|
||||
pub struct TermDictionary {
|
||||
sstable_slice: FileSlice,
|
||||
sstable_index: SSTableIndex,
|
||||
num_terms: u64,
|
||||
}
|
||||
|
||||
impl TermDictionary {
|
||||
pub(crate) fn sstable_reader(&self) -> io::Result<Reader<'static, TermInfoReader>> {
|
||||
let data = self.sstable_slice.read_bytes()?;
|
||||
Ok(TermInfoSSTable::reader(data))
|
||||
}
|
||||
|
||||
pub(crate) fn sstable_reader_block(
|
||||
&self,
|
||||
block_addr: BlockAddr,
|
||||
) -> io::Result<Reader<'static, TermInfoReader>> {
|
||||
let data = self.sstable_slice.read_bytes_slice(block_addr.byte_range)?;
|
||||
Ok(TermInfoSSTable::reader(data))
|
||||
}
|
||||
|
||||
pub(crate) async fn sstable_reader_block_async(
|
||||
&self,
|
||||
block_addr: BlockAddr,
|
||||
) -> AsyncIoResult<Reader<'static, TermInfoReader>> {
|
||||
let data = self
|
||||
.sstable_slice
|
||||
.read_bytes_slice_async(block_addr.byte_range)
|
||||
.await?;
|
||||
Ok(TermInfoSSTable::reader(data))
|
||||
}
|
||||
|
||||
pub(crate) fn sstable_delta_reader(&self) -> io::Result<DeltaReader<'static, TermInfoReader>> {
|
||||
let data = self.sstable_slice.read_bytes()?;
|
||||
Ok(TermInfoSSTable::delta_reader(data))
|
||||
}
|
||||
|
||||
/// Opens a `TermDictionary`.
|
||||
pub fn open(term_dictionary_file: FileSlice) -> crate::Result<Self> {
|
||||
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(16);
|
||||
let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;
|
||||
let index_offset = u64::deserialize(&mut footer_len_bytes)?;
|
||||
let num_terms = u64::deserialize(&mut footer_len_bytes)?;
|
||||
let (sstable_slice, index_slice) = main_slice.split(index_offset as usize);
|
||||
let sstable_index_bytes = index_slice.read_bytes()?;
|
||||
let sstable_index = SSTableIndex::load(sstable_index_bytes.as_slice())
|
||||
.map_err(|_| crate::error::DataCorruption::comment_only("SSTable corruption"))?;
|
||||
Ok(TermDictionary {
|
||||
sstable_slice,
|
||||
sstable_index,
|
||||
num_terms,
|
||||
})
|
||||
}
|
||||
|
||||
/// Creates a term dictionary from the supplied bytes.
|
||||
pub fn from_bytes(owned_bytes: OwnedBytes) -> crate::Result<TermDictionary> {
|
||||
TermDictionary::open(FileSlice::new(Arc::new(owned_bytes)))
|
||||
}
|
||||
|
||||
/// Creates an empty term dictionary which contains no terms.
|
||||
pub fn empty() -> Self {
|
||||
TermDictionary::open(EMPTY_TERM_DICT_FILE.clone()).unwrap()
|
||||
}
|
||||
|
||||
/// Returns the number of terms in the dictionary.
|
||||
/// Term ordinals range from 0 to `num_terms() - 1`.
|
||||
pub fn num_terms(&self) -> usize {
|
||||
self.num_terms as usize
|
||||
}
|
||||
|
||||
/// Returns the ordinal associated with a given term.
|
||||
pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermOrdinal>> {
|
||||
let mut term_ord = 0u64;
|
||||
let key_bytes = key.as_ref();
|
||||
let mut sstable_reader = self.sstable_reader()?;
|
||||
while sstable_reader.advance().unwrap_or(false) {
|
||||
if sstable_reader.key() == key_bytes {
|
||||
return Ok(Some(term_ord));
|
||||
}
|
||||
term_ord += 1;
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Returns the term associated with a given term ordinal.
|
||||
///
|
||||
/// Term ordinals are defined as the position of the term in
|
||||
/// the sorted list of terms.
|
||||
///
|
||||
/// Returns true if and only if the term has been found.
|
||||
///
|
||||
/// Regardless of whether the term is found or not,
|
||||
/// the buffer may be modified.
|
||||
pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
|
||||
let mut sstable_reader = self.sstable_reader()?;
|
||||
bytes.clear();
|
||||
for _ in 0..(ord + 1) {
|
||||
if !sstable_reader.advance().unwrap_or(false) {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
bytes.extend_from_slice(sstable_reader.key());
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Returns the number of terms in the dictionary.
|
||||
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<TermInfo> {
|
||||
let mut sstable_reader = self.sstable_reader()?;
|
||||
for _ in 0..(term_ord + 1) {
|
||||
if !sstable_reader.advance().unwrap_or(false) {
|
||||
return Ok(TermInfo::default());
|
||||
}
|
||||
}
|
||||
Ok(sstable_reader.value().clone())
|
||||
}
|
||||
|
||||
/// Lookups the value corresponding to the key.
|
||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermInfo>> {
|
||||
if let Some(block_addr) = self.sstable_index.search(key.as_ref()) {
|
||||
let mut sstable_reader = self.sstable_reader_block(block_addr)?;
|
||||
let key_bytes = key.as_ref();
|
||||
while sstable_reader.advance().unwrap_or(false) {
|
||||
if sstable_reader.key() == key_bytes {
|
||||
let term_info = sstable_reader.value().clone();
|
||||
return Ok(Some(term_info));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Lookups the value corresponding to the key.
|
||||
pub async fn get_async<K: AsRef<[u8]>>(&self, key: K) -> AsyncIoResult<Option<TermInfo>> {
|
||||
if let Some(block_addr) = self.sstable_index.search(key.as_ref()) {
|
||||
let mut sstable_reader = self.sstable_reader_block_async(block_addr).await?;
|
||||
let key_bytes = key.as_ref();
|
||||
while sstable_reader.advance().unwrap_or(false) {
|
||||
if sstable_reader.key() == key_bytes {
|
||||
let term_info = sstable_reader.value().clone();
|
||||
return Ok(Some(term_info));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Returns a range builder, to stream all of the terms
|
||||
/// within an interval.
|
||||
pub fn range(&self) -> TermStreamerBuilder<'_> {
|
||||
TermStreamerBuilder::new(self, AlwaysMatch)
|
||||
}
|
||||
|
||||
/// A stream of all the sorted terms.
|
||||
pub fn stream(&self) -> io::Result<TermStreamer<'_>> {
|
||||
self.range().into_stream()
|
||||
}
|
||||
|
||||
/// Returns a search builder, to stream all of the terms
|
||||
/// within the Automaton
|
||||
pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A>
|
||||
where A::State: Clone {
|
||||
TermStreamerBuilder::<A>::new(self, automaton)
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub async fn warm_up_dictionary(&self) -> AsyncIoResult<()> {
|
||||
self.sstable_slice.read_bytes_async().await?;
|
||||
Ok(())
|
||||
}
|
||||
type ValueReader = TermInfoReader;
|
||||
type ValueWriter = TermInfoWriter;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::path::PathBuf;
|
||||
use std::str;
|
||||
use std::{io, str};
|
||||
|
||||
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
|
||||
use crate::directory::{Directory, FileSlice, RamDirectory, TerminatingWrite};
|
||||
@@ -247,7 +247,7 @@ fn test_empty_string() -> crate::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn stream_range_test_dict() -> crate::Result<TermDictionary> {
|
||||
fn stream_range_test_dict() -> io::Result<TermDictionary> {
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?;
|
||||
for i in 0u8..10u8 {
|
||||
|
||||
@@ -6,8 +6,8 @@ edition = "2021"
|
||||
[dependencies]
|
||||
common = {path="../common", package="tantivy-common"}
|
||||
ciborium = "0.2"
|
||||
byteorder = "1"
|
||||
serde = "1"
|
||||
tantivy-fst = "0.4"
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1"
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
use std::io::{self, Read};
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt};
|
||||
use std::io;
|
||||
|
||||
pub struct BlockReader<'a> {
|
||||
buffer: Vec<u8>,
|
||||
@@ -8,6 +6,13 @@ pub struct BlockReader<'a> {
|
||||
offset: usize,
|
||||
}
|
||||
|
||||
/// Reads exactly four bytes from `read` and decodes them as a
/// little-endian `u32`.
///
/// # Errors
///
/// Returns the error of `read_exact`, e.g. `UnexpectedEof` when fewer than
/// four bytes are available.
#[inline]
fn read_u32(read: &mut dyn io::Read) -> io::Result<u32> {
    let mut le_bytes = [0u8; std::mem::size_of::<u32>()];
    match read.read_exact(&mut le_bytes) {
        Ok(()) => Ok(u32::from_le_bytes(le_bytes)),
        Err(err) => Err(err),
    }
}
|
||||
|
||||
impl<'a> BlockReader<'a> {
|
||||
pub fn new(reader: Box<dyn io::Read + 'a>) -> BlockReader<'a> {
|
||||
BlockReader {
|
||||
@@ -30,7 +35,7 @@ impl<'a> BlockReader<'a> {
|
||||
|
||||
pub fn read_block(&mut self) -> io::Result<bool> {
|
||||
self.offset = 0;
|
||||
let block_len_res = self.reader.read_u32::<LittleEndian>();
|
||||
let block_len_res = read_u32(self.reader.as_mut());
|
||||
if let Err(err) = &block_len_res {
|
||||
if err.kind() == io::ErrorKind::UnexpectedEof {
|
||||
return Ok(false);
|
||||
|
||||
@@ -44,7 +44,7 @@ where
|
||||
let start_offset = self.write.written_bytes() as usize;
|
||||
// TODO avoid buffer allocation
|
||||
let mut buffer = Vec::new();
|
||||
self.value_writer.write_block(&mut buffer);
|
||||
self.value_writer.serialize_block(&mut buffer);
|
||||
let block_len = buffer.len() + self.block.len();
|
||||
self.write.write_all(&(block_len as u32).to_le_bytes())?;
|
||||
self.write.write_all(&buffer[..])?;
|
||||
@@ -84,7 +84,7 @@ where
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
pub fn finalize(self) -> CountingWriter<BufWriter<W>> {
|
||||
pub fn finish(self) -> CountingWriter<BufWriter<W>> {
|
||||
self.write
|
||||
}
|
||||
}
|
||||
@@ -112,6 +112,10 @@ where TValueReader: value::ValueReader
|
||||
}
|
||||
}
|
||||
|
||||
pub fn empty() -> Self {
|
||||
DeltaReader::new(&b""[..])
|
||||
}
|
||||
|
||||
fn deserialize_vint(&mut self) -> u64 {
|
||||
self.block_reader.deserialize_u64()
|
||||
}
|
||||
@@ -156,7 +160,8 @@ where TValueReader: value::ValueReader
|
||||
if !self.block_reader.read_block()? {
|
||||
return Ok(false);
|
||||
}
|
||||
self.value_reader.read(&mut self.block_reader)?;
|
||||
let consumed_len = self.value_reader.load(self.block_reader.buffer())?;
|
||||
self.block_reader.advance(consumed_len);
|
||||
self.idx = 0;
|
||||
} else {
|
||||
self.idx += 1;
|
||||
@@ -180,3 +185,15 @@ where TValueReader: value::ValueReader
|
||||
self.value_reader.value(self.idx)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::DeltaReader;
    use crate::value::U64MonotonicReader;

    /// An empty delta reader must report that nothing can be read.
    #[test]
    fn test_empty() {
        let mut empty_reader = DeltaReader::<U64MonotonicReader>::empty();
        let has_entry = empty_reader.advance().unwrap();
        assert!(!has_entry);
    }
}
|
||||
|
||||
231
sstable/src/dictionary.rs
Normal file
231
sstable/src/dictionary.rs
Normal file
@@ -0,0 +1,231 @@
|
||||
use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::{Bound, RangeBounds};
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::file_slice::FileSlice;
|
||||
use common::{BinarySerializable, OwnedBytes};
|
||||
use tantivy_fst::automaton::AlwaysMatch;
|
||||
use tantivy_fst::Automaton;
|
||||
|
||||
use crate::streamer::{Streamer, StreamerBuilder};
|
||||
use crate::{BlockAddr, DeltaReader, Reader, SSTable, SSTableIndex, TermOrdinal};
|
||||
|
||||
/// The term dictionary contains all of the terms in
|
||||
/// `tantivy index` in a sorted manner.
|
||||
///
|
||||
/// The keys and their values (`TSSTable::Value`) are kept in an sstable
|
||||
/// (`sstable_slice`) together with an index over that sstable
|
||||
/// (`sstable_index`); both are extracted from the file by `open`.
|
||||
pub struct Dictionary<TSSTable: SSTable> {
|
||||
// Bytes of the sstable proper (everything before the index offset).
pub sstable_slice: FileSlice,
|
||||
// Index loaded from the bytes located after the sstable.
pub sstable_index: SSTableIndex,
|
||||
// Number of terms, as read from the file footer.
num_terms: u64,
|
||||
// Ties the dictionary to its sstable flavor without storing a value of it.
phantom_data: PhantomData<TSSTable>,
|
||||
}
|
||||
|
||||
impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
pub fn builder<W: io::Write>(wrt: W) -> io::Result<crate::Writer<W, TSSTable::ValueWriter>> {
|
||||
Ok(TSSTable::writer(wrt))
|
||||
}
|
||||
|
||||
pub(crate) fn sstable_reader(&self) -> io::Result<Reader<'static, TSSTable::ValueReader>> {
|
||||
let data = self.sstable_slice.read_bytes()?;
|
||||
Ok(TSSTable::reader(data))
|
||||
}
|
||||
|
||||
pub(crate) fn sstable_reader_block(
|
||||
&self,
|
||||
block_addr: BlockAddr,
|
||||
) -> io::Result<Reader<'static, TSSTable::ValueReader>> {
|
||||
let data = self.sstable_slice.read_bytes_slice(block_addr.byte_range)?;
|
||||
Ok(TSSTable::reader(data))
|
||||
}
|
||||
|
||||
pub(crate) async fn sstable_reader_block_async(
|
||||
&self,
|
||||
block_addr: BlockAddr,
|
||||
) -> io::Result<Reader<'static, TSSTable::ValueReader>> {
|
||||
let data = self
|
||||
.sstable_slice
|
||||
.read_bytes_slice_async(block_addr.byte_range)
|
||||
.await?;
|
||||
Ok(TSSTable::reader(data))
|
||||
}
|
||||
|
||||
pub(crate) fn sstable_delta_reader_for_key_range(
|
||||
&self,
|
||||
key_range: impl RangeBounds<[u8]>,
|
||||
) -> io::Result<DeltaReader<'static, TSSTable::ValueReader>> {
|
||||
let slice = self.file_slice_for_range(key_range);
|
||||
let data = slice.read_bytes()?;
|
||||
Ok(TSSTable::delta_reader(data))
|
||||
}
|
||||
|
||||
fn file_slice_for_range(&self, key_range: impl RangeBounds<[u8]>) -> FileSlice {
|
||||
let start_bound: Bound<usize> = match key_range.start_bound() {
|
||||
Bound::Included(key) | Bound::Excluded(key) => {
|
||||
let Some(first_block_addr) = self.sstable_index.search_block(key) else {
|
||||
return FileSlice::empty();
|
||||
};
|
||||
Bound::Included(first_block_addr.byte_range.start)
|
||||
}
|
||||
Bound::Unbounded => Bound::Unbounded,
|
||||
};
|
||||
let end_bound: Bound<usize> = match key_range.end_bound() {
|
||||
Bound::Included(key) | Bound::Excluded(key) => {
|
||||
if let Some(block_addr) = self.sstable_index.search_block(key) {
|
||||
Bound::Excluded(block_addr.byte_range.end)
|
||||
} else {
|
||||
Bound::Unbounded
|
||||
}
|
||||
}
|
||||
Bound::Unbounded => Bound::Unbounded,
|
||||
};
|
||||
self.sstable_slice.slice((start_bound, end_bound))
|
||||
}
|
||||
|
||||
/// Opens a `TermDictionary`.
|
||||
pub fn open(term_dictionary_file: FileSlice) -> io::Result<Self> {
|
||||
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(16);
|
||||
let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;
|
||||
let index_offset = u64::deserialize(&mut footer_len_bytes)?;
|
||||
let num_terms = u64::deserialize(&mut footer_len_bytes)?;
|
||||
let (sstable_slice, index_slice) = main_slice.split(index_offset as usize);
|
||||
let sstable_index_bytes = index_slice.read_bytes()?;
|
||||
let sstable_index = SSTableIndex::load(sstable_index_bytes.as_slice())
|
||||
.map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption"))?;
|
||||
Ok(Dictionary {
|
||||
sstable_slice,
|
||||
sstable_index,
|
||||
num_terms,
|
||||
phantom_data: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
/// Creates a term dictionary from the supplied bytes.
|
||||
pub fn from_bytes(owned_bytes: OwnedBytes) -> io::Result<Self> {
|
||||
Dictionary::open(FileSlice::new(Arc::new(owned_bytes)))
|
||||
}
|
||||
|
||||
/// Creates an empty term dictionary which contains no terms.
|
||||
pub fn empty() -> Self {
|
||||
let term_dictionary_data: Vec<u8> = Self::builder(Vec::<u8>::new())
|
||||
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
|
||||
.finish()
|
||||
.expect("Writing in a Vec<u8> should never fail");
|
||||
let empty_dict_file = FileSlice::from(term_dictionary_data);
|
||||
Dictionary::open(empty_dict_file).unwrap()
|
||||
}
|
||||
|
||||
/// Returns the number of terms in the dictionary.
|
||||
/// Term ordinals range from 0 to `num_terms() - 1`.
|
||||
pub fn num_terms(&self) -> usize {
|
||||
self.num_terms as usize
|
||||
}
|
||||
|
||||
/// Returns the ordinal associated with a given term.
|
||||
pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermOrdinal>> {
|
||||
let mut term_ord = 0u64;
|
||||
let key_bytes = key.as_ref();
|
||||
let mut sstable_reader = self.sstable_reader()?;
|
||||
while sstable_reader.advance().unwrap_or(false) {
|
||||
if sstable_reader.key() == key_bytes {
|
||||
return Ok(Some(term_ord));
|
||||
}
|
||||
term_ord += 1;
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Returns the term associated with a given term ordinal.
|
||||
///
|
||||
/// Term ordinals are defined as the position of the term in
|
||||
/// the sorted list of terms.
|
||||
///
|
||||
/// Returns true if and only if the term has been found.
|
||||
///
|
||||
/// Regardless of whether the term is found or not,
|
||||
/// the buffer may be modified.
|
||||
pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
|
||||
let mut sstable_reader = self.sstable_reader()?;
|
||||
bytes.clear();
|
||||
for _ in 0..(ord + 1) {
|
||||
if !sstable_reader.advance().unwrap_or(false) {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
bytes.extend_from_slice(sstable_reader.key());
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Returns the number of terms in the dictionary.
|
||||
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<Option<TSSTable::Value>> {
|
||||
let mut sstable_reader = self.sstable_reader()?;
|
||||
for _ in 0..(term_ord + 1) {
|
||||
if !sstable_reader.advance().unwrap_or(false) {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
Ok(Some(sstable_reader.value().clone()))
|
||||
}
|
||||
|
||||
/// Lookups the value corresponding to the key.
|
||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TSSTable::Value>> {
|
||||
if let Some(block_addr) = self.sstable_index.search_block(key.as_ref()) {
|
||||
let mut sstable_reader = self.sstable_reader_block(block_addr)?;
|
||||
let key_bytes = key.as_ref();
|
||||
while sstable_reader.advance().unwrap_or(false) {
|
||||
if sstable_reader.key() == key_bytes {
|
||||
let value = sstable_reader.value().clone();
|
||||
return Ok(Some(value));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Lookups the value corresponding to the key.
|
||||
pub async fn get_async<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TSSTable::Value>> {
|
||||
if let Some(block_addr) = self.sstable_index.search_block(key.as_ref()) {
|
||||
let mut sstable_reader = self.sstable_reader_block_async(block_addr).await?;
|
||||
let key_bytes = key.as_ref();
|
||||
while sstable_reader.advance().unwrap_or(false) {
|
||||
if sstable_reader.key() == key_bytes {
|
||||
let value = sstable_reader.value().clone();
|
||||
return Ok(Some(value));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Returns a range builder, to stream all of the terms
|
||||
/// within an interval.
|
||||
pub fn range(&self) -> StreamerBuilder<'_, TSSTable> {
|
||||
StreamerBuilder::new(self, AlwaysMatch)
|
||||
}
|
||||
|
||||
/// A stream of all the sorted terms.
|
||||
pub fn stream(&self) -> io::Result<Streamer<'_, TSSTable>> {
|
||||
self.range().into_stream()
|
||||
}
|
||||
|
||||
/// Returns a search builder, to stream all of the terms
|
||||
/// within the Automaton
|
||||
pub fn search<'a, A: Automaton + 'a>(
|
||||
&'a self,
|
||||
automaton: A,
|
||||
) -> StreamerBuilder<'a, TSSTable, A>
|
||||
where
|
||||
A::State: Clone,
|
||||
{
|
||||
StreamerBuilder::<TSSTable, A>::new(self, automaton)
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub async fn warm_up_dictionary(&self) -> io::Result<()> {
|
||||
self.sstable_slice.read_bytes_async().await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,21 +1,29 @@
|
||||
use std::io::{self, Write};
|
||||
use std::ops::Range;
|
||||
use std::usize;
|
||||
|
||||
use merge::ValueMerger;
|
||||
|
||||
mod delta;
|
||||
mod dictionary;
|
||||
pub mod merge;
|
||||
mod streamer;
|
||||
pub mod value;
|
||||
|
||||
mod sstable_index;
|
||||
pub use sstable_index::{BlockAddr, SSTableIndex, SSTableIndexBuilder};
|
||||
pub(crate) mod vint;
|
||||
pub use dictionary::Dictionary;
|
||||
pub use streamer::{Streamer, StreamerBuilder};
|
||||
|
||||
mod block_reader;
|
||||
pub use self::block_reader::BlockReader;
|
||||
pub use self::delta::{DeltaReader, DeltaWriter};
|
||||
pub use self::merge::VoidMerge;
|
||||
use self::value::{U64MonotonicReader, U64MonotonicWriter, ValueReader, ValueWriter};
|
||||
use crate::value::{RangeReader, RangeWriter};
|
||||
|
||||
pub type TermOrdinal = u64;
|
||||
|
||||
const DEFAULT_KEY_CAPACITY: usize = 50;
|
||||
|
||||
@@ -31,15 +39,15 @@ fn common_prefix_len(left: &[u8], right: &[u8]) -> usize {
|
||||
pub struct SSTableDataCorruption;
|
||||
|
||||
pub trait SSTable: Sized {
|
||||
type Value;
|
||||
type Reader: ValueReader<Value = Self::Value>;
|
||||
type Writer: ValueWriter<Value = Self::Value>;
|
||||
type Value: Clone;
|
||||
type ValueReader: ValueReader<Value = Self::Value>;
|
||||
type ValueWriter: ValueWriter<Value = Self::Value>;
|
||||
|
||||
fn delta_writer<W: io::Write>(write: W) -> DeltaWriter<W, Self::Writer> {
|
||||
fn delta_writer<W: io::Write>(write: W) -> DeltaWriter<W, Self::ValueWriter> {
|
||||
DeltaWriter::new(write)
|
||||
}
|
||||
|
||||
fn writer<W: io::Write>(write: W) -> Writer<W, Self::Writer> {
|
||||
fn writer<W: io::Write>(write: W) -> Writer<W, Self::ValueWriter> {
|
||||
Writer {
|
||||
previous_key: Vec::with_capacity(DEFAULT_KEY_CAPACITY),
|
||||
num_terms: 0u64,
|
||||
@@ -49,17 +57,22 @@ pub trait SSTable: Sized {
|
||||
}
|
||||
}
|
||||
|
||||
fn delta_reader<'a, R: io::Read + 'a>(reader: R) -> DeltaReader<'a, Self::Reader> {
|
||||
fn delta_reader<'a, R: io::Read + 'a>(reader: R) -> DeltaReader<'a, Self::ValueReader> {
|
||||
DeltaReader::new(reader)
|
||||
}
|
||||
|
||||
fn reader<'a, R: io::Read + 'a>(reader: R) -> Reader<'a, Self::Reader> {
|
||||
fn reader<'a, R: io::Read + 'a>(reader: R) -> Reader<'a, Self::ValueReader> {
|
||||
Reader {
|
||||
key: Vec::with_capacity(DEFAULT_KEY_CAPACITY),
|
||||
delta_reader: Self::delta_reader(reader),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty static reader.
|
||||
fn create_empty_reader() -> Reader<'static, Self::ValueReader> {
|
||||
Self::reader(&b""[..])
|
||||
}
|
||||
|
||||
fn merge<R: io::Read, W: io::Write, M: ValueMerger<Self::Value>>(
|
||||
io_readers: Vec<R>,
|
||||
w: W,
|
||||
@@ -76,8 +89,8 @@ pub struct VoidSSTable;
|
||||
|
||||
impl SSTable for VoidSSTable {
|
||||
type Value = ();
|
||||
type Reader = value::VoidReader;
|
||||
type Writer = value::VoidWriter;
|
||||
type ValueReader = value::VoidReader;
|
||||
type ValueWriter = value::VoidWriter;
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
@@ -86,9 +99,20 @@ pub struct SSTableMonotonicU64;
|
||||
impl SSTable for SSTableMonotonicU64 {
|
||||
type Value = u64;
|
||||
|
||||
type Reader = U64MonotonicReader;
|
||||
type ValueReader = U64MonotonicReader;
|
||||
|
||||
type Writer = U64MonotonicWriter;
|
||||
type ValueWriter = U64MonotonicWriter;
|
||||
}
|
||||
|
||||
/// SSTable flavor whose values are `Range<u64>`, read with `RangeReader`
/// and written with `RangeWriter`.
|
||||
pub struct SSTableRange;
|
||||
|
||||
impl SSTable for SSTableRange {
|
||||
type Value = Range<u64>;
|
||||
|
||||
type ValueReader = RangeReader;
|
||||
|
||||
type ValueWriter = RangeWriter;
|
||||
}
|
||||
|
||||
pub struct Reader<'a, TValueReader> {
|
||||
@@ -141,11 +165,23 @@ where
|
||||
W: io::Write,
|
||||
TValueWriter: value::ValueWriter,
|
||||
{
|
||||
pub fn create(wrt: W) -> io::Result<Self> {
|
||||
Ok(Writer {
|
||||
previous_key: Vec::with_capacity(DEFAULT_KEY_CAPACITY),
|
||||
num_terms: 0u64,
|
||||
index_builder: SSTableIndexBuilder::default(),
|
||||
delta_writer: DeltaWriter::new(wrt),
|
||||
first_ordinal_of_the_block: 0u64,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn current_key(&self) -> &[u8] {
|
||||
&self.previous_key[..]
|
||||
}
|
||||
|
||||
pub fn write_key(&mut self, key: &[u8]) {
|
||||
#[inline]
|
||||
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
|
||||
// If this is the first key in the block, we use it to
|
||||
// shorten the last term in the last block.
|
||||
if self.first_ordinal_of_the_block == self.num_terms {
|
||||
@@ -165,16 +201,22 @@ where
|
||||
self.previous_key.resize(key.len(), 0u8);
|
||||
self.previous_key[keep_len..].copy_from_slice(&key[keep_len..]);
|
||||
self.delta_writer.write_suffix(keep_len, &key[keep_len..]);
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn write(&mut self, key: &[u8], value: &TValueWriter::Value) -> io::Result<()> {
|
||||
self.write_key(key);
|
||||
self.write_value(value)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_value(&mut self, value: &TValueWriter::Value) -> io::Result<()> {
|
||||
#[inline]
|
||||
pub fn insert<K: AsRef<[u8]>>(
|
||||
&mut self,
|
||||
key: K,
|
||||
value: &TValueWriter::Value,
|
||||
) -> io::Result<()> {
|
||||
self.insert_key(key.as_ref())?;
|
||||
self.insert_value(value)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn insert_value(&mut self, value: &TValueWriter::Value) -> io::Result<()> {
|
||||
self.delta_writer.write_value(value);
|
||||
self.num_terms += 1u64;
|
||||
self.flush_block_if_required()
|
||||
@@ -193,7 +235,7 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finalize(mut self) -> io::Result<W> {
|
||||
pub fn finish(mut self) -> io::Result<W> {
|
||||
if let Some(byte_range) = self.delta_writer.flush_block()? {
|
||||
self.index_builder.add_block(
|
||||
&self.previous_key[..],
|
||||
@@ -202,7 +244,7 @@ where
|
||||
);
|
||||
self.first_ordinal_of_the_block = self.num_terms;
|
||||
}
|
||||
let mut wrt = self.delta_writer.finalize();
|
||||
let mut wrt = self.delta_writer.finish();
|
||||
wrt.write_all(&0u32.to_le_bytes())?;
|
||||
|
||||
let offset = wrt.written_bytes();
|
||||
@@ -246,10 +288,10 @@ mod test {
|
||||
let mut buffer = vec![];
|
||||
{
|
||||
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
|
||||
assert!(sstable_writer.write(&long_key[..], &()).is_ok());
|
||||
assert!(sstable_writer.write(&[0, 3, 4], &()).is_ok());
|
||||
assert!(sstable_writer.write(&long_key2[..], &()).is_ok());
|
||||
assert!(sstable_writer.finalize().is_ok());
|
||||
assert!(sstable_writer.insert(&long_key[..], &()).is_ok());
|
||||
assert!(sstable_writer.insert(&[0, 3, 4], &()).is_ok());
|
||||
assert!(sstable_writer.insert(&long_key2[..], &()).is_ok());
|
||||
assert!(sstable_writer.finish().is_ok());
|
||||
}
|
||||
let mut sstable_reader = VoidSSTable::reader(&buffer[..]);
|
||||
assert!(sstable_reader.advance().unwrap());
|
||||
@@ -266,10 +308,10 @@ mod test {
|
||||
let mut buffer = vec![];
|
||||
{
|
||||
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
|
||||
assert!(sstable_writer.write(&[17u8], &()).is_ok());
|
||||
assert!(sstable_writer.write(&[17u8, 18u8, 19u8], &()).is_ok());
|
||||
assert!(sstable_writer.write(&[17u8, 20u8], &()).is_ok());
|
||||
assert!(sstable_writer.finalize().is_ok());
|
||||
assert!(sstable_writer.insert(&[17u8], &()).is_ok());
|
||||
assert!(sstable_writer.insert(&[17u8, 18u8, 19u8], &()).is_ok());
|
||||
assert!(sstable_writer.insert(&[17u8, 20u8], &()).is_ok());
|
||||
assert!(sstable_writer.finish().is_ok());
|
||||
}
|
||||
assert_eq!(
|
||||
&buffer,
|
||||
@@ -304,8 +346,8 @@ mod test {
|
||||
fn test_simple_sstable_non_increasing_key() {
|
||||
let mut buffer = vec![];
|
||||
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
|
||||
assert!(sstable_writer.write(&[17u8], &()).is_ok());
|
||||
assert!(sstable_writer.write(&[16u8], &()).is_ok());
|
||||
assert!(sstable_writer.insert(&[17u8], &()).is_ok());
|
||||
assert!(sstable_writer.insert(&[16u8], &()).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -313,9 +355,9 @@ mod test {
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
let mut writer = VoidSSTable::writer(&mut buffer);
|
||||
writer.write(b"abcd", &()).unwrap();
|
||||
writer.write(b"abe", &()).unwrap();
|
||||
writer.finalize().unwrap();
|
||||
writer.insert(b"abcd", &()).unwrap();
|
||||
writer.insert(b"abe", &()).unwrap();
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
let mut output = Vec::new();
|
||||
assert!(VoidSSTable::merge(vec![&buffer[..], &buffer[..]], &mut output, VoidMerge).is_ok());
|
||||
@@ -327,9 +369,9 @@ mod test {
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
let mut writer = VoidSSTable::writer(&mut buffer);
|
||||
writer.write(b"abcd", &()).unwrap();
|
||||
writer.write(b"abe", &()).unwrap();
|
||||
writer.finalize().unwrap();
|
||||
writer.insert(b"abcd", &()).unwrap();
|
||||
writer.insert(b"abe", &()).unwrap();
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
let mut output = Vec::new();
|
||||
assert!(VoidSSTable::merge(vec![&buffer[..], &buffer[..]], &mut output, VoidMerge).is_ok());
|
||||
@@ -340,10 +382,10 @@ mod test {
|
||||
fn test_sstable_u64() -> io::Result<()> {
|
||||
let mut buffer = Vec::new();
|
||||
let mut writer = SSTableMonotonicU64::writer(&mut buffer);
|
||||
writer.write(b"abcd", &1u64)?;
|
||||
writer.write(b"abe", &4u64)?;
|
||||
writer.write(b"gogo", &4324234234234234u64)?;
|
||||
writer.finalize()?;
|
||||
writer.insert(b"abcd", &1u64)?;
|
||||
writer.insert(b"abe", &4u64)?;
|
||||
writer.insert(b"gogo", &4324234234234234u64)?;
|
||||
writer.finish()?;
|
||||
let mut reader = SSTableMonotonicU64::reader(&buffer[..]);
|
||||
assert!(reader.advance()?);
|
||||
assert_eq!(reader.key(), b"abcd");
|
||||
@@ -357,4 +399,10 @@ mod test {
|
||||
assert!(!reader.advance()?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// A reader created over no data must report that nothing can be read.
#[test]
fn test_sstable_empty() {
    let mut empty_reader = crate::SSTableRange::create_empty_reader();
    let has_entry = empty_reader.advance().unwrap();
    assert!(!has_entry);
}
|
||||
}
|
||||
|
||||
@@ -28,11 +28,11 @@ impl<B: AsRef<[u8]>> PartialEq for HeapItem<B> {
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn merge_sstable<SST: SSTable, W: io::Write, M: ValueMerger<SST::Value>>(
|
||||
readers: Vec<Reader<SST::Reader>>,
|
||||
mut writer: Writer<W, SST::Writer>,
|
||||
readers: Vec<Reader<SST::ValueReader>>,
|
||||
mut writer: Writer<W, SST::ValueWriter>,
|
||||
mut merger: M,
|
||||
) -> io::Result<()> {
|
||||
let mut heap: BinaryHeap<HeapItem<Reader<SST::Reader>>> =
|
||||
let mut heap: BinaryHeap<HeapItem<Reader<SST::ValueReader>>> =
|
||||
BinaryHeap::with_capacity(readers.len());
|
||||
for mut reader in readers {
|
||||
if reader.advance()? {
|
||||
@@ -43,7 +43,7 @@ pub fn merge_sstable<SST: SSTable, W: io::Write, M: ValueMerger<SST::Value>>(
|
||||
let len = heap.len();
|
||||
let mut value_merger;
|
||||
if let Some(mut head) = heap.peek_mut() {
|
||||
writer.write_key(head.0.key());
|
||||
writer.insert_key(head.0.key()).unwrap();
|
||||
value_merger = merger.new_value(head.0.value());
|
||||
if !head.0.advance()? {
|
||||
PeekMut::pop(head);
|
||||
@@ -64,9 +64,9 @@ pub fn merge_sstable<SST: SSTable, W: io::Write, M: ValueMerger<SST::Value>>(
|
||||
break;
|
||||
}
|
||||
let value = value_merger.finish();
|
||||
writer.write_value(&value)?;
|
||||
writer.insert_value(&value)?;
|
||||
writer.flush_block_if_required()?;
|
||||
}
|
||||
writer.finalize()?;
|
||||
writer.finish()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -79,9 +79,9 @@ mod tests {
|
||||
{
|
||||
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
|
||||
for &key in keys {
|
||||
assert!(sstable_writer.write(key.as_bytes(), &()).is_ok());
|
||||
assert!(sstable_writer.insert(key.as_bytes(), &()).is_ok());
|
||||
}
|
||||
assert!(sstable_writer.finalize().is_ok());
|
||||
assert!(sstable_writer.finish().is_ok());
|
||||
}
|
||||
buffer
|
||||
}
|
||||
@@ -91,9 +91,9 @@ mod tests {
|
||||
{
|
||||
let mut sstable_writer = SSTableMonotonicU64::writer(&mut buffer);
|
||||
for (key, val) in keys {
|
||||
assert!(sstable_writer.write(key.as_bytes(), val).is_ok());
|
||||
assert!(sstable_writer.insert(key.as_bytes(), val).is_ok());
|
||||
}
|
||||
assert!(sstable_writer.finalize().is_ok());
|
||||
assert!(sstable_writer.finish().is_ok());
|
||||
}
|
||||
buffer
|
||||
}
|
||||
|
||||
@@ -15,10 +15,17 @@ impl SSTableIndex {
|
||||
ciborium::de::from_reader(data).map_err(|_| SSTableDataCorruption)
|
||||
}
|
||||
|
||||
pub fn search(&self, key: &[u8]) -> Option<BlockAddr> {
|
||||
pub fn search_block(&self, key: &[u8]) -> Option<BlockAddr> {
|
||||
self.search_block_from(key).next()
|
||||
}
|
||||
|
||||
pub fn search_block_from<'key, 'slf: 'key>(
|
||||
&'slf self,
|
||||
key: &'key [u8],
|
||||
) -> impl Iterator<Item = BlockAddr> + Clone + 'key {
|
||||
self.blocks
|
||||
.iter()
|
||||
.find(|block| &block.last_key_or_greater[..] >= key)
|
||||
.skip_while(|block| &block.last_key_or_greater[..] < key)
|
||||
.map(|block| block.block_addr.clone())
|
||||
}
|
||||
}
|
||||
@@ -105,7 +112,7 @@ mod tests {
|
||||
sstable_builder.serialize(&mut buffer).unwrap();
|
||||
let sstable_index = SSTableIndex::load(&buffer[..]).unwrap();
|
||||
assert_eq!(
|
||||
sstable_index.search(b"bbbde"),
|
||||
sstable_index.search_block(b"bbbde"),
|
||||
Some(BlockAddr {
|
||||
first_ordinal: 10u64,
|
||||
byte_range: 30..40
|
||||
|
||||
@@ -4,31 +4,39 @@ use std::ops::Bound;
|
||||
use tantivy_fst::automaton::AlwaysMatch;
|
||||
use tantivy_fst::Automaton;
|
||||
|
||||
use super::TermDictionary;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::termdict::sstable_termdict::TermInfoReader;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::dictionary::Dictionary;
|
||||
use crate::{SSTable, TermOrdinal};
|
||||
|
||||
/// `TermStreamerBuilder` is a helper object used to define
|
||||
/// `StreamerBuilder` is a helper object used to define
|
||||
/// a range of terms that should be streamed.
|
||||
pub struct TermStreamerBuilder<'a, A = AlwaysMatch>
|
||||
pub struct StreamerBuilder<'a, TSSTable, A = AlwaysMatch>
|
||||
where
|
||||
A: Automaton,
|
||||
A::State: Clone,
|
||||
TSSTable: SSTable,
|
||||
{
|
||||
term_dict: &'a TermDictionary,
|
||||
term_dict: &'a Dictionary<TSSTable>,
|
||||
automaton: A,
|
||||
lower: Bound<Vec<u8>>,
|
||||
upper: Bound<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl<'a, A> TermStreamerBuilder<'a, A>
|
||||
fn bound_as_byte_slice(bound: &Bound<Vec<u8>>) -> Bound<&[u8]> {
|
||||
match bound.as_ref() {
|
||||
Bound::Included(key) => Bound::Included(key.as_slice()),
|
||||
Bound::Excluded(key) => Bound::Excluded(key.as_slice()),
|
||||
Bound::Unbounded => Bound::Unbounded,
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, TSSTable, A> StreamerBuilder<'a, TSSTable, A>
|
||||
where
|
||||
A: Automaton,
|
||||
A::State: Clone,
|
||||
TSSTable: SSTable,
|
||||
{
|
||||
pub(crate) fn new(term_dict: &'a TermDictionary, automaton: A) -> Self {
|
||||
TermStreamerBuilder {
|
||||
pub(crate) fn new(term_dict: &'a Dictionary<TSSTable>, automaton: A) -> Self {
|
||||
StreamerBuilder {
|
||||
term_dict,
|
||||
automaton,
|
||||
lower: Bound::Unbounded,
|
||||
@@ -61,12 +69,18 @@ where
|
||||
}
|
||||
|
||||
/// Creates the stream corresponding to the range
|
||||
/// of terms defined using the `TermStreamerBuilder`.
|
||||
pub fn into_stream(self) -> io::Result<TermStreamer<'a, A>> {
|
||||
/// of terms defined using the `StreamerBuilder`.
|
||||
pub fn into_stream(self) -> io::Result<Streamer<'a, TSSTable, A>> {
|
||||
// TODO Optimize by skipping to the right first block.
|
||||
let start_state = self.automaton.start();
|
||||
let delta_reader = self.term_dict.sstable_delta_reader()?;
|
||||
Ok(TermStreamer {
|
||||
let key_range = (
|
||||
bound_as_byte_slice(&self.lower),
|
||||
bound_as_byte_slice(&self.upper),
|
||||
);
|
||||
let delta_reader = self
|
||||
.term_dict
|
||||
.sstable_delta_reader_for_key_range(key_range)?;
|
||||
Ok(Streamer {
|
||||
automaton: self.automaton,
|
||||
states: vec![start_state],
|
||||
delta_reader,
|
||||
@@ -78,26 +92,28 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// `TermStreamer` acts as a cursor over a range of terms of a segment.
|
||||
/// `Streamer` acts as a cursor over a range of terms of a segment.
|
||||
/// Terms are guaranteed to be sorted.
|
||||
pub struct TermStreamer<'a, A = AlwaysMatch>
|
||||
pub struct Streamer<'a, TSSTable, A = AlwaysMatch>
|
||||
where
|
||||
A: Automaton,
|
||||
A::State: Clone,
|
||||
TSSTable: SSTable,
|
||||
{
|
||||
automaton: A,
|
||||
states: Vec<A::State>,
|
||||
delta_reader: sstable::DeltaReader<'a, TermInfoReader>,
|
||||
delta_reader: crate::DeltaReader<'a, TSSTable::ValueReader>,
|
||||
key: Vec<u8>,
|
||||
term_ord: Option<TermOrdinal>,
|
||||
lower_bound: Bound<Vec<u8>>,
|
||||
upper_bound: Bound<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl<'a, A> TermStreamer<'a, A>
|
||||
impl<'a, TSSTable, A> Streamer<'a, TSSTable, A>
|
||||
where
|
||||
A: Automaton,
|
||||
A::State: Clone,
|
||||
TSSTable: SSTable,
|
||||
{
|
||||
/// Advances the stream to the next item.
|
||||
/// Before the first call to `.advance()`, the stream
|
||||
@@ -174,13 +190,13 @@ where
|
||||
///
|
||||
/// Calling `.value()` before the first call to `.advance()` returns
|
||||
/// `V::default()`.
|
||||
pub fn value(&self) -> &TermInfo {
|
||||
pub fn value(&self) -> &TSSTable::Value {
|
||||
self.delta_reader.value()
|
||||
}
|
||||
|
||||
/// Return the next `(key, value)` pair.
|
||||
#[allow(clippy::should_implement_trait)]
|
||||
pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> {
|
||||
pub fn next(&mut self) -> Option<(&[u8], &TSSTable::Value)> {
|
||||
if self.advance() {
|
||||
Some((self.key(), self.value()))
|
||||
} else {
|
||||
@@ -191,60 +207,54 @@ where
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::TermDictionary;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::postings::TermInfo;
|
||||
use std::io;
|
||||
|
||||
fn make_term_info(i: usize) -> TermInfo {
|
||||
TermInfo {
|
||||
doc_freq: 1000u32 + i as u32,
|
||||
postings_range: (i + 10) * (i * 10)..((i + 1) + 10) * ((i + 1) * 10),
|
||||
positions_range: i * 500..(i + 1) * 500,
|
||||
}
|
||||
}
|
||||
use common::OwnedBytes;
|
||||
|
||||
fn create_test_term_dictionary() -> crate::Result<TermDictionary> {
|
||||
let mut term_dict_builder = super::super::TermDictionaryBuilder::create(Vec::new())?;
|
||||
term_dict_builder.insert(b"abaisance", &make_term_info(0))?;
|
||||
term_dict_builder.insert(b"abalation", &make_term_info(1))?;
|
||||
term_dict_builder.insert(b"abalienate", &make_term_info(2))?;
|
||||
term_dict_builder.insert(b"abandon", &make_term_info(3))?;
|
||||
let buffer = term_dict_builder.finish()?;
|
||||
use crate::{Dictionary, SSTableMonotonicU64};
|
||||
|
||||
fn create_test_dictionary() -> io::Result<Dictionary<SSTableMonotonicU64>> {
|
||||
let mut dict_builder = Dictionary::<SSTableMonotonicU64>::builder(Vec::new())?;
|
||||
dict_builder.insert(b"abaisance", &0)?;
|
||||
dict_builder.insert(b"abalation", &1)?;
|
||||
dict_builder.insert(b"abalienate", &2)?;
|
||||
dict_builder.insert(b"abandon", &3)?;
|
||||
let buffer = dict_builder.finish()?;
|
||||
let owned_bytes = OwnedBytes::new(buffer);
|
||||
TermDictionary::from_bytes(owned_bytes)
|
||||
Dictionary::from_bytes(owned_bytes)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sstable_stream() -> crate::Result<()> {
|
||||
let term_dict = create_test_term_dictionary()?;
|
||||
let mut term_streamer = term_dict.stream()?;
|
||||
assert!(term_streamer.advance());
|
||||
assert_eq!(term_streamer.key(), b"abaisance");
|
||||
assert_eq!(term_streamer.value().doc_freq, 1000u32);
|
||||
assert!(term_streamer.advance());
|
||||
assert_eq!(term_streamer.key(), b"abalation");
|
||||
assert_eq!(term_streamer.value().doc_freq, 1001u32);
|
||||
assert!(term_streamer.advance());
|
||||
assert_eq!(term_streamer.key(), b"abalienate");
|
||||
assert_eq!(term_streamer.value().doc_freq, 1002u32);
|
||||
assert!(term_streamer.advance());
|
||||
assert_eq!(term_streamer.key(), b"abandon");
|
||||
assert_eq!(term_streamer.value().doc_freq, 1003u32);
|
||||
assert!(!term_streamer.advance());
|
||||
fn test_sstable_stream() -> io::Result<()> {
|
||||
let dict = create_test_dictionary()?;
|
||||
let mut streamer = dict.stream()?;
|
||||
assert!(streamer.advance());
|
||||
assert_eq!(streamer.key(), b"abaisance");
|
||||
assert_eq!(streamer.value(), &0);
|
||||
assert!(streamer.advance());
|
||||
assert_eq!(streamer.key(), b"abalation");
|
||||
assert_eq!(streamer.value(), &1);
|
||||
assert!(streamer.advance());
|
||||
assert_eq!(streamer.key(), b"abalienate");
|
||||
assert_eq!(streamer.value(), &2);
|
||||
assert!(streamer.advance());
|
||||
assert_eq!(streamer.key(), b"abandon");
|
||||
assert_eq!(streamer.value(), &3);
|
||||
assert!(!streamer.advance());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sstable_search() -> crate::Result<()> {
|
||||
let term_dict = create_test_term_dictionary()?;
|
||||
fn test_sstable_search() -> io::Result<()> {
|
||||
let term_dict = create_test_dictionary()?;
|
||||
let ptn = tantivy_fst::Regex::new("ab.*t.*").unwrap();
|
||||
let mut term_streamer = term_dict.search(ptn).into_stream()?;
|
||||
assert!(term_streamer.advance());
|
||||
assert_eq!(term_streamer.key(), b"abalation");
|
||||
assert_eq!(term_streamer.value().doc_freq, 1001u32);
|
||||
assert_eq!(term_streamer.value(), &1u64);
|
||||
assert!(term_streamer.advance());
|
||||
assert_eq!(term_streamer.key(), b"abalienate");
|
||||
assert_eq!(term_streamer.value().doc_freq, 1002u32);
|
||||
assert_eq!(term_streamer.value(), &2u64);
|
||||
assert!(!term_streamer.advance());
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,95 +0,0 @@
|
||||
use std::io;
|
||||
|
||||
use super::{vint, BlockReader};
|
||||
|
||||
pub trait ValueReader: Default {
|
||||
type Value;
|
||||
|
||||
fn value(&self, idx: usize) -> &Self::Value;
|
||||
|
||||
fn read(&mut self, reader: &mut BlockReader) -> io::Result<()>;
|
||||
}
|
||||
|
||||
pub trait ValueWriter: Default {
|
||||
type Value;
|
||||
|
||||
fn write(&mut self, val: &Self::Value);
|
||||
|
||||
fn write_block(&mut self, writer: &mut Vec<u8>);
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct VoidReader;
|
||||
|
||||
impl ValueReader for VoidReader {
|
||||
type Value = ();
|
||||
|
||||
fn value(&self, _idx: usize) -> &() {
|
||||
&()
|
||||
}
|
||||
|
||||
fn read(&mut self, _reader: &mut BlockReader) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct VoidWriter;
|
||||
|
||||
impl ValueWriter for VoidWriter {
|
||||
type Value = ();
|
||||
|
||||
fn write(&mut self, _val: &()) {}
|
||||
|
||||
fn write_block(&mut self, _writer: &mut Vec<u8>) {}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct U64MonotonicWriter {
|
||||
vals: Vec<u64>,
|
||||
}
|
||||
|
||||
impl ValueWriter for U64MonotonicWriter {
|
||||
type Value = u64;
|
||||
|
||||
fn write(&mut self, val: &Self::Value) {
|
||||
self.vals.push(*val);
|
||||
}
|
||||
|
||||
fn write_block(&mut self, writer: &mut Vec<u8>) {
|
||||
let mut prev_val = 0u64;
|
||||
vint::serialize_into_vec(self.vals.len() as u64, writer);
|
||||
for &val in &self.vals {
|
||||
let delta = val - prev_val;
|
||||
vint::serialize_into_vec(delta, writer);
|
||||
prev_val = val;
|
||||
}
|
||||
self.vals.clear();
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct U64MonotonicReader {
|
||||
vals: Vec<u64>,
|
||||
}
|
||||
|
||||
impl ValueReader for U64MonotonicReader {
|
||||
type Value = u64;
|
||||
|
||||
fn value(&self, idx: usize) -> &Self::Value {
|
||||
&self.vals[idx]
|
||||
}
|
||||
|
||||
fn read(&mut self, reader: &mut BlockReader) -> io::Result<()> {
|
||||
let len = reader.deserialize_u64() as usize;
|
||||
self.vals.clear();
|
||||
let mut prev_val = 0u64;
|
||||
for _ in 0..len {
|
||||
let delta = reader.deserialize_u64();
|
||||
let val = prev_val + delta;
|
||||
self.vals.push(val);
|
||||
prev_val = val;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
75
sstable/src/value/mod.rs
Normal file
75
sstable/src/value/mod.rs
Normal file
@@ -0,0 +1,75 @@
|
||||
mod range;
|
||||
mod u64_monotonic;
|
||||
mod void;
|
||||
|
||||
use std::io;
|
||||
|
||||
/// `ValueReader` is a trait describing the contract of something
|
||||
/// reading blocks of values, and offering random access within these values.
|
||||
pub trait ValueReader: Default {
|
||||
/// Type of the value being read.
|
||||
type Value;
|
||||
|
||||
/// Access the value at index `idx`, in the last block that was read
|
||||
/// via a call to `ValueReader::load`.
|
||||
fn value(&self, idx: usize) -> &Self::Value;
|
||||
|
||||
/// Loads a block.
|
||||
///
|
||||
/// Returns the number of bytes that were read.
|
||||
fn load(&mut self, data: &[u8]) -> io::Result<usize>;
|
||||
}
|
||||
|
||||
pub trait ValueWriter: Default {
|
||||
/// Type of the value being written.
|
||||
type Value;
|
||||
|
||||
/// Records a new value.
|
||||
/// This method usually just accumulates data in a `Vec`,
|
||||
/// only to be serialized on the call to `ValueWriter::serialize_block`.
|
||||
fn write(&mut self, val: &Self::Value);
|
||||
|
||||
/// Serializes the accumulated values into the output buffer.
|
||||
fn serialize_block(&mut self, output: &mut Vec<u8>);
|
||||
}
|
||||
|
||||
pub use range::{RangeReader, RangeWriter};
|
||||
pub use u64_monotonic::{U64MonotonicReader, U64MonotonicWriter};
|
||||
pub use void::{VoidReader, VoidWriter};
|
||||
|
||||
fn deserialize_u64(data: &mut &[u8]) -> u64 {
|
||||
let (num_bytes, val) = super::vint::deserialize_read(data);
|
||||
*data = &data[num_bytes..];
|
||||
val
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use std::fmt;
|
||||
|
||||
use super::{ValueReader, ValueWriter};
|
||||
|
||||
pub(crate) fn test_value_reader_writer<
|
||||
V: Eq + fmt::Debug,
|
||||
TReader: ValueReader<Value = V>,
|
||||
TWriter: ValueWriter<Value = V>,
|
||||
>(
|
||||
value_block: &[V],
|
||||
) {
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
let mut writer = TWriter::default();
|
||||
for value in value_block {
|
||||
writer.write(value);
|
||||
}
|
||||
writer.serialize_block(&mut buffer);
|
||||
}
|
||||
let data_len = buffer.len();
|
||||
buffer.extend_from_slice(&b"extradata"[..]);
|
||||
let mut reader = TReader::default();
|
||||
assert_eq!(reader.load(&buffer[..]).unwrap(), data_len);
|
||||
for (i, val) in value_block.iter().enumerate() {
|
||||
assert_eq!(reader.value(i), val);
|
||||
}
|
||||
}
|
||||
}
|
||||
95
sstable/src/value/range.rs
Normal file
95
sstable/src/value/range.rs
Normal file
@@ -0,0 +1,95 @@
|
||||
use std::io;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::value::{deserialize_u64, ValueReader, ValueWriter};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct RangeReader {
|
||||
vals: Vec<Range<u64>>,
|
||||
}
|
||||
|
||||
impl ValueReader for RangeReader {
|
||||
type Value = Range<u64>;
|
||||
|
||||
fn value(&self, idx: usize) -> &Range<u64> {
|
||||
&self.vals[idx]
|
||||
}
|
||||
|
||||
fn load(&mut self, mut data: &[u8]) -> io::Result<usize> {
|
||||
self.vals.clear();
|
||||
let original_num_bytes = data.len();
|
||||
let len = deserialize_u64(&mut data) as usize;
|
||||
if len != 0 {
|
||||
let mut prev_val = deserialize_u64(&mut data);
|
||||
for _ in 1..len {
|
||||
let next_val = prev_val + deserialize_u64(&mut data);
|
||||
self.vals.push(prev_val..next_val);
|
||||
prev_val = next_val;
|
||||
}
|
||||
}
|
||||
Ok(original_num_bytes - data.len())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct RangeWriter {
|
||||
vals: Vec<u64>,
|
||||
}
|
||||
|
||||
impl ValueWriter for RangeWriter {
|
||||
type Value = Range<u64>;
|
||||
|
||||
fn write(&mut self, val: &Range<u64>) {
|
||||
if let Some(previous_offset) = self.vals.last().copied() {
|
||||
assert_eq!(previous_offset, val.start);
|
||||
self.vals.push(val.end);
|
||||
} else {
|
||||
self.vals.push(val.start);
|
||||
self.vals.push(val.end)
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_block(&mut self, writer: &mut Vec<u8>) {
|
||||
let mut prev_val = 0u64;
|
||||
crate::vint::serialize_into_vec(self.vals.len() as u64, writer);
|
||||
for &val in &self.vals {
|
||||
let delta = val - prev_val;
|
||||
crate::vint::serialize_into_vec(delta, writer);
|
||||
prev_val = val;
|
||||
}
|
||||
self.vals.clear();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_range_reader_writer() {
|
||||
crate::value::tests::test_value_reader_writer::<_, RangeReader, RangeWriter>(&[]);
|
||||
crate::value::tests::test_value_reader_writer::<_, RangeReader, RangeWriter>(&[0..3]);
|
||||
crate::value::tests::test_value_reader_writer::<_, RangeReader, RangeWriter>(&[
|
||||
0..3,
|
||||
3..10,
|
||||
]);
|
||||
crate::value::tests::test_value_reader_writer::<_, RangeReader, RangeWriter>(&[
|
||||
0..0,
|
||||
0..10,
|
||||
]);
|
||||
crate::value::tests::test_value_reader_writer::<_, RangeReader, RangeWriter>(&[
|
||||
100..110,
|
||||
110..121,
|
||||
121..1250,
|
||||
]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_range_reader_writer_panics() {
|
||||
crate::value::tests::test_value_reader_writer::<_, RangeReader, RangeWriter>(&[
|
||||
1..3,
|
||||
4..10,
|
||||
]);
|
||||
}
|
||||
}
|
||||
73
sstable/src/value/u64_monotonic.rs
Normal file
73
sstable/src/value/u64_monotonic.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use std::io;
|
||||
|
||||
use crate::value::{deserialize_u64, ValueReader, ValueWriter};
|
||||
use crate::vint;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct U64MonotonicReader {
|
||||
vals: Vec<u64>,
|
||||
}
|
||||
|
||||
impl ValueReader for U64MonotonicReader {
|
||||
type Value = u64;
|
||||
|
||||
fn value(&self, idx: usize) -> &Self::Value {
|
||||
&self.vals[idx]
|
||||
}
|
||||
|
||||
fn load(&mut self, mut data: &[u8]) -> io::Result<usize> {
|
||||
let original_num_bytes = data.len();
|
||||
let num_vals = deserialize_u64(&mut data) as usize;
|
||||
self.vals.clear();
|
||||
let mut prev_val = 0u64;
|
||||
for _ in 0..num_vals {
|
||||
let delta = deserialize_u64(&mut data);
|
||||
let val = prev_val + delta;
|
||||
self.vals.push(val);
|
||||
prev_val = val;
|
||||
}
|
||||
Ok(original_num_bytes - data.len())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct U64MonotonicWriter {
|
||||
vals: Vec<u64>,
|
||||
}
|
||||
|
||||
impl ValueWriter for U64MonotonicWriter {
|
||||
type Value = u64;
|
||||
|
||||
fn write(&mut self, val: &Self::Value) {
|
||||
self.vals.push(*val);
|
||||
}
|
||||
|
||||
fn serialize_block(&mut self, output: &mut Vec<u8>) {
|
||||
let mut prev_val = 0u64;
|
||||
vint::serialize_into_vec(self.vals.len() as u64, output);
|
||||
for &val in &self.vals {
|
||||
let delta = val - prev_val;
|
||||
vint::serialize_into_vec(delta, output);
|
||||
prev_val = val;
|
||||
}
|
||||
self.vals.clear();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_u64_monotonic_reader_writer() {
|
||||
crate::value::tests::test_value_reader_writer::<_, U64MonotonicReader, U64MonotonicWriter>(
|
||||
&[],
|
||||
);
|
||||
crate::value::tests::test_value_reader_writer::<_, U64MonotonicReader, U64MonotonicWriter>(
|
||||
&[5],
|
||||
);
|
||||
crate::value::tests::test_value_reader_writer::<_, U64MonotonicReader, U64MonotonicWriter>(
|
||||
&[1u64, 30u64],
|
||||
);
|
||||
}
|
||||
}
|
||||
41
sstable/src/value/void.rs
Normal file
41
sstable/src/value/void.rs
Normal file
@@ -0,0 +1,41 @@
|
||||
use std::io;
|
||||
|
||||
use crate::value::{ValueReader, ValueWriter};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct VoidReader;
|
||||
|
||||
impl ValueReader for VoidReader {
|
||||
type Value = ();
|
||||
|
||||
fn value(&self, _idx: usize) -> &() {
|
||||
&()
|
||||
}
|
||||
|
||||
fn load(&mut self, _data: &[u8]) -> io::Result<usize> {
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct VoidWriter;
|
||||
|
||||
impl ValueWriter for VoidWriter {
|
||||
type Value = ();
|
||||
|
||||
fn write(&mut self, _val: &()) {}
|
||||
|
||||
fn serialize_block(&mut self, _output: &mut Vec<u8>) {}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_range_reader_writer() {
|
||||
crate::value::tests::test_value_reader_writer::<_, VoidReader, VoidWriter>(&[]);
|
||||
crate::value::tests::test_value_reader_writer::<_, VoidReader, VoidWriter>(&[()]);
|
||||
crate::value::tests::test_value_reader_writer::<_, VoidReader, VoidWriter>(&[(), (), ()]);
|
||||
}
|
||||
}
|
||||
@@ -6,4 +6,4 @@ edition = "2021"
|
||||
[dependencies]
|
||||
murmurhash32 = "0.2"
|
||||
byteorder = "1"
|
||||
common = { version = "0.4", path = "../common/", package = "tantivy-common" }
|
||||
common = { version = "0.5", path = "../common/", package = "tantivy-common" }
|
||||
|
||||
Reference in New Issue
Block a user