mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 05:00:41 +00:00
Review comments
This commit is contained in:
@@ -10,7 +10,6 @@ serde_json = "1"
|
||||
thiserror = "1"
|
||||
fnv = "1"
|
||||
sstable = { path = "../sstable", package = "tantivy-sstable" }
|
||||
zstd = "0.12"
|
||||
common = { path = "../common", package = "tantivy-common" }
|
||||
fastfield_codecs = { path = "../fastfield_codecs"}
|
||||
itertools = "0.10"
|
||||
|
||||
@@ -16,42 +16,36 @@ and different cardinality `(required, optional, multivalued)`.
|
||||
|
||||
# Coercion rules
|
||||
|
||||
Users can create a columnar by appending rows to a writer.
|
||||
Nothing prevents a user from recording values with different to a same `column_key`.
|
||||
Users can create a columnar by inserting rows to a `ColumnarWriter`,
|
||||
and serializing it into a `Write` object.
|
||||
Nothing prevents a user from recording values of different types under the same `column_name`.
|
||||
|
||||
In that case, `tantivy-columnar`'s behavior is as follows:
|
||||
- Values that corresponds to different JsonValue type are mapped to different columns. For instance, String values are treated independently from Number or boolean values. `tantivy-columnar` will simply emit several columns associated to a given column_name.
|
||||
- Only one column for a given json value type is emitted. If number values with different number types are recorded (e.g. u64, i64, f64), `tantivy-columnar` will pick the first type that can represents the set of appended value, with the following prioriy order (`i64`, `u64`, `f64`). `i64` is picked over `u64` as it is likely to yield less change of types. Most use cases strictly requiring `u64` show the restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot of use cases can show rare negative value.
|
||||
- JsonValues are grouped into 3 types (String, Number, bool).
|
||||
Values that correspond to different groups are mapped to different columns. For instance, String values are treated independently
|
||||
from Number or boolean values. `tantivy-columnar` will simply emit several columns associated to a given column_name.
|
||||
- Only one column for a given json value type is emitted. If number values with different number types are recorded (e.g. u64, i64, f64),
|
||||
`tantivy-columnar` will pick the first type that can represent the set of appended values, with the following priority order (`i64`, `u64`, `f64`).
|
||||
`i64` is picked over `u64` as it is likely to yield fewer changes of type. Most use cases strictly requiring `u64` show the
|
||||
restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot of use cases can show rare negative values.
|
||||
|
||||
# Columnar format
|
||||
|
||||
Because this columnar format tries to avoid some coercion.
|
||||
There can be several columns (with different type) associated to a single `column_name`.
|
||||
|
||||
Each column is associated to `column_key`.
|
||||
The format of that key is:
|
||||
This columnar format may have more than one column (with different types) associated to the same `column_name` (see [Coercion rules](#coercion-rules) above).
|
||||
The `(column_name, column_type)` couple however uniquely identifies a column.
|
||||
That couple is serialized as a column `column_key`. The format of that key is:
|
||||
`[column_name][ZERO_BYTE][column_type_header: u8]`
|
||||
|
||||
```
|
||||
COLUMNAR:=
|
||||
[COLUMNAR_DATA]
|
||||
[COLUMNAR_INDEX]
|
||||
[COLUMNAR_KEY_TO_DATA_INDEX]
|
||||
[COLUMNAR_FOOTER];
|
||||
|
||||
|
||||
# Columns are sorted by their column key.
|
||||
COLUMNAR_DATA:=
|
||||
[COLUMN]+;
|
||||
|
||||
COLUMN:=
|
||||
COMPRESSED_COLUMN | NON_COMPRESSED_COLUMN;
|
||||
|
||||
# COLUMN_DATA is compressed when it exceeds a threshold of 100KB.
|
||||
|
||||
COMPRESSED_COLUMN := [b'1'][zstd(COLUMN_DATA)]
|
||||
NON_COMPRESSED_COLUMN:= [b'0'][COLUMN_DATA]
|
||||
|
||||
COLUMNAR_INDEX := [RANGE_SSTABLE_BYTES]
|
||||
[COLUMN_DATA]+;
|
||||
|
||||
COLUMNAR_FOOTER := [RANGE_SSTABLE_BYTES_LEN: 8 bytes little endian]
|
||||
|
||||
@@ -63,7 +57,7 @@ sorted by column key.
|
||||
A sstable associates
|
||||
`(column names, column_cardinality, column_type)` to a range of bytes.
|
||||
|
||||
Column name may not contain the zero byte.
|
||||
Column name may not contain the zero byte `\0`.
|
||||
|
||||
Listing all columns associated to `column_name` can therefore
|
||||
be done by listing all keys prefixed by
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
use crate::utils::{place_bits, select_bits};
|
||||
use crate::value::NumericalType;
|
||||
use crate::InvalidData;
|
||||
|
||||
/// Enum describing the number of values that can exist per document
|
||||
/// (or per row if you will).
|
||||
///
|
||||
/// The cardinality must fit on 2 bits.
|
||||
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
#[repr(u8)]
|
||||
pub enum Cardinality {
|
||||
@@ -20,16 +23,20 @@ impl Cardinality {
|
||||
self as u8
|
||||
}
|
||||
|
||||
pub(crate) fn try_from_code(code: u8) -> Option<Cardinality> {
|
||||
pub(crate) fn try_from_code(code: u8) -> Result<Cardinality, InvalidData> {
|
||||
match code {
|
||||
0 => Some(Cardinality::Required),
|
||||
1 => Some(Cardinality::Optional),
|
||||
2 => Some(Cardinality::Multivalued),
|
||||
_ => None,
|
||||
0 => Ok(Cardinality::Required),
|
||||
1 => Ok(Cardinality::Optional),
|
||||
2 => Ok(Cardinality::Multivalued),
|
||||
_ => Err(InvalidData),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The column type represents the column type and can fit on 6-bits.
|
||||
///
|
||||
/// - bits[0..3]: Column category type.
|
||||
/// - bits[3..6]: Numerical type if necessary.
|
||||
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
pub enum ColumnType {
|
||||
Bytes,
|
||||
@@ -40,73 +47,79 @@ pub enum ColumnType {
|
||||
impl ColumnType {
|
||||
/// Encoded over 6 bits.
|
||||
pub(crate) fn to_code(self) -> u8 {
|
||||
let high_type;
|
||||
let low_code: u8;
|
||||
let column_type_category;
|
||||
let numerical_type_code: u8;
|
||||
match self {
|
||||
ColumnType::Bytes => {
|
||||
high_type = GeneralType::Str;
|
||||
low_code = 0u8;
|
||||
column_type_category = ColumnTypeCategory::Str;
|
||||
numerical_type_code = 0u8;
|
||||
}
|
||||
ColumnType::Numerical(numerical_type) => {
|
||||
high_type = GeneralType::Numerical;
|
||||
low_code = numerical_type.to_code();
|
||||
column_type_category = ColumnTypeCategory::Numerical;
|
||||
numerical_type_code = numerical_type.to_code();
|
||||
}
|
||||
ColumnType::Bool => {
|
||||
high_type = GeneralType::Bool;
|
||||
low_code = 0u8;
|
||||
column_type_category = ColumnTypeCategory::Bool;
|
||||
numerical_type_code = 0u8;
|
||||
}
|
||||
}
|
||||
place_bits::<3, 6>(high_type.to_code()) | place_bits::<0, 3>(low_code)
|
||||
place_bits::<0, 3>(column_type_category.to_code()) | place_bits::<3, 6>(numerical_type_code)
|
||||
}
|
||||
|
||||
pub(crate) fn try_from_code(code: u8) -> Option<ColumnType> {
|
||||
pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
|
||||
if select_bits::<6, 8>(code) != 0u8 {
|
||||
return None;
|
||||
return Err(InvalidData);
|
||||
}
|
||||
let high_code = select_bits::<3, 6>(code);
|
||||
let low_code = select_bits::<0, 3>(code);
|
||||
let high_type = GeneralType::try_from_code(high_code)?;
|
||||
match high_type {
|
||||
GeneralType::Bool => {
|
||||
if low_code != 0u8 {
|
||||
return None;
|
||||
let column_type_category_code = select_bits::<0, 3>(code);
|
||||
let numerical_type_code = select_bits::<3, 6>(code);
|
||||
let column_type_category = ColumnTypeCategory::try_from_code(column_type_category_code)?;
|
||||
match column_type_category {
|
||||
ColumnTypeCategory::Bool => {
|
||||
if numerical_type_code != 0u8 {
|
||||
return Err(InvalidData);
|
||||
}
|
||||
Some(ColumnType::Bool)
|
||||
Ok(ColumnType::Bool)
|
||||
}
|
||||
GeneralType::Str => {
|
||||
if low_code != 0u8 {
|
||||
return None;
|
||||
ColumnTypeCategory::Str => {
|
||||
if numerical_type_code != 0u8 {
|
||||
return Err(InvalidData);
|
||||
}
|
||||
Some(ColumnType::Bytes)
|
||||
Ok(ColumnType::Bytes)
|
||||
}
|
||||
GeneralType::Numerical => {
|
||||
let numerical_type = NumericalType::try_from_code(low_code)?;
|
||||
Some(ColumnType::Numerical(numerical_type))
|
||||
ColumnTypeCategory::Numerical => {
|
||||
let numerical_type = NumericalType::try_from_code(numerical_type_code)?;
|
||||
Ok(ColumnType::Numerical(numerical_type))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This corresponds to the JsonType.
|
||||
/// Column types are grouped into different categories that
|
||||
/// correspond to the different `JsonValue` types.
|
||||
///
|
||||
/// The columnar writer will apply coercion rules to make sure that
|
||||
/// at most one column exists per `ColumnTypeCategory`.
|
||||
///
|
||||
/// See also [README.md].
|
||||
#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)]
|
||||
#[repr(u8)]
|
||||
pub(crate) enum GeneralType {
|
||||
pub(crate) enum ColumnTypeCategory {
|
||||
Bool = 0u8,
|
||||
Str = 1u8,
|
||||
Numerical = 2u8,
|
||||
}
|
||||
|
||||
impl GeneralType {
|
||||
impl ColumnTypeCategory {
|
||||
pub fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
pub fn try_from_code(code: u8) -> Option<Self> {
|
||||
pub fn try_from_code(code: u8) -> Result<Self, InvalidData> {
|
||||
match code {
|
||||
0u8 => Some(Self::Bool),
|
||||
1u8 => Some(Self::Str),
|
||||
2u8 => Some(Self::Numerical),
|
||||
_ => None,
|
||||
0u8 => Ok(Self::Bool),
|
||||
1u8 => Ok(Self::Str),
|
||||
2u8 => Ok(Self::Numerical),
|
||||
_ => Err(InvalidData),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -115,12 +128,12 @@ impl GeneralType {
|
||||
/// This is encoded over one-byte and added to a column key in the
|
||||
/// columnar sstable.
|
||||
///
|
||||
/// Cardinality is encoded as the first two highest two bits.
|
||||
/// The low 6 bits encode the column type.
|
||||
/// - [0..6] bits: encodes the column type
|
||||
/// - [6..8] bits: encodes the cardinality
|
||||
#[derive(Eq, Hash, PartialEq, Debug, Copy, Clone)]
|
||||
pub struct ColumnTypeAndCardinality {
|
||||
pub cardinality: Cardinality,
|
||||
pub typ: ColumnType,
|
||||
pub cardinality: Cardinality,
|
||||
}
|
||||
|
||||
impl ColumnTypeAndCardinality {
|
||||
@@ -128,13 +141,13 @@ impl ColumnTypeAndCardinality {
|
||||
place_bits::<6, 8>(self.cardinality.to_code()) | place_bits::<0, 6>(self.typ.to_code())
|
||||
}
|
||||
|
||||
pub fn try_from_code(code: u8) -> Option<ColumnTypeAndCardinality> {
|
||||
pub fn try_from_code(code: u8) -> Result<ColumnTypeAndCardinality, InvalidData> {
|
||||
let typ_code = select_bits::<0, 6>(code);
|
||||
let cardinality_code = select_bits::<6, 8>(code);
|
||||
let cardinality = Cardinality::try_from_code(cardinality_code)?;
|
||||
let typ = ColumnType::try_from_code(typ_code)?;
|
||||
assert_eq!(typ.to_code(), typ_code);
|
||||
Some(ColumnTypeAndCardinality { cardinality, typ })
|
||||
Ok(ColumnTypeAndCardinality { cardinality, typ })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,7 +162,7 @@ mod tests {
|
||||
fn test_column_type_header_to_code() {
|
||||
let mut column_type_header_set: HashSet<ColumnTypeAndCardinality> = HashSet::new();
|
||||
for code in u8::MIN..=u8::MAX {
|
||||
if let Some(column_type_header) = ColumnTypeAndCardinality::try_from_code(code) {
|
||||
if let Ok(column_type_header) = ColumnTypeAndCardinality::try_from_code(code) {
|
||||
assert_eq!(column_type_header.to_code(), code);
|
||||
assert!(column_type_header_set.insert(column_type_header));
|
||||
}
|
||||
@@ -165,7 +178,7 @@ mod tests {
|
||||
fn test_column_type_to_code() {
|
||||
let mut column_type_set: HashSet<ColumnType> = HashSet::new();
|
||||
for code in u8::MIN..=u8::MAX {
|
||||
if let Some(column_type) = ColumnType::try_from_code(code) {
|
||||
if let Ok(column_type) = ColumnType::try_from_code(code) {
|
||||
assert_eq!(column_type.to_code(), code);
|
||||
assert!(column_type_set.insert(column_type));
|
||||
}
|
||||
@@ -177,8 +190,7 @@ mod tests {
|
||||
fn test_cardinality_to_code() {
|
||||
let mut num_cardinality = 0;
|
||||
for code in u8::MIN..=u8::MAX {
|
||||
let cardinality_opt = Cardinality::try_from_code(code);
|
||||
if let Some(cardinality) = cardinality_opt {
|
||||
if let Ok(cardinality) = Cardinality::try_from_code(code) {
|
||||
assert_eq!(cardinality.to_code(), code);
|
||||
num_cardinality += 1;
|
||||
}
|
||||
|
||||
@@ -3,11 +3,11 @@ use std::io;
|
||||
use fnv::FnvHashMap;
|
||||
use sstable::SSTable;
|
||||
|
||||
pub(crate) struct IdMapping {
|
||||
pub(crate) struct TermIdMapping {
|
||||
unordered_to_ord: Vec<OrderedId>,
|
||||
}
|
||||
|
||||
impl IdMapping {
|
||||
impl TermIdMapping {
|
||||
pub fn to_ord(&self, unordered: UnorderedId) -> OrderedId {
|
||||
self.unordered_to_ord[unordered.0 as usize]
|
||||
}
|
||||
@@ -48,7 +48,7 @@ impl DictionaryBuilder {
|
||||
|
||||
/// Serialize the dictionary into an fst, and returns the
|
||||
/// `UnorderedId -> TermOrdinal` map.
|
||||
pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<IdMapping> {
|
||||
pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<TermIdMapping> {
|
||||
let mut terms: Vec<(&[u8], UnorderedId)> =
|
||||
self.dict.iter().map(|(k, v)| (k.as_slice(), *v)).collect();
|
||||
terms.sort_unstable_by_key(|(key, _)| *key);
|
||||
@@ -61,7 +61,7 @@ impl DictionaryBuilder {
|
||||
unordered_to_ord[unordered_id.0 as usize] = ordered_id;
|
||||
}
|
||||
sstable_builder.finish()?;
|
||||
Ok(IdMapping { unordered_to_ord })
|
||||
Ok(TermIdMapping { unordered_to_ord })
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,6 +12,9 @@ pub use writer::ColumnarWriter;
|
||||
|
||||
pub type DocId = u32;
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct InvalidData;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::ops::Range;
|
||||
@@ -26,8 +29,8 @@ mod tests {
|
||||
#[test]
|
||||
fn test_dataframe_writer_bytes() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_str(1u32, "my_string", b"hello");
|
||||
dataframe_writer.record_str(3u32, "my_string", b"helloeee");
|
||||
dataframe_writer.record_str(1u32, "my_string", "hello");
|
||||
dataframe_writer.record_str(3u32, "my_string", "helloeee");
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar_fileslice = FileSlice::from(buffer);
|
||||
@@ -36,7 +39,7 @@ mod tests {
|
||||
let cols: Vec<(ColumnTypeAndCardinality, Range<u64>)> =
|
||||
columnar.read_columns("my_string").unwrap();
|
||||
assert_eq!(cols.len(), 1);
|
||||
assert_eq!(cols[0].1, 0..159);
|
||||
assert_eq!(cols[0].1, 0..158);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -58,7 +61,7 @@ mod tests {
|
||||
typ: ColumnType::Bool
|
||||
}
|
||||
);
|
||||
assert_eq!(cols[0].1, 0..22);
|
||||
assert_eq!(cols[0].1, 0..21);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -81,6 +84,6 @@ mod tests {
|
||||
// - vals 8 //< due to padding? could have been 1byte?.
|
||||
// - null footer 6 bytes
|
||||
// - version footer 3 bytes // Should be file-wide
|
||||
assert_eq!(cols[0].1, 0..32);
|
||||
assert_eq!(cols[0].1, 0..31);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ use crate::column_type_header::ColumnTypeAndCardinality;
|
||||
|
||||
fn io_invalid_data(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::InvalidData, msg)
|
||||
// {key_bytes:?}")));
|
||||
// {key_bytes:?}")));
|
||||
}
|
||||
|
||||
/// The ColumnarReader makes it possible to access a set of columns
|
||||
@@ -50,7 +50,7 @@ impl ColumnarReader {
|
||||
let key_bytes: &[u8] = stream.key();
|
||||
let column_code: u8 = key_bytes.last().cloned().unwrap();
|
||||
let column_type_and_cardinality = ColumnTypeAndCardinality::try_from_code(column_code)
|
||||
.ok_or_else(|| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
|
||||
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
|
||||
let range = stream.value().clone();
|
||||
let column_name = String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 1]);
|
||||
let range_len = range.end - range.start;
|
||||
@@ -64,15 +64,26 @@ impl ColumnarReader {
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Get all columns for the given field_name.
|
||||
/// Get all columns for the given column name.
|
||||
///
|
||||
/// There can be more than one column associated to a given column name, provided they have
|
||||
/// different types.
|
||||
// TODO fix ugly API
|
||||
pub fn read_columns(
|
||||
&self,
|
||||
field_name: &str,
|
||||
column_name: &str,
|
||||
) -> io::Result<Vec<(ColumnTypeAndCardinality, Range<u64>)>> {
|
||||
let mut start_key = field_name.to_string();
|
||||
// Each column is associated to a given `column_key`,
|
||||
// that starts by `column_name\0column_header`.
|
||||
//
|
||||
// Listing the columns associated to the given column name is therefore equivalent to listing
|
||||
// `column_key` with the prefix `column_name\0`.
|
||||
//
|
||||
// This is in turn equivalent to searching for the range
|
||||
// `[column_name\0..column_name\1)`.
|
||||
let mut start_key = column_name.to_string();
|
||||
start_key.push('\0');
|
||||
let mut end_key = field_name.to_string();
|
||||
let mut end_key = column_name.to_string();
|
||||
end_key.push(1u8 as char);
|
||||
let mut stream = self
|
||||
.column_dictionary
|
||||
@@ -83,12 +94,10 @@ impl ColumnarReader {
|
||||
let mut results = Vec::new();
|
||||
while stream.advance() {
|
||||
let key_bytes: &[u8] = stream.key();
|
||||
if !key_bytes.starts_with(start_key.as_bytes()) {
|
||||
return Err(io_invalid_data(format!("Invalid key found. key: {key_bytes:?} field_name:{field_name:?}")));
|
||||
}
|
||||
assert!(key_bytes.starts_with(start_key.as_bytes()));
|
||||
let column_code: u8 = key_bytes.last().cloned().unwrap();
|
||||
let column_type_and_cardinality = ColumnTypeAndCardinality::try_from_code(column_code)
|
||||
.ok_or_else(|| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
|
||||
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
|
||||
let range = stream.value().clone();
|
||||
results.push((column_type_and_cardinality, range));
|
||||
}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use crate::InvalidData;
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq)]
|
||||
pub enum NumericalValue {
|
||||
I64(i64),
|
||||
@@ -49,12 +51,12 @@ impl NumericalType {
|
||||
self as u8
|
||||
}
|
||||
|
||||
pub fn try_from_code(code: u8) -> Option<NumericalType> {
|
||||
pub fn try_from_code(code: u8) -> Result<NumericalType, InvalidData> {
|
||||
match code {
|
||||
0 => Some(NumericalType::I64),
|
||||
1 => Some(NumericalType::U64),
|
||||
2 => Some(NumericalType::F64),
|
||||
_ => None,
|
||||
0 => Ok(NumericalType::I64),
|
||||
1 => Ok(NumericalType::U64),
|
||||
2 => Ok(NumericalType::F64),
|
||||
_ => Err(InvalidData),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -112,7 +114,7 @@ mod tests {
|
||||
fn test_numerical_type_code() {
|
||||
let mut num_numerical_type = 0;
|
||||
for code in u8::MIN..=u8::MAX {
|
||||
if let Some(numerical_type) = NumericalType::try_from_code(code) {
|
||||
if let Ok(numerical_type) = NumericalType::try_from_code(code) {
|
||||
assert_eq!(numerical_type.to_code(), code);
|
||||
num_numerical_type += 1;
|
||||
}
|
||||
|
||||
@@ -11,17 +11,13 @@ use fastfield_codecs::{Column, MonotonicallyMappableToU64, VecColumn};
|
||||
use serializer::ColumnarSerializer;
|
||||
use stacker::{Addr, ArenaHashMap, MemoryArena};
|
||||
|
||||
use crate::column_type_header::{ColumnType, ColumnTypeAndCardinality, GeneralType};
|
||||
use crate::dictionary::{DictionaryBuilder, IdMapping, UnorderedId};
|
||||
use crate::column_type_header::{ColumnType, ColumnTypeAndCardinality, ColumnTypeCategory};
|
||||
use crate::dictionary::{DictionaryBuilder, TermIdMapping, UnorderedId};
|
||||
use crate::value::{Coerce, NumericalType, NumericalValue};
|
||||
use crate::writer::column_writers::{ColumnWriter, NumericalColumnWriter, StrColumnWriter};
|
||||
use crate::writer::value_index::{IndexBuilder, SpareIndexBuilders};
|
||||
use crate::{Cardinality, DocId};
|
||||
|
||||
/// Threshold above which a column data will be compressed
|
||||
/// using ZSTD.
|
||||
const COLUMN_COMPRESSION_THRESHOLD: usize = 100_000;
|
||||
|
||||
/// This is a set of buffers that are only here
|
||||
/// to limit the amount of allocation.
|
||||
#[derive(Default)]
|
||||
@@ -34,6 +30,20 @@ struct SpareBuffers {
|
||||
column_buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
/// Makes it possible to create a new columnar.
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy_columnar::ColumnarWriter;
|
||||
/// fn main() {
|
||||
/// let mut columnar_writer = ColumnarWriter::default();
|
||||
/// columnar_writer.record_str(0u32 /* doc id */, "product_name", "Red backpack");
|
||||
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10u64);
|
||||
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
|
||||
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
|
||||
/// let mut wrt: Vec<u8> = Vec::new();
|
||||
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
|
||||
/// }
|
||||
/// ```
|
||||
pub struct ColumnarWriter {
|
||||
numerical_field_hash_map: ArenaHashMap,
|
||||
bool_field_hash_map: ArenaHashMap,
|
||||
@@ -58,11 +68,11 @@ impl Default for ColumnarWriter {
|
||||
}
|
||||
|
||||
impl ColumnarWriter {
|
||||
pub fn record_numerical(
|
||||
pub fn record_numerical<T: Into<NumericalValue> + Copy>(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
column_name: &str,
|
||||
numerical_value: NumericalValue,
|
||||
numerical_value: T,
|
||||
) {
|
||||
assert!(
|
||||
!column_name.as_bytes().contains(&0u8),
|
||||
@@ -73,7 +83,7 @@ impl ColumnarWriter {
|
||||
column_name.as_bytes(),
|
||||
|column_opt: Option<NumericalColumnWriter>| {
|
||||
let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
|
||||
column.record_numerical_value(doc, numerical_value, arena);
|
||||
column.record_numerical_value(doc, numerical_value.into(), arena);
|
||||
column
|
||||
},
|
||||
);
|
||||
@@ -95,7 +105,7 @@ impl ColumnarWriter {
|
||||
);
|
||||
}
|
||||
|
||||
pub fn record_str(&mut self, doc: DocId, column_name: &str, value: &[u8]) {
|
||||
pub fn record_str(&mut self, doc: DocId, column_name: &str, value: &str) {
|
||||
assert!(
|
||||
!column_name.as_bytes().contains(&0u8),
|
||||
"key may not contain the 0 byte"
|
||||
@@ -113,7 +123,7 @@ impl ColumnarWriter {
|
||||
dictionaries.push(DictionaryBuilder::default());
|
||||
StrColumnWriter::with_dictionary_id(dictionary_id)
|
||||
});
|
||||
column.record_bytes(doc, value, dictionaries, arena);
|
||||
column.record_bytes(doc, value.as_bytes(), dictionaries, arena);
|
||||
column
|
||||
},
|
||||
);
|
||||
@@ -121,27 +131,27 @@ impl ColumnarWriter {
|
||||
|
||||
pub fn serialize(&mut self, num_docs: DocId, wrt: &mut dyn io::Write) -> io::Result<()> {
|
||||
let mut serializer = ColumnarSerializer::new(wrt);
|
||||
let mut field_columns: Vec<(&[u8], GeneralType, Addr)> = self
|
||||
let mut field_columns: Vec<(&[u8], ColumnTypeCategory, Addr)> = self
|
||||
.numerical_field_hash_map
|
||||
.iter()
|
||||
.map(|(term, addr, _)| (term, GeneralType::Numerical, addr))
|
||||
.map(|(term, addr, _)| (term, ColumnTypeCategory::Numerical, addr))
|
||||
.collect();
|
||||
field_columns.extend(
|
||||
self.bytes_field_hash_map
|
||||
.iter()
|
||||
.map(|(term, addr, _)| (term, GeneralType::Str, addr)),
|
||||
.map(|(term, addr, _)| (term, ColumnTypeCategory::Str, addr)),
|
||||
);
|
||||
field_columns.extend(
|
||||
self.bool_field_hash_map
|
||||
.iter()
|
||||
.map(|(term, addr, _)| (term, GeneralType::Bool, addr)),
|
||||
.map(|(term, addr, _)| (term, ColumnTypeCategory::Bool, addr)),
|
||||
);
|
||||
field_columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
|
||||
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
|
||||
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
|
||||
for (column_name, bytes_or_numerical, addr) in field_columns {
|
||||
match bytes_or_numerical {
|
||||
GeneralType::Bool => {
|
||||
ColumnTypeCategory::Bool => {
|
||||
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
|
||||
let cardinality = column_writer.get_cardinality(num_docs);
|
||||
let column_type_and_cardinality = ColumnTypeAndCardinality {
|
||||
@@ -158,7 +168,7 @@ impl ColumnarWriter {
|
||||
column_serializer,
|
||||
)?;
|
||||
}
|
||||
GeneralType::Str => {
|
||||
ColumnTypeCategory::Str => {
|
||||
let str_column_writer: StrColumnWriter = self.bytes_field_hash_map.read(addr);
|
||||
let dictionary_builder =
|
||||
&dictionaries[str_column_writer.dictionary_id as usize];
|
||||
@@ -178,7 +188,7 @@ impl ColumnarWriter {
|
||||
column_serializer,
|
||||
)?;
|
||||
}
|
||||
GeneralType::Numerical => {
|
||||
ColumnTypeCategory::Numerical => {
|
||||
let numerical_column_writer: NumericalColumnWriter =
|
||||
self.numerical_field_hash_map.read(addr);
|
||||
let (numerical_type, cardinality) =
|
||||
@@ -206,15 +216,7 @@ impl ColumnarWriter {
|
||||
}
|
||||
|
||||
fn compress_and_write_column<W: io::Write>(column_bytes: &[u8], wrt: &mut W) -> io::Result<()> {
|
||||
if column_bytes.len() >= COLUMN_COMPRESSION_THRESHOLD {
|
||||
wrt.write_all(&[1])?;
|
||||
let mut encoder = zstd::Encoder::new(wrt, 3)?;
|
||||
encoder.write_all(column_bytes)?;
|
||||
encoder.finish()?;
|
||||
} else {
|
||||
wrt.write_all(&[0])?;
|
||||
wrt.write_all(column_bytes)?;
|
||||
}
|
||||
wrt.write_all(column_bytes)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -233,13 +235,13 @@ fn serialize_bytes_column<W: io::Write>(
|
||||
..
|
||||
} = buffers;
|
||||
column_buffer.clear();
|
||||
let id_mapping: IdMapping = dictionary_builder.serialize(column_buffer)?;
|
||||
let term_id_mapping: TermIdMapping = dictionary_builder.serialize(column_buffer)?;
|
||||
let dictionary_num_bytes: u32 = column_buffer.len() as u32;
|
||||
let operation_iterator = operation_it.map(|symbol: ColumnOperation<UnorderedId>| {
|
||||
// We map unordered ids to ordered ids.
|
||||
match symbol {
|
||||
ColumnOperation::Value(unordered_id) => {
|
||||
let ordered_id = id_mapping.to_ord(unordered_id);
|
||||
let ordered_id = term_id_mapping.to_ord(unordered_id);
|
||||
ColumnOperation::Value(ordered_id.0 as u64)
|
||||
}
|
||||
ColumnOperation::NewDoc(doc) => ColumnOperation::NewDoc(doc),
|
||||
|
||||
Reference in New Issue
Block a user