store DateTime as nanoseconds in doc store (#2486)

* store DateTime as nanoseconds in doc store

Previously, the doc store truncated DateTime values to microseconds. This
change removes the truncation while keeping backwards compatibility.

This is done by adding the trait `ConfigurableBinarySerializable`, which
works like `BinarySerializable` but carries a config that currently allows
serializing and deserializing date times at different precisions.

Bump the index format version to 7.
Add a compat test to check the date time truncation.

* remove `ConfigurableBinarySerializable`, add an enum for the doc store version

* test doc store version ord
PSeitz
2024-10-18 10:50:20 +08:00
committed by GitHub
parent d152e29687
commit 2f2db16ec1
22 changed files with 246 additions and 89 deletions
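The gist of the change, as a standalone sketch before the diffs (simplified stand-ins; the PR's real types live in `store/reader.rs` and `schema/document/de.rs`): the doc store footer now carries a version tag, and the deserializer uses it to pick the stored DateTime precision, so v1 segments still decode correctly.

    // Sketch only: mirrors the PR's DocStoreVersion enum; the real one also
    // implements BinarySerializable so the footer can round-trip it as a u32.
    #[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
    enum DocStoreVersion {
        V1 = 1, // DateTime stored as microseconds
        V2 = 2, // DateTime stored as nanoseconds
    }

    // Normalize a stored timestamp to nanoseconds based on the version read
    // from the footer (V1 wrote micros, V2 writes nanos).
    fn to_timestamp_nanos(stored: i64, version: DocStoreVersion) -> i64 {
        match version {
            DocStoreVersion::V1 => stored * 1_000,
            DocStoreVersion::V2 => stored,
        }
    }

    fn main() {
        assert_eq!(to_timestamp_nanos(123_456, DocStoreVersion::V1), 123_456_000);
        assert_eq!(to_timestamp_nanos(123_456, DocStoreVersion::V2), 123_456);
    }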


@@ -20,7 +20,7 @@ pub use datetime::{DateTime, DateTimePrecision};
 pub use group_by::GroupByIteratorExtended;
 pub use json_path_writer::JsonPathWriter;
 pub use ownedbytes::{OwnedBytes, StableDeref};
-pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
+pub use serialize::*;
 pub use vint::{
     read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
 };


@@ -74,14 +74,14 @@ impl FixedSize for () {
 impl<T: BinarySerializable> BinarySerializable for Vec<T> {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        VInt(self.len() as u64).serialize(writer)?;
+        BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
         for it in self {
             it.serialize(writer)?;
         }
         Ok(())
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
-        let num_items = VInt::deserialize(reader)?.val();
+        let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
         let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
         for _ in 0..num_items {
             let item = T::deserialize(reader)?;

@@ -236,12 +236,12 @@ impl FixedSize for bool {
 impl BinarySerializable for String {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
         let data: &[u8] = self.as_bytes();
-        VInt(data.len() as u64).serialize(writer)?;
+        BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
         writer.write_all(data)
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
-        let string_length = VInt::deserialize(reader)?.val() as usize;
+        let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
         let mut result = String::with_capacity(string_length);
         reader
             .take(string_length as u64)

@@ -253,12 +253,12 @@ impl BinarySerializable for String {
 impl<'a> BinarySerializable for Cow<'a, str> {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
         let data: &[u8] = self.as_bytes();
-        VInt(data.len() as u64).serialize(writer)?;
+        BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
         writer.write_all(data)
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
-        let string_length = VInt::deserialize(reader)?.val() as usize;
+        let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
         let mut result = String::with_capacity(string_length);
         reader
             .take(string_length as u64)

@@ -269,18 +269,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {
 impl<'a> BinarySerializable for Cow<'a, [u8]> {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        VInt(self.len() as u64).serialize(writer)?;
+        BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
         for it in self.iter() {
-            it.serialize(writer)?;
+            BinarySerializable::serialize(it, writer)?;
         }
         Ok(())
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
-        let num_items = VInt::deserialize(reader)?.val();
+        let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
         let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
         for _ in 0..num_items {
-            let item = u8::deserialize(reader)?;
+            let item = <u8 as BinarySerializable>::deserialize(reader)?;
             items.push(item);
         }
         Ok(Cow::Owned(items))


@@ -44,8 +44,19 @@ fn test_format_6() {
     assert_date_time_precision(&index, DateTimePrecision::Microseconds);
 }
+/// feature flag quickwit uses a different dictionary type
+#[test]
 #[cfg(not(feature = "quickwit"))]
-fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
+fn test_format_7() {
+    let path = path_for_version("7");
+    let index = Index::open_in_dir(path).expect("Failed to open index");
+    // dates are not truncated in v7 in the docstore
+    assert_date_time_precision(&index, DateTimePrecision::Nanoseconds);
+}
+
+#[cfg(not(feature = "quickwit"))]
+fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecision) {
     use collector::TopDocs;
     let reader = index.reader().expect("Failed to create reader");
     let searcher = reader.searcher();

@@ -75,6 +86,6 @@ fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
         .as_datetime()
         .unwrap();
-    let expected = DateTime::from_timestamp_nanos(123456).truncate(precision);
+    let expected = DateTime::from_timestamp_nanos(123456).truncate(doc_store_precision);
     assert_eq!(date_value, expected,);
 }


@@ -232,7 +232,7 @@ pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
 pub use crate::schema::{Document, TantivyDocument, Term};
 /// Index format version.
-pub const INDEX_FORMAT_VERSION: u32 = 6;
+pub const INDEX_FORMAT_VERSION: u32 = 7;
 /// Oldest index format version this tantivy version can read.
 pub const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;


@@ -22,6 +22,7 @@ use super::se::BinaryObjectSerializer;
 use super::{OwnedValue, Value};
 use crate::schema::document::type_codes;
 use crate::schema::{Facet, Field};
+use crate::store::DocStoreVersion;
 use crate::tokenizer::PreTokenizedString;

 #[derive(Debug, thiserror::Error, Clone)]

@@ -45,6 +46,9 @@ pub enum DeserializeError {
     #[error("{0}")]
     /// A custom error message.
     Custom(String),
+    #[error("Version {0}, Max version supported: {1}")]
+    /// Unsupported version error.
+    UnsupportedVersion(u32, u32),
 }

 impl DeserializeError {

@@ -291,6 +295,7 @@ pub trait ObjectAccess<'de> {
 pub struct BinaryDocumentDeserializer<'de, R> {
     length: usize,
     position: usize,
+    doc_store_version: DocStoreVersion,
     reader: &'de mut R,
 }

@@ -298,12 +303,16 @@ impl<'de, R> BinaryDocumentDeserializer<'de, R>
 where R: Read
 {
     /// Attempts to create a new document deserializer from a given reader.
-    pub(crate) fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
+    pub(crate) fn from_reader(
+        reader: &'de mut R,
+        doc_store_version: DocStoreVersion,
+    ) -> Result<Self, DeserializeError> {
         let length = VInt::deserialize(reader)?;
         Ok(Self {
             length: length.val() as usize,
             position: 0,
+            doc_store_version,
             reader,
         })
     }

@@ -329,8 +338,8 @@ where R: Read
         }
         let field = Field::deserialize(self.reader).map_err(DeserializeError::from)?;
-        let deserializer = BinaryValueDeserializer::from_reader(self.reader)?;
+        let deserializer =
+            BinaryValueDeserializer::from_reader(self.reader, self.doc_store_version)?;
         let value = V::deserialize(deserializer)?;
         self.position += 1;

@@ -344,13 +353,17 @@ where R: Read
 pub struct BinaryValueDeserializer<'de, R> {
     value_type: ValueType,
     reader: &'de mut R,
+    doc_store_version: DocStoreVersion,
 }

 impl<'de, R> BinaryValueDeserializer<'de, R>
 where R: Read
 {
     /// Attempts to create a new value deserializer from a given reader.
-    fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
+    fn from_reader(
+        reader: &'de mut R,
+        doc_store_version: DocStoreVersion,
+    ) -> Result<Self, DeserializeError> {
         let type_code = <u8 as BinarySerializable>::deserialize(reader)?;
         let value_type = match type_code {

@@ -391,7 +404,11 @@ where R: Read
             }
         };
-        Ok(Self { value_type, reader })
+        Ok(Self {
+            value_type,
+            reader,
+            doc_store_version,
+        })
     }

     fn validate_type(&self, expected_type: ValueType) -> Result<(), DeserializeError> {

@@ -438,7 +455,16 @@ where R: Read
     fn deserialize_datetime(self) -> Result<DateTime, DeserializeError> {
         self.validate_type(ValueType::DateTime)?;
-        <DateTime as BinarySerializable>::deserialize(self.reader).map_err(DeserializeError::from)
+        match self.doc_store_version {
+            DocStoreVersion::V1 => {
+                let timestamp_micros = <i64 as BinarySerializable>::deserialize(self.reader)?;
+                Ok(DateTime::from_timestamp_micros(timestamp_micros))
+            }
+            DocStoreVersion::V2 => {
+                let timestamp_nanos = <i64 as BinarySerializable>::deserialize(self.reader)?;
+                Ok(DateTime::from_timestamp_nanos(timestamp_nanos))
+            }
+        }
     }

     fn deserialize_facet(self) -> Result<Facet, DeserializeError> {

@@ -514,11 +540,13 @@ where R: Read
                 visitor.visit_pre_tokenized_string(val)
             }
             ValueType::Array => {
-                let access = BinaryArrayDeserializer::from_reader(self.reader)?;
+                let access =
+                    BinaryArrayDeserializer::from_reader(self.reader, self.doc_store_version)?;
                 visitor.visit_array(access)
             }
             ValueType::Object => {
-                let access = BinaryObjectDeserializer::from_reader(self.reader)?;
+                let access =
+                    BinaryObjectDeserializer::from_reader(self.reader, self.doc_store_version)?;
                 visitor.visit_object(access)
             }
             #[allow(deprecated)]

@@ -537,7 +565,8 @@ where R: Read
                 let out_rc = std::rc::Rc::new(out);
                 let mut slice: &[u8] = &out_rc;
-                let access = BinaryObjectDeserializer::from_reader(&mut slice)?;
+                let access =
+                    BinaryObjectDeserializer::from_reader(&mut slice, self.doc_store_version)?;
                 visitor.visit_object(access)
             }

@@ -551,19 +580,24 @@ pub struct BinaryArrayDeserializer<'de, R> {
     length: usize,
     position: usize,
     reader: &'de mut R,
+    doc_store_version: DocStoreVersion,
 }

 impl<'de, R> BinaryArrayDeserializer<'de, R>
 where R: Read
 {
     /// Attempts to create a new array deserializer from a given reader.
-    fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
+    fn from_reader(
+        reader: &'de mut R,
+        doc_store_version: DocStoreVersion,
+    ) -> Result<Self, DeserializeError> {
         let length = <VInt as BinarySerializable>::deserialize(reader)?;
         Ok(Self {
             length: length.val() as usize,
             position: 0,
             reader,
+            doc_store_version,
         })
     }

@@ -587,7 +621,8 @@ where R: Read
             return Ok(None);
         }
-        let deserializer = BinaryValueDeserializer::from_reader(self.reader)?;
+        let deserializer =
+            BinaryValueDeserializer::from_reader(self.reader, self.doc_store_version)?;
         let value = V::deserialize(deserializer)?;
         // Advance the position cursor.

@@ -610,8 +645,11 @@ impl<'de, R> BinaryObjectDeserializer<'de, R>
 where R: Read
 {
     /// Attempts to create a new object deserializer from a given reader.
-    fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
-        let inner = BinaryArrayDeserializer::from_reader(reader)?;
+    fn from_reader(
+        reader: &'de mut R,
+        doc_store_version: DocStoreVersion,
+    ) -> Result<Self, DeserializeError> {
+        let inner = BinaryArrayDeserializer::from_reader(reader, doc_store_version)?;
         Ok(Self { inner })
     }
 }

@@ -819,6 +857,7 @@ mod tests {
     use crate::schema::document::existing_type_impls::JsonObjectIter;
     use crate::schema::document::se::BinaryValueSerializer;
     use crate::schema::document::{ReferenceValue, ReferenceValueLeaf};
+    use crate::store::DOC_STORE_VERSION;

     fn serialize_value<'a>(value: ReferenceValue<'a, &'a serde_json::Value>) -> Vec<u8> {
         let mut writer = Vec::new();

@@ -829,9 +868,19 @@ mod tests {
         writer
     }

+    fn serialize_owned_value<'a>(value: ReferenceValue<'a, &'a OwnedValue>) -> Vec<u8> {
+        let mut writer = Vec::new();
+        let mut serializer = BinaryValueSerializer::new(&mut writer);
+        serializer.serialize_value(value).expect("Serialize value");
+        writer
+    }
+
     fn deserialize_value(buffer: Vec<u8>) -> crate::schema::OwnedValue {
         let mut cursor = Cursor::new(buffer);
-        let deserializer = BinaryValueDeserializer::from_reader(&mut cursor).unwrap();
+        let deserializer =
+            BinaryValueDeserializer::from_reader(&mut cursor, DOC_STORE_VERSION).unwrap();
         crate::schema::OwnedValue::deserialize(deserializer).expect("Deserialize value")
     }

@@ -1010,6 +1059,17 @@ mod tests {
         assert_eq!(value, expected_val);
     }

+    #[test]
+    fn test_nested_date_precision() {
+        let object = OwnedValue::Object(vec![(
+            "my-date".into(),
+            OwnedValue::Date(DateTime::from_timestamp_nanos(323456)),
+        )]);
+        let result = serialize_owned_value((&object).as_value());
+        let value = deserialize_value(result);
+        assert_eq!(value, object);
+    }
+
     #[test]
     fn test_nested_serialize() {
         let mut object = serde_json::Map::new();


@@ -81,6 +81,15 @@ where W: Write
         Self { writer }
     }

+    fn serialize_with_type_code<T: BinarySerializable>(
+        &mut self,
+        code: u8,
+        val: &T,
+    ) -> io::Result<()> {
+        self.write_type_code(code)?;
+        BinarySerializable::serialize(val, self.writer)
+    }
+
     /// Attempts to serialize a given value and write the output
     /// to the writer.
     pub(crate) fn serialize_value<'a, V>(

@@ -94,56 +103,38 @@ where W: Write
             ReferenceValue::Leaf(leaf) => match leaf {
                 ReferenceValueLeaf::Null => self.write_type_code(type_codes::NULL_CODE),
                 ReferenceValueLeaf::Str(val) => {
-                    self.write_type_code(type_codes::TEXT_CODE)?;
-                    let temp_val = Cow::Borrowed(val);
-                    temp_val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::TEXT_CODE, &Cow::Borrowed(val))
                 }
                 ReferenceValueLeaf::U64(val) => {
-                    self.write_type_code(type_codes::U64_CODE)?;
-                    val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::U64_CODE, &val)
                 }
                 ReferenceValueLeaf::I64(val) => {
-                    self.write_type_code(type_codes::I64_CODE)?;
-                    val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::I64_CODE, &val)
                 }
                 ReferenceValueLeaf::F64(val) => {
-                    self.write_type_code(type_codes::F64_CODE)?;
-                    f64_to_u64(val).serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::F64_CODE, &f64_to_u64(val))
                 }
                 ReferenceValueLeaf::Date(val) => {
                     self.write_type_code(type_codes::DATE_CODE)?;
-                    val.serialize(self.writer)
-                }
-                ReferenceValueLeaf::Facet(val) => {
-                    self.write_type_code(type_codes::HIERARCHICAL_FACET_CODE)?;
-                    Cow::Borrowed(val).serialize(self.writer)
+                    let timestamp_nanos: i64 = val.into_timestamp_nanos();
+                    BinarySerializable::serialize(&timestamp_nanos, self.writer)
                 }
+                ReferenceValueLeaf::Facet(val) => self.serialize_with_type_code(
+                    type_codes::HIERARCHICAL_FACET_CODE,
+                    &Cow::Borrowed(val),
+                ),
                 ReferenceValueLeaf::Bytes(val) => {
-                    self.write_type_code(type_codes::BYTES_CODE)?;
-                    let temp_val = Cow::Borrowed(val);
-                    temp_val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::BYTES_CODE, &Cow::Borrowed(val))
                 }
                 ReferenceValueLeaf::IpAddr(val) => {
-                    self.write_type_code(type_codes::IP_CODE)?;
-                    val.to_u128().serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::IP_CODE, &val.to_u128())
                 }
                 ReferenceValueLeaf::Bool(val) => {
-                    self.write_type_code(type_codes::BOOL_CODE)?;
-                    val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::BOOL_CODE, &val)
                 }
                 ReferenceValueLeaf::PreTokStr(val) => {
                     self.write_type_code(type_codes::EXT_CODE)?;
-                    self.write_type_code(type_codes::TOK_STR_EXT_CODE)?;
-                    val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::TOK_STR_EXT_CODE, &*val)
                 }
             },
             ReferenceValue::Array(elements) => {

@@ -306,7 +297,6 @@ where W: Write
 mod tests {
     use std::collections::BTreeMap;

-    use common::DateTime;
     use serde_json::Number;
     use tokenizer_api::Token;

@@ -337,7 +327,10 @@ mod tests {
                     $ext_code.serialize(&mut writer).unwrap();
                 )?
-                $value.serialize(&mut writer).unwrap();
+                BinarySerializable::serialize(
+                    &$value,
+                    &mut writer,
+                ).unwrap();
             )*
             writer

@@ -355,7 +348,10 @@ mod tests {
                     $ext_code.serialize(&mut writer).unwrap();
                 )?
-                $value.serialize(&mut writer).unwrap();
+                BinarySerializable::serialize(
+                    &$value,
+                    &mut writer,
+                ).unwrap();
             )*
             writer

@@ -418,15 +414,6 @@
             "Expected serialized value to match the binary representation"
         );

-        let result = serialize_value(ReferenceValueLeaf::Date(DateTime::MAX).into());
-        let expected = binary_repr!(
-            type_codes::DATE_CODE => DateTime::MAX,
-        );
-        assert_eq!(
-            result, expected,
-            "Expected serialized value to match the binary representation"
-        );
-
         let facet = Facet::from_text("/hello/world").unwrap();
         let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into());
         let expected = binary_repr!(


@@ -4,7 +4,7 @@ use std::io::{self, Read, Write};
 use std::str;
 use std::string::FromUtf8Error;
-use common::BinarySerializable;
+use common::*;
 use once_cell::sync::Lazy;
 use regex::Regex;
 use serde::de::Error as _;


@@ -2,12 +2,13 @@ use std::io;
 use common::{BinarySerializable, FixedSize, HasLen};

-use super::{Decompressor, DOC_STORE_VERSION};
+use super::{Decompressor, DocStoreVersion, DOC_STORE_VERSION};
 use crate::directory::FileSlice;

 #[derive(Debug, Clone, PartialEq)]
 pub struct DocStoreFooter {
     pub offset: u64,
+    pub doc_store_version: DocStoreVersion,
     pub decompressor: Decompressor,
 }

@@ -25,9 +26,11 @@ impl BinarySerializable for DocStoreFooter {
     }
     fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let doc_store_version = u32::deserialize(reader)?;
-        if doc_store_version != DOC_STORE_VERSION {
-            panic!("actual doc store version: {doc_store_version}, expected: {DOC_STORE_VERSION}");
+        let doc_store_version = DocStoreVersion::deserialize(reader)?;
+        if doc_store_version > DOC_STORE_VERSION {
+            panic!(
+                "actual doc store version: {doc_store_version}, max_supported: {DOC_STORE_VERSION}"
+            );
         }
         let offset = u64::deserialize(reader)?;
         let compressor_id = u8::deserialize(reader)?;

@@ -35,6 +38,7 @@ impl BinarySerializable for DocStoreFooter {
         reader.read_exact(&mut skip_buf)?;
         Ok(DocStoreFooter {
             offset,
+            doc_store_version,
             decompressor: Decompressor::from_id(compressor_id),
         })
     }

@@ -45,9 +49,14 @@ impl FixedSize for DocStoreFooter {
 }
 impl DocStoreFooter {
-    pub fn new(offset: u64, decompressor: Decompressor) -> Self {
+    pub fn new(
+        offset: u64,
+        decompressor: Decompressor,
+        doc_store_version: DocStoreVersion,
+    ) -> Self {
         DocStoreFooter {
             offset,
+            doc_store_version,
             decompressor,
         }
     }


@@ -35,15 +35,16 @@ mod footer;
 mod index;
 mod reader;
 mod writer;
 pub use self::compressors::{Compressor, ZstdCompressor};
 pub use self::decompressors::Decompressor;
-pub(crate) use self::reader::DOCSTORE_CACHE_CAPACITY;
 pub use self::reader::{CacheStats, StoreReader};
+pub(crate) use self::reader::{DocStoreVersion, DOCSTORE_CACHE_CAPACITY};
 pub use self::writer::StoreWriter;
 mod store_compressor;
 /// Doc store version in footer to handle format changes.
-pub(crate) const DOC_STORE_VERSION: u32 = 1;
+pub(crate) const DOC_STORE_VERSION: DocStoreVersion = DocStoreVersion::V2;
 #[cfg(feature = "lz4-compression")]
 mod compression_lz4_block;


@@ -1,3 +1,4 @@
+use std::fmt::Display;
 use std::io;
 use std::iter::Sum;
 use std::num::NonZeroUsize;

@@ -25,9 +26,43 @@ pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100;
 type Block = OwnedBytes;

+/// The format version of the document store.
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub(crate) enum DocStoreVersion {
+    V1 = 1,
+    V2 = 2,
+}
+impl Display for DocStoreVersion {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DocStoreVersion::V1 => write!(f, "V1"),
+            DocStoreVersion::V2 => write!(f, "V2"),
+        }
+    }
+}
+impl BinarySerializable for DocStoreVersion {
+    fn serialize<W: io::Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
+        (*self as u32).serialize(writer)
+    }
+    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
+        Ok(match u32::deserialize(reader)? {
+            1 => DocStoreVersion::V1,
+            2 => DocStoreVersion::V2,
+            v => {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    format!("Invalid doc store version {}", v),
+                ))
+            }
+        })
+    }
+}
+
 /// Reads document off tantivy's [`Store`](./index.html)
 pub struct StoreReader {
     decompressor: Decompressor,
+    doc_store_version: DocStoreVersion,
     data: FileSlice,
     skip_index: Arc<SkipIndex>,
     space_usage: StoreSpaceUsage,

@@ -129,6 +164,7 @@ impl StoreReader {
         let skip_index = SkipIndex::open(index_data);
         Ok(StoreReader {
             decompressor: footer.decompressor,
+            doc_store_version: footer.doc_store_version,
             data: data_file,
             cache: BlockCache {
                 cache: NonZeroUsize::new(cache_num_blocks)

@@ -203,8 +239,9 @@ impl StoreReader {
     pub fn get<D: DocumentDeserialize>(&self, doc_id: DocId) -> crate::Result<D> {
         let mut doc_bytes = self.get_document_bytes(doc_id)?;
-        let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
-            .map_err(crate::TantivyError::from)?;
+        let deserializer =
+            BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
+                .map_err(crate::TantivyError::from)?;
         D::deserialize(deserializer).map_err(crate::TantivyError::from)
     }

@@ -244,8 +281,9 @@ impl StoreReader {
         self.iter_raw(alive_bitset).map(|doc_bytes_res| {
             let mut doc_bytes = doc_bytes_res?;
-            let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
-                .map_err(crate::TantivyError::from)?;
+            let deserializer =
+                BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
+                    .map_err(crate::TantivyError::from)?;
             D::deserialize(deserializer).map_err(crate::TantivyError::from)
         })
     }

@@ -391,8 +429,9 @@ impl StoreReader {
     ) -> crate::Result<D> {
         let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?;
-        let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
-            .map_err(crate::TantivyError::from)?;
+        let deserializer =
+            BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
+                .map_err(crate::TantivyError::from)?;
         D::deserialize(deserializer).map_err(crate::TantivyError::from)
     }
 }

@@ -414,6 +453,11 @@ mod tests {
         doc.get_first(*field).and_then(|f| f.as_value().as_str())
     }

+    #[test]
+    fn test_doc_store_version_ord() {
+        assert!(DocStoreVersion::V1 < DocStoreVersion::V2);
+    }
+
     #[test]
     fn test_store_lru_cache() -> crate::Result<()> {
         let directory = RamDirectory::create();


@@ -5,6 +5,7 @@ use std::{io, thread};
 use common::{BinarySerializable, CountingWriter, TerminatingWrite};

+use super::DOC_STORE_VERSION;
 use crate::directory::WritePtr;
 use crate::store::footer::DocStoreFooter;
 use crate::store::index::{Checkpoint, SkipIndexBuilder};

@@ -143,8 +144,11 @@ impl BlockCompressorImpl {
     fn close(mut self) -> io::Result<()> {
         let header_offset: u64 = self.writer.written_bytes();
-        let docstore_footer =
-            DocStoreFooter::new(header_offset, Decompressor::from(self.compressor));
+        let docstore_footer = DocStoreFooter::new(
+            header_offset,
+            Decompressor::from(self.compressor),
+            DOC_STORE_VERSION,
+        );
         self.offset_index_writer.serialize_into(&mut self.writer)?;
         docstore_footer.serialize(&mut self.writer)?;
         self.writer.terminate()


@@ -2,7 +2,7 @@ use std::cmp::Ordering;
 use std::io;
 use std::io::{Read, Write};
-use common::BinarySerializable;
+use common::*;
 use crate::tokenizer::{Token, TokenStream};


@@ -0,0 +1 @@
+["meta.json","000002f0000000000000000000000000.fieldnorm","000002f0000000000000000000000000.pos","000002f0000000000000000000000000.store","000002f0000000000000000000000000.term","000002f0000000000000000000000000.fast","000002f0000000000000000000000000.idx"]


@@ -0,0 +1,40 @@
+{
+  "index_settings": {
+    "docstore_compression": "lz4",
+    "docstore_blocksize": 16384
+  },
+  "segments": [
+    {
+      "segment_id": "000002f0-0000-0000-0000-000000000000",
+      "max_doc": 1,
+      "deletes": null
+    }
+  ],
+  "schema": [
+    {
+      "name": "label",
+      "type": "text",
+      "options": {
+        "indexing": {
+          "record": "position",
+          "fieldnorms": true,
+          "tokenizer": "default"
+        },
+        "stored": true,
+        "fast": false
+      }
+    },
+    {
+      "name": "date",
+      "type": "date",
+      "options": {
+        "indexed": true,
+        "fieldnorms": true,
+        "fast": false,
+        "stored": true,
+        "precision": "seconds"
+      }
+    }
+  ],
+  "opstamp": 2
+}