Updated DateTime to hold timestamp in microseconds, while making date field precision configurable (#1396)

This commit is contained in:
Evance Soumaoro
2022-07-12 01:04:28 +00:00
committed by GitHub
parent 2406d9278b
commit a4be239d38
25 changed files with 625 additions and 100 deletions

View File

@@ -1,3 +1,10 @@
Tantivy 0.19
================================
- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396)
The `DateTime` type has been updated to hold timestamps with microsecond precision.
`DateOptions` and `DatePrecision` have been added to configure Date fields. The precision is only used as a hint for fast field value compression; second precision is used everywhere else (i.e., terms, indexing).
Tantivy 0.18
================================
- For date values `chrono` has been replaced with `time` (@uklotzde) #1304 :

View File

@@ -49,7 +49,7 @@ thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = "0.5.0"
murmurhash32 = "0.2.0"
time = { version = "0.3.9", features = ["serde-well-known"] }
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.7.5"

View File

@@ -0,0 +1,69 @@
// # DateTime field example
//
// This example shows how the DateTime field can be used
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Cardinality, DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;
/// Walks through the DateTime field example: builds a schema with a date
/// field, indexes two JSON documents, then checks a term query and a date
/// range query against the in-memory index.
fn main() -> tantivy::Result<()> {
    // # Defining the schema
    //
    // The date field is indexed, stored, and exposed as a single-valued fast
    // field configured with seconds precision.
    let date_options = DateOptions::from(INDEXED)
        .set_stored()
        .set_fast(Cardinality::SingleValue)
        .set_precision(tantivy::DatePrecision::Seconds);
    let mut builder = Schema::builder();
    let occurred_at = builder.add_date_field("occurred_at", date_options);
    let event_type = builder.add_text_field("event", STRING | STORED);
    let schema = builder.build();

    // # Indexing documents
    let index = Index::create_in_ram(schema.clone());
    let mut writer = index.writer(50_000_000)?;
    let json_docs = [
        r#"{
"occurred_at": "2022-06-22T12:53:50.53Z",
"event": "pull-request"
}"#,
        r#"{
"occurred_at": "2022-06-22T13:00:00.22Z",
"event": "comment"
}"#,
    ];
    // Documents are parsed and added one at a time, in declaration order.
    for json_doc in json_docs {
        let parsed = schema.parse_document(json_doc)?;
        writer.add_document(parsed)?;
    }
    writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();

    // # Default fields: event_type
    let parser = QueryParser::for_index(&index, vec![event_type]);

    // Term query on the text field: exactly one document is a "comment".
    let event_query = parser.parse_query("event:comment")?;
    let event_hits = searcher.search(&*event_query, &TopDocs::with_limit(5))?;
    assert_eq!(event_hits.len(), 1);

    // Range query on the date field: only the 13:00:00 document falls in it.
    let range_query = parser
        .parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
    let range_hits = searcher.search(&*range_query, &TopDocs::with_limit(4))?;
    assert_eq!(range_hits.len(), 1);
    for (_score, address) in range_hits {
        let stored_doc = searcher.doc(address)?;
        // The stored value comes back as a date value...
        assert!(matches!(
            stored_doc.get_first(occurred_at),
            Some(Value::Date(_))
        ));
        // ...and serializes with its original sub-second digits.
        assert_eq!(
            schema.to_json(&stored_doc),
            r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
        );
    }

    Ok(())
}

View File

@@ -14,7 +14,7 @@ pub struct BitpackedFastFieldReader {
pub max_value_u64: u64,
}
impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
impl FastFieldCodecReader for BitpackedFastFieldReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);

View File

@@ -575,7 +575,7 @@ mod test {
for special_char in SPECIAL_CHARS.iter() {
let query = &format!("\\{special_char}my\\{special_char}field:a");
assert_eq!(
super::field_name().parse(&query),
super::field_name().parse(query),
Ok((format!("{special_char}my{special_char}field"), "a"))
);
}

View File

@@ -36,7 +36,10 @@ pub struct IntermediateAggregationResults {
impl IntermediateAggregationResults {
/// Convert intermediate result and its aggregation request to the final result.
pub fn into_final_bucket_result(self, req: Aggregations) -> crate::Result<AggregationResults> {
pub(crate) fn into_final_bucket_result(
self,
req: Aggregations,
) -> crate::Result<AggregationResults> {
self.into_final_bucket_result_internal(&(req.into()))
}

View File

@@ -72,8 +72,7 @@ impl HistogramComputer {
return;
}
let delta = value - self.min_value;
let delta_u64 = delta.to_u64();
let bucket_id: usize = self.divider.divide(delta_u64) as usize;
let bucket_id: usize = self.divider.divide(delta) as usize;
if bucket_id < self.counts.len() {
self.counts[bucket_id] += 1;
}
@@ -287,7 +286,7 @@ mod tests {
DateTime::from_primitive(
Date::from_calendar_date(1980, Month::January, 1)?.with_hms(0, 0, 0)?,
),
3600 * 24 * 365, // it is just for a unit test... sorry leap years.
3_600_000_000 * 24 * 365, // it is just for a unit test... sorry leap years.
10,
);
let week_histogram = searcher.search(&all_query, &week_histogram_collector)?;

View File

@@ -52,11 +52,13 @@ pub trait MultiValueLength {
fn get_total_len(&self) -> u64;
}
/// Trait for types that are allowed for fast fields: (u64, i64 and f64).
/// Trait for types that are allowed for fast fields:
/// (u64, i64, f64, bool and DateTime).
pub trait FastValue: Clone + Copy + Send + Sync + PartialOrd + 'static {
/// Converts a value from u64
///
/// Internally all fast field values are encoded as u64.
/// **Note: To be used for converting encoded Term, Posting values.**
fn from_u64(val: u64) -> Self;
/// Converts a value to u64.
@@ -189,24 +191,27 @@ impl FastValue for bool {
}
impl FastValue for DateTime {
fn from_u64(timestamp_u64: u64) -> Self {
let unix_timestamp = i64::from_u64(timestamp_u64);
Self::from_unix_timestamp(unix_timestamp)
/// Converts a timestamp in microseconds into a `DateTime`.
///
/// **Note: the timestamp is expected to be in microseconds.**
fn from_u64(timestamp_micros_u64: u64) -> Self {
let timestamp_micros = i64::from_u64(timestamp_micros_u64);
Self::from_timestamp_micros(timestamp_micros)
}
fn to_u64(&self) -> u64 {
self.into_unix_timestamp().to_u64()
common::i64_to_u64(self.into_timestamp_micros())
}
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
match *field_type {
FieldType::Date(ref integer_options) => integer_options.get_fastfield_cardinality(),
FieldType::Date(ref options) => options.get_fastfield_cardinality(),
_ => None,
}
}
fn as_u64(&self) -> u64 {
self.into_unix_timestamp().as_u64()
self.into_timestamp_micros().as_u64()
}
fn to_type() -> Type {
@@ -261,9 +266,9 @@ mod tests {
use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{Document, Field, NumericOptions, Schema, FAST, STRING, TEXT};
use crate::schema::{Document, Field, Schema, FAST, STRING, TEXT};
use crate::time::OffsetDateTime;
use crate::{Index, SegmentId, SegmentReader};
use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
@@ -559,8 +564,8 @@ mod tests {
}
#[test]
fn test_default_datetime() {
assert_eq!(0, DateTime::make_zero().into_unix_timestamp());
fn test_default_date() {
assert_eq!(0, DateTime::make_zero().into_timestamp_secs());
}
fn get_vals_for_docs(ff: &MultiValuedFastFieldReader<u64>, docs: Range<u32>) -> Vec<u64> {
@@ -766,10 +771,15 @@ mod tests {
fn test_datefastfield() -> crate::Result<()> {
use crate::fastfield::FastValue;
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", FAST);
let date_field = schema_builder.add_date_field(
"date",
DateOptions::from(FAST).set_precision(DatePrecision::Microseconds),
);
let multi_date_field = schema_builder.add_date_field(
"multi_date",
NumericOptions::default().set_fast(Cardinality::MultiValues),
DateOptions::default()
.set_precision(DatePrecision::Microseconds)
.set_fast(Cardinality::MultiValues),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -797,23 +807,23 @@ mod tests {
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
let mut dates = vec![];
{
assert_eq!(date_fast_field.get(0u32).into_unix_timestamp(), 1i64);
assert_eq!(date_fast_field.get(0u32).into_timestamp_micros(), 1i64);
dates_fast_field.get_vals(0u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_unix_timestamp(), 2i64);
assert_eq!(dates[1].into_unix_timestamp(), 3i64);
assert_eq!(dates[0].into_timestamp_micros(), 2i64);
assert_eq!(dates[1].into_timestamp_micros(), 3i64);
}
{
assert_eq!(date_fast_field.get(1u32).into_unix_timestamp(), 4i64);
assert_eq!(date_fast_field.get(1u32).into_timestamp_micros(), 4i64);
dates_fast_field.get_vals(1u32, &mut dates);
assert!(dates.is_empty());
}
{
assert_eq!(date_fast_field.get(2u32).into_unix_timestamp(), 0i64);
assert_eq!(date_fast_field.get(2u32).into_timestamp_micros(), 0i64);
dates_fast_field.get_vals(2u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_unix_timestamp(), 5i64);
assert_eq!(dates[1].into_unix_timestamp(), 6i64);
assert_eq!(dates[0].into_timestamp_micros(), 5i64);
assert_eq!(dates[1].into_timestamp_micros(), 6i64);
}
Ok(())
}

View File

@@ -13,7 +13,7 @@ mod tests {
use crate::collector::TopDocs;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
use crate::schema::{Cardinality, DateOptions, Facet, FacetOptions, NumericOptions, Schema};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{Duration, OffsetDateTime};
use crate::{DateTime, Document, Index, Term};
@@ -58,7 +58,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field(
"multi_date_field",
NumericOptions::default()
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed()
.set_fieldnorm()

View File

@@ -4,12 +4,12 @@ use fnv::FnvHashMap;
use tantivy_bitpacker::minmax;
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType};
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType, FastValue};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field};
use crate::schema::{Document, Field, Value};
use crate::termdict::TermOrdinal;
use crate::DocId;
use crate::{DatePrecision, DocId};
/// Writer for multi-valued (as in, more than one value per document)
/// int fast field.
@@ -36,6 +36,7 @@ use crate::DocId;
/// term ids when the segment is getting serialized.
pub struct MultiValuedFastFieldWriter {
field: Field,
precision_opt: Option<DatePrecision>,
vals: Vec<UnorderedTermId>,
doc_index: Vec<u64>,
fast_field_type: FastFieldType,
@@ -43,9 +44,14 @@ pub struct MultiValuedFastFieldWriter {
impl MultiValuedFastFieldWriter {
/// Creates a new `MultiValuedFastFieldWriter`
pub(crate) fn new(field: Field, fast_field_type: FastFieldType) -> Self {
pub(crate) fn new(
field: Field,
fast_field_type: FastFieldType,
precision_opt: Option<DatePrecision>,
) -> Self {
MultiValuedFastFieldWriter {
field,
precision_opt,
vals: Vec::new(),
doc_index: Vec::new(),
fast_field_type,
@@ -83,7 +89,14 @@ impl MultiValuedFastFieldWriter {
}
for field_value in doc.field_values() {
if field_value.field == self.field {
self.add_val(value_to_u64(field_value.value()));
let value = field_value.value();
let value_u64 = match (self.precision_opt, value) {
(Some(precision), Value::Date(date_val)) => {
date_val.truncate(precision).to_u64()
}
_ => value_to_u64(value),
};
self.add_val(value_u64);
}
}
}

View File

@@ -7,12 +7,13 @@ use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats;
use super::{FastFieldDataAccess, FastFieldType};
use super::{FastFieldDataAccess, FastFieldType, FastValue};
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema, Value};
use crate::termdict::TermOrdinal;
use crate::DatePrecision;
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
@@ -43,31 +44,51 @@ impl FastFieldsWriter {
FieldType::I64(ref int_options)
| FieldType::U64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Bool(ref int_options)
| FieldType::Date(ref int_options) => {
| FieldType::Bool(ref int_options) => {
match int_options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer = IntFastFieldWriter::new(field);
let mut fast_field_writer = IntFastFieldWriter::new(field, None);
let default_value = fast_field_default_value(field_entry);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::Numeric);
let fast_field_writer = MultiValuedFastFieldWriter::new(
field,
FastFieldType::Numeric,
None,
);
multi_values_writers.push(fast_field_writer);
}
None => {}
}
}
FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer =
IntFastFieldWriter::new(field, Some(options.get_precision()));
let default_value = fast_field_default_value(field_entry);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(
field,
FastFieldType::Numeric,
Some(options.get_precision()),
);
multi_values_writers.push(fast_field_writer);
}
None => {}
},
FieldType::Facet(_) => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet);
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet, None);
term_id_writers.push(fast_field_writer);
}
FieldType::Str(_) if field_entry.is_fast() => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::String);
MultiValuedFastFieldWriter::new(field, FastFieldType::String, None);
term_id_writers.push(fast_field_writer);
}
FieldType::Bytes(bytes_option) => {
@@ -230,6 +251,7 @@ impl FastFieldsWriter {
/// using `common::i64_to_u64` and `common::f64_to_u64`.
pub struct IntFastFieldWriter {
field: Field,
precision_opt: Option<DatePrecision>,
vals: BlockedBitpacker,
val_count: usize,
val_if_missing: u64,
@@ -239,9 +261,10 @@ pub struct IntFastFieldWriter {
impl IntFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub fn new(field: Field) -> IntFastFieldWriter {
pub fn new(field: Field, precision_opt: Option<DatePrecision>) -> IntFastFieldWriter {
IntFastFieldWriter {
field,
precision_opt,
vals: BlockedBitpacker::new(),
val_count: 0,
val_if_missing: 0u64,
@@ -305,7 +328,13 @@ impl IntFastFieldWriter {
pub fn add_document(&mut self, doc: &Document) {
match doc.get_first(self.field) {
Some(v) => {
self.add_val(super::value_to_u64(v));
let value = match (self.precision_opt, v) {
(Some(precision), Value::Date(date_val)) => {
date_val.truncate(precision).to_u64()
}
_ => super::value_to_u64(v),
};
self.add_val(value);
}
None => {
self.add_val(self.val_if_missing);

View File

@@ -8,7 +8,7 @@ use crate::schema::{Field, Type};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
use crate::tokenizer::TextAnalyzer;
use crate::{DateTime, DocId, Term};
use crate::{DatePrecision, DateTime, DocId, Term};
/// This object is a map storing the last position for a given path for the current document
/// being indexed.
@@ -323,9 +323,16 @@ impl<'a> JsonTermWriter<'a> {
pub fn set_fast_value<T: FastValue>(&mut self, val: T) {
self.close_path_and_set_type(T::to_type());
let value = if T::to_type() == Type::Date {
DateTime::from_u64(val.to_u64())
.truncate(DatePrecision::Seconds)
.to_u64()
} else {
val.to_u64()
};
self.term_buffer
.as_mut()
.extend_from_slice(val.to_u64().to_be_bytes().as_slice());
.extend_from_slice(value.to_be_bytes().as_slice());
}
#[cfg(test)]

View File

@@ -298,8 +298,16 @@ impl IndexMerger {
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::F64(ref options)
| FieldType::Bool(ref options)
| FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
| FieldType::Bool(ref options) => match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
self.write_single_fast_field(field, fast_field_serializer, doc_id_mapping)?;
}
Some(Cardinality::MultiValues) => {
self.write_multi_fast_field(field, fast_field_serializer, doc_id_mapping)?;
}
None => {}
},
FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
self.write_single_fast_field(field, fast_field_serializer, doc_id_mapping)?;
}

View File

@@ -14,7 +14,7 @@ use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer,
};
use crate::{DocId, Document, Opstamp, SegmentComponent};
use crate::{DatePrecision, DocId, Document, Opstamp, SegmentComponent};
/// Computes the initial size of the hash table.
///
@@ -248,7 +248,7 @@ impl SegmentWriter {
FieldType::Date(_) => {
for value in values {
let date_val = value.as_date().ok_or_else(make_schema_error)?;
term_buffer.set_u64(date_val.to_u64());
term_buffer.set_u64(date_val.truncate(DatePrecision::Seconds).to_u64());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
}
}

View File

@@ -133,7 +133,7 @@ pub use time;
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
/// A date/time value with second precision.
/// A date/time value with microsecond precision.
///
/// This timestamp does not carry any explicit time zone information.
/// Users are responsible for applying the provided conversion
@@ -145,13 +145,30 @@ use crate::time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
/// to prevent unintended usage.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct DateTime {
unix_timestamp: i64,
// Timestamp in microseconds.
pub(crate) timestamp_micros: i64,
}
impl DateTime {
/// Create new from UNIX timestamp
pub const fn from_unix_timestamp(unix_timestamp: i64) -> Self {
Self { unix_timestamp }
/// Builds a `DateTime` from a UNIX timestamp expressed in seconds.
///
/// The value is scaled by 1_000_000 to obtain the internal microsecond
/// timestamp.
pub const fn from_timestamp_secs(seconds: i64) -> Self {
    let timestamp_micros = seconds * 1_000_000;
    Self { timestamp_micros }
}
/// Builds a `DateTime` from a UNIX timestamp expressed in milliseconds.
///
/// The value is scaled by 1_000 to obtain the internal microsecond
/// timestamp.
pub const fn from_timestamp_millis(milliseconds: i64) -> Self {
    let timestamp_micros = milliseconds * 1_000;
    Self { timestamp_micros }
}
/// Builds a `DateTime` from a UNIX timestamp expressed in microseconds.
///
/// The value is stored as is, without any scaling.
pub const fn from_timestamp_micros(microseconds: i64) -> Self {
    let timestamp_micros = microseconds;
    Self { timestamp_micros }
}
/// Create new from `OffsetDateTime`
@@ -159,7 +176,8 @@ impl DateTime {
/// The given date/time is converted to UTC and the actual
/// time zone is discarded.
pub const fn from_utc(dt: OffsetDateTime) -> Self {
Self::from_unix_timestamp(dt.unix_timestamp())
let timestamp_micros = dt.unix_timestamp() as i64 * 1_000_000 + dt.microsecond() as i64;
Self { timestamp_micros }
}
/// Create new from `PrimitiveDateTime`
@@ -167,21 +185,30 @@ impl DateTime {
/// Implicitly assumes that the given date/time is in UTC!
/// Otherwise the original value must only be reobtained with
/// [`Self::into_primitive()`].
pub const fn from_primitive(dt: PrimitiveDateTime) -> Self {
pub fn from_primitive(dt: PrimitiveDateTime) -> Self {
Self::from_utc(dt.assume_utc())
}
/// Convert to UNIX timestamp
pub const fn into_unix_timestamp(self) -> i64 {
let Self { unix_timestamp } = self;
unix_timestamp
/// Convert to UNIX timestamp in seconds.
pub const fn into_timestamp_secs(self) -> i64 {
self.timestamp_micros / 1_000_000
}
/// Convert to UNIX timestamp in milliseconds.
pub const fn into_timestamp_millis(self) -> i64 {
self.timestamp_micros / 1_000
}
/// Convert to UNIX timestamp in microseconds.
pub const fn into_timestamp_micros(self) -> i64 {
self.timestamp_micros
}
/// Convert to UTC `OffsetDateTime`
pub fn into_utc(self) -> OffsetDateTime {
let Self { unix_timestamp } = self;
let utc_datetime =
OffsetDateTime::from_unix_timestamp(unix_timestamp).expect("valid UNIX timestamp");
let timestamp_nanos = self.timestamp_micros as i128 * 1000;
let utc_datetime = OffsetDateTime::from_unix_timestamp_nanos(timestamp_nanos)
.expect("valid UNIX timestamp");
debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
utc_datetime
}
@@ -201,6 +228,18 @@ impl DateTime {
debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
PrimitiveDateTime::new(utc_datetime.date(), utc_datetime.time())
}
/// Truncates the microseconds value to the corresponding precision.
///
/// The timestamp is rounded down (towards negative infinity) to a multiple of
/// the requested unit, so the sub-unit digits are discarded for timestamps
/// before the UNIX epoch as well. Plain integer division rounds towards zero
/// and would move e.g. `-1` microsecond forward to `0`.
pub(crate) fn truncate(self, precision: DatePrecision) -> Self {
    let micros_per_unit: i64 = match precision {
        DatePrecision::Seconds => 1_000_000,
        DatePrecision::Milliseconds => 1_000,
        // Already the finest precision that is stored: nothing to drop.
        DatePrecision::Microseconds => return self,
    };
    // `rem_euclid` is always within `0..micros_per_unit`, even for negative
    // timestamps, so subtracting it floors the value. The subtraction
    // saturates so that timestamps right next to `i64::MIN` cannot overflow.
    let sub_unit_micros = self.timestamp_micros.rem_euclid(micros_per_unit);
    Self {
        timestamp_micros: self.timestamp_micros.saturating_sub(sub_unit_micros),
    }
}
}
impl fmt::Debug for DateTime {
@@ -269,7 +308,7 @@ pub use crate::indexer::operation::UserOperation;
pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, PreparedCommit};
pub use crate::postings::Postings;
pub use crate::reader::LeasedItem;
pub use crate::schema::{Document, Term};
pub use crate::schema::{DateOptions, DatePrecision, Document, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 4;
@@ -385,6 +424,7 @@ pub mod tests {
use rand::distributions::{Bernoulli, Uniform};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use time::OffsetDateTime;
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader;
@@ -393,7 +433,7 @@ pub mod tests {
use crate::merge_policy::NoMergePolicy;
use crate::query::BooleanQuery;
use crate::schema::*;
use crate::{DocAddress, Index, Postings, ReloadPolicy};
use crate::{DateTime, DocAddress, Index, Postings, ReloadPolicy};
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
@@ -1102,4 +1142,35 @@ pub mod tests {
assert!(index.validate_checksum()?.is_empty());
Ok(())
}
/// Checks the conversions between `OffsetDateTime` and `DateTime`.
///
/// All assertions are deterministic: they do not depend on the sub-second
/// digits the system clock happens to return for `now`.
#[test]
fn test_datetime() {
    let now = OffsetDateTime::now_utc();

    // Round-tripping through `DateTime` keeps everything down to the microsecond.
    let dt = DateTime::from_utc(now).into_utc();
    assert_eq!(dt.to_ordinal_date(), now.to_ordinal_date());
    assert_eq!(dt.to_hms_micro(), now.to_hms_micro());
    // We don't store nanosecond level precision: the digits below the
    // microsecond come back as zeros.
    assert_eq!(dt.nanosecond(), now.microsecond() * 1_000);

    let dt = DateTime::from_timestamp_secs(now.unix_timestamp()).into_utc();
    assert_eq!(dt.to_ordinal_date(), now.to_ordinal_date());
    assert_eq!(dt.to_hms(), now.to_hms());
    // Constructed from a second precision: there is no sub-second part.
    assert_eq!(dt.nanosecond(), 0);

    let dt =
        DateTime::from_timestamp_micros((now.unix_timestamp_nanos() / 1_000) as i64).into_utc();
    assert_eq!(dt.to_ordinal_date(), now.to_ordinal_date());
    assert_eq!(dt.to_hms_micro(), now.to_hms_micro());

    // A large nanosecond timestamp (`u64::MAX`) still round-trips at
    // microsecond precision.
    let dt_from_ts_nanos =
        OffsetDateTime::from_unix_timestamp_nanos(18446744073709551615i128).unwrap();
    let offset_dt = DateTime::from_utc(dt_from_ts_nanos).into_utc();
    assert_eq!(
        dt_from_ts_nanos.to_ordinal_date(),
        offset_dt.to_ordinal_date()
    );
    assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
}
}

View File

@@ -243,13 +243,12 @@ impl MoreLikeThis {
}
FieldType::Date(_) => {
for value in values {
// TODO: Ask if this is the semantic (timestamp) we want
let unix_timestamp = value
let timestamp_micros = value
.as_date()
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
.into_unix_timestamp();
if !self.is_noise_word(unix_timestamp.to_string()) {
let term = Term::from_field_i64(field, unix_timestamp);
.into_timestamp_micros();
if !self.is_noise_word(timestamp_micros.to_string()) {
let term = Term::from_field_i64(field, timestamp_micros);
*term_frequencies.entry(term).or_insert(0) += 1;
}
}

View File

@@ -1068,7 +1068,6 @@ mod test {
#[test]
fn test_json_field_possibly_a_date() {
// Subseconds are discarded
test_parse_query_to_logical_ast_helper(
r#"json.date:"2019-10-12T07:20:50.52Z""#,
r#"(Term(type=Json, field=14, path=date, vtype=Date, 2019-10-12T07:20:50Z) "[(0, Term(type=Json, field=14, path=date, vtype=Str, "2019")), (1, Term(type=Json, field=14, path=date, vtype=Str, "10")), (2, Term(type=Json, field=14, path=date, vtype=Str, "12t07")), (3, Term(type=Json, field=14, path=date, vtype=Str, "20")), (4, Term(type=Json, field=14, path=date, vtype=Str, "50")), (5, Term(type=Json, field=14, path=date, vtype=Str, "52z"))]")"#,
@@ -1352,9 +1351,16 @@ mod test {
query_parser.parse_query("date:18a"),
Err(QueryParserError::DateFormatError(_))
);
assert!(query_parser
.parse_query("date:\"1985-04-12T23:20:50.52Z\"")
.is_ok());
test_parse_query_to_logical_ast_helper(
r#"date:"2010-11-21T09:55:06.000000000+02:00""#,
r#"Term(type=Date, field=9, 2010-11-21T07:55:06Z)"#,
true,
);
test_parse_query_to_logical_ast_helper(
r#"date:"1985-04-12T23:20:50.52Z""#,
r#"Term(type=Date, field=9, 1985-04-12T23:20:50Z)"#,
true,
);
}
#[test]

View File

@@ -0,0 +1,276 @@
use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::Cardinality;
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
/// DateTime Precision
///
/// Lists the precision levels that can be configured on a date field.
/// With serde, the variants are written in lowercase (e.g. `"milliseconds"`).
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum DatePrecision {
/// Seconds precision.
Seconds,
/// Milliseconds precision.
Milliseconds,
/// Microseconds precision.
Microseconds,
}
impl Default for DatePrecision {
fn default() -> Self {
DatePrecision::Seconds
}
}
/// Defines how DateTime field should be handled by tantivy.
///
/// The options are (de)serializable with serde. `fast` is left out of the
/// serialized form when it is `None`, and a missing `precision` falls back
/// to `DatePrecision::default()` on deserialization.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct DateOptions {
// Whether the field is indexed (and therefore searchable).
indexed: bool,
// This boolean has no effect if the field is not marked as indexed true.
fieldnorms: bool,
// Cardinality of the fast field, or `None` if the field is not a fast field.
#[serde(skip_serializing_if = "Option::is_none")]
fast: Option<Cardinality>,
// Whether the value is stored.
stored: bool,
// Internal storage precision, used to optimize storage
// compression on fast fields.
#[serde(default)]
precision: DatePrecision,
}
impl DateOptions {
    /// Returns true iff the value is stored.
    pub fn is_stored(&self) -> bool {
        self.stored
    }

    /// Returns true iff the value is indexed and therefore searchable.
    pub fn is_indexed(&self) -> bool {
        self.indexed
    }

    /// Returns true iff the field has fieldnorm.
    ///
    /// Fieldnorms are only reported for indexed fields: the flag has no
    /// effect when the field is not indexed.
    pub fn fieldnorms(&self) -> bool {
        self.fieldnorms && self.indexed
    }

    /// Returns true iff the value is a fast field and multivalue.
    pub fn is_multivalue_fast(&self) -> bool {
        matches!(self.fast, Some(Cardinality::MultiValues))
    }

    /// Returns true iff the value is a fast field.
    pub fn is_fast(&self) -> bool {
        self.fast.is_some()
    }

    /// Set the field as stored.
    ///
    /// Only the fields that are set as *stored* are
    /// persisted into the Tantivy's store.
    #[must_use]
    pub fn set_stored(mut self) -> DateOptions {
        self.stored = true;
        self
    }

    /// Set the field as indexed.
    ///
    /// Setting a date field as indexed will generate
    /// a posting list for each value taken by the field.
    ///
    /// This is required for the field to be searchable.
    #[must_use]
    pub fn set_indexed(mut self) -> DateOptions {
        self.indexed = true;
        self
    }

    /// Set the field with fieldnorm.
    ///
    /// Setting a date field as fieldnorm will generate
    /// the fieldnorm data for it. This only has an effect
    /// if the field is also indexed.
    #[must_use]
    pub fn set_fieldnorm(mut self) -> DateOptions {
        self.fieldnorms = true;
        self
    }

    /// Set the field as a fast field with the given cardinality.
    ///
    /// Fast fields are designed for random access.
    /// Access time are similar to a random lookup in an array.
    /// `cardinality` selects between a single-valued and a multi-valued
    /// fast field.
    #[must_use]
    pub fn set_fast(mut self, cardinality: Cardinality) -> DateOptions {
        self.fast = Some(cardinality);
        self
    }

    /// Returns the cardinality of the fastfield.
    ///
    /// If the field has not been declared as a fastfield, then
    /// the method returns None.
    pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
        self.fast
    }

    /// Sets the precision for this DateTime field.
    ///
    /// Internal storage precision, used to optimize storage
    /// compression on fast fields.
    #[must_use]
    pub fn set_precision(mut self, precision: DatePrecision) -> DateOptions {
        self.precision = precision;
        self
    }

    /// Returns the storage precision for this DateTime field.
    ///
    /// Internal storage precision, used to optimize storage
    /// compression on fast fields.
    pub fn get_precision(&self) -> DatePrecision {
        self.precision
    }
}
impl From<()> for DateOptions {
fn from(_: ()) -> DateOptions {
DateOptions::default()
}
}
impl From<FastFlag> for DateOptions {
    /// Builds options for a single-valued fast field; every other option
    /// keeps its default value (not indexed, no fieldnorms, not stored).
    fn from(_flag: FastFlag) -> Self {
        DateOptions {
            fast: Some(Cardinality::SingleValue),
            ..Default::default()
        }
    }
}
impl From<StoredFlag> for DateOptions {
    /// Builds options for a stored field; every other option keeps its
    /// default value (not indexed, no fieldnorms, not a fast field).
    fn from(_flag: StoredFlag) -> Self {
        DateOptions {
            stored: true,
            ..Default::default()
        }
    }
}
impl From<IndexedFlag> for DateOptions {
    /// Builds options for an indexed field with fieldnorms; every other
    /// option keeps its default value (not stored, not a fast field).
    fn from(_flag: IndexedFlag) -> Self {
        DateOptions {
            indexed: true,
            fieldnorms: true,
            ..Default::default()
        }
    }
}
impl<T: Into<DateOptions>> BitOr<T> for DateOptions {
    type Output = DateOptions;

    /// Merges two sets of options.
    ///
    /// Boolean flags are OR-ed together, the fast cardinality of the left
    /// operand wins when it is set, and the precision is always taken from
    /// the left operand.
    fn bitor(self, other: T) -> DateOptions {
        let rhs: DateOptions = other.into();
        let DateOptions {
            indexed,
            fieldnorms,
            stored,
            fast,
            precision,
        } = self;
        DateOptions {
            indexed: indexed | rhs.indexed,
            fieldnorms: fieldnorms | rhs.fieldnorms,
            stored: stored | rhs.stored,
            fast: fast.or(rhs.fast),
            precision,
        }
    }
}
impl<Head, Tail> From<SchemaFlagList<Head, Tail>> for DateOptions
where
    Head: Clone,
    Tail: Clone,
    Self: BitOr<Output = Self> + From<Head> + From<Tail>,
{
    /// Converts the head and the tail of the flag list separately, then
    /// merges the two resulting options with `|`.
    fn from(head_tail: SchemaFlagList<Head, Tail>) -> Self {
        let head_options = Self::from(head_tail.head);
        let tail_options = Self::from(head_tail.tail);
        head_options | tail_options
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// JSON without `fast` and `precision` deserializes to the default options.
    #[test]
    fn test_date_options_consistent_with_default() {
        let json = r#"{
"indexed": false,
"fieldnorms": false,
"stored": false
}"#;
        let parsed: DateOptions = serde_json::from_str(json).unwrap();
        assert_eq!(parsed, DateOptions::default());
    }

    /// Options survive a JSON round trip; an unset `fast` is not serialized.
    #[test]
    fn test_serialize_date_option() {
        let json = r#"
{
"indexed": true,
"fieldnorms": false,
"stored": false,
"precision": "milliseconds"
}"#;
        let parsed = serde_json::from_str::<DateOptions>(json).unwrap();
        let reserialized = serde_json::to_value(&parsed).unwrap();
        let expected = serde_json::json!({
            "precision": "milliseconds",
            "indexed": true,
            "fieldnorms": false,
            "stored": false
        });
        assert_eq!(reserialized, expected);
    }

    /// Values of the wrong type and unknown precision names are rejected.
    #[test]
    fn test_deserialize_date_options_with_wrong_options() {
        let wrong_type_err = serde_json::from_str::<DateOptions>(
            r#"{
"indexed": true,
"fieldnorms": false,
"stored": "wrong_value"
}"#,
        )
        .unwrap_err();
        assert!(wrong_type_err.to_string().contains("expected a boolean"));

        let unknown_precision_err = serde_json::from_str::<DateOptions>(
            r#"{
"indexed": true,
"fieldnorms": false,
"stored": false,
"precision": "hours"
}"#,
        )
        .unwrap_err();
        assert!(unknown_precision_err
            .to_string()
            .contains("unknown variant `hours`"));
    }
}

View File

@@ -2,7 +2,8 @@ use serde::{Deserialize, Serialize};
use crate::schema::bytes_options::BytesOptions;
use crate::schema::{
is_valid_field_name, FacetOptions, FieldType, JsonObjectOptions, NumericOptions, TextOptions,
is_valid_field_name, DateOptions, FacetOptions, FieldType, JsonObjectOptions, NumericOptions,
TextOptions,
};
/// A `FieldEntry` represents a field and its configuration.
@@ -55,7 +56,7 @@ impl FieldEntry {
}
/// Creates a new date field entry.
pub fn new_date(field_name: String, date_options: NumericOptions) -> FieldEntry {
pub fn new_date(field_name: String, date_options: DateOptions) -> FieldEntry {
Self::new(field_name, FieldType::Date(date_options))
}
@@ -107,8 +108,8 @@ impl FieldEntry {
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::F64(ref options)
| FieldType::Date(ref options)
| FieldType::Bool(ref options) => options.is_stored(),
FieldType::Date(ref options) => options.is_stored(),
FieldType::Str(ref options) => options.is_stored(),
FieldType::Facet(ref options) => options.is_stored(),
FieldType::Bytes(ref options) => options.is_stored(),

View File

@@ -5,8 +5,8 @@ use thiserror::Error;
use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::{
Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing, TextOptions,
Value,
DateOptions, Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing,
TextOptions, Value,
};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
@@ -27,6 +27,11 @@ pub enum ValueParsingError {
expected: &'static str,
json: serde_json::Value,
},
#[error("Parse error on {json}: {error}")]
ParseError {
error: String,
json: serde_json::Value,
},
#[error("Invalid base64: {base64}")]
InvalidBase64 { base64: String },
}
@@ -133,7 +138,7 @@ pub enum FieldType {
/// Bool field type configuration
Bool(NumericOptions),
/// Signed 64-bit date field type configuration
Date(NumericOptions),
Date(DateOptions),
/// Hierarchical Facet
Facet(FacetOptions),
/// Bytes (one per document)
@@ -202,8 +207,8 @@ impl FieldType {
FieldType::U64(ref int_options)
| FieldType::I64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Date(ref int_options)
| FieldType::Bool(ref int_options) => int_options.get_fastfield_cardinality().is_some(),
| FieldType::Bool(ref int_options) => int_options.is_fast(),
FieldType::Date(ref date_options) => date_options.is_fast(),
FieldType::Facet(_) => true,
FieldType::JsonObject(_) => false,
}
@@ -219,8 +224,8 @@ impl FieldType {
FieldType::U64(ref int_options)
| FieldType::I64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Date(ref int_options)
| FieldType::Bool(ref int_options) => int_options.fieldnorms(),
FieldType::Date(ref date_options) => date_options.fieldnorms(),
FieldType::Facet(_) => false,
FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
FieldType::JsonObject(ref _json_object_options) => false,
@@ -243,7 +248,6 @@ impl FieldType {
FieldType::U64(ref int_options)
| FieldType::I64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Date(ref int_options)
| FieldType::Bool(ref int_options) => {
if int_options.is_indexed() {
Some(IndexRecordOption::Basic)
@@ -251,6 +255,13 @@ impl FieldType {
None
}
}
FieldType::Date(ref date_options) => {
if date_options.is_indexed() {
Some(IndexRecordOption::Basic)
} else {
None
}
}
FieldType::Facet(ref _facet_options) => Some(IndexRecordOption::Basic),
FieldType::Bytes(ref bytes_options) => {
if bytes_options.is_indexed() {
@@ -273,7 +284,7 @@ impl FieldType {
pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> {
match json {
JsonValue::String(field_text) => {
match *self {
match self {
FieldType::Date(_) => {
let dt_with_fixed_tz = OffsetDateTime::parse(&field_text, &Rfc3339)
.map_err(|_err| ValueParsingError::TypeError {
@@ -402,8 +413,8 @@ mod tests {
let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
let doc = schema.parse_document(doc_json).unwrap();
let date = doc.get_first(date_field).unwrap();
// Time zone is converted to UTC and subseconds are discarded
assert_eq!("Date(2019-10-12T05:20:50Z)", format!("{:?}", date));
// Time zone is converted to UTC
assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{:?}", date));
}
#[test]

View File

@@ -1,6 +1,7 @@
use std::ops::BitOr;
use crate::schema::{NumericOptions, TextOptions};
use crate::DateOptions;
#[derive(Clone)]
pub struct StoredFlag;
@@ -65,6 +66,14 @@ impl<T: Clone + Into<NumericOptions>> BitOr<NumericOptions> for SchemaFlagList<T
}
}
/// Allows a single-flag list (`SchemaFlagList<T, ()>`) to be combined with
/// explicit `DateOptions` using the `|` operator.
impl<T> BitOr<DateOptions> for SchemaFlagList<T, ()>
where
    T: Clone + Into<DateOptions>,
{
    type Output = DateOptions;

    /// Converts the flag stored in `head` into `DateOptions`, then ORs the
    /// result with `rhs`.
    fn bitor(self, rhs: DateOptions) -> Self::Output {
        let head_options: DateOptions = self.head.into();
        head_options | rhs
    }
}
impl<T: Clone + Into<TextOptions>> BitOr<TextOptions> for SchemaFlagList<T, ()> {
type Output = TextOptions;

View File

@@ -117,6 +117,7 @@ mod field_type;
mod field_value;
mod bytes_options;
mod date_time_options;
mod field;
mod flags;
mod index_record_option;
@@ -127,6 +128,7 @@ mod text_options;
mod value;
pub use self::bytes_options::BytesOptions;
pub use self::date_time_options::{DateOptions, DatePrecision};
pub use self::document::Document;
pub(crate) use self::facet::FACET_SEP_BYTE;
pub use self::facet::{Facet, FacetParseError};

View File

@@ -134,7 +134,7 @@ impl SchemaBuilder {
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_date_field<T: Into<NumericOptions>>(
pub fn add_date_field<T: Into<DateOptions>>(
&mut self,
field_name_str: &str,
field_options: T,
@@ -813,7 +813,7 @@ mod tests {
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic),
);
let timestamp_options = NumericOptions::default()
let timestamp_options = DateOptions::default()
.set_stored()
.set_indexed()
.set_fieldnorm()
@@ -875,7 +875,8 @@ mod tests {
"indexed": true,
"fieldnorms": true,
"fast": "single",
"stored": true
"stored": true,
"precision": "seconds"
}
},
{

View File

@@ -5,7 +5,7 @@ use std::{fmt, str};
use super::Field;
use crate::fastfield::FastValue;
use crate::schema::{Facet, Type};
use crate::DateTime;
use crate::{DatePrecision, DateTime};
/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
/// <field> + <type byte> + <value len>
@@ -76,7 +76,7 @@ impl Term {
/// Builds a term given a field, and a DateTime value
pub fn from_field_date(field: Field, val: DateTime) -> Term {
Term::from_fast_value(field, &val)
Term::from_fast_value(field, &val.truncate(DatePrecision::Seconds))
}
/// Creates a `Term` given a facet.

View File

@@ -24,7 +24,7 @@ pub enum Value {
F64(f64),
/// Bool value
Bool(bool),
/// Date/time with second precision
/// Date/time with microseconds precision
Date(DateTime),
/// Facet
Facet(Facet),
@@ -251,7 +251,7 @@ impl<'a> From<&'a [u8]> for Value {
}
}
impl<'a> From<Facet> for Value {
impl From<Facet> for Value {
fn from(facet: Facet) -> Value {
Value::Facet(facet)
}
@@ -348,8 +348,10 @@ mod binary_serialize {
}
Value::Date(ref val) => {
DATE_CODE.serialize(writer)?;
let DateTime { unix_timestamp } = val;
unix_timestamp.serialize(writer)
let DateTime {
timestamp_micros, ..
} = val;
timestamp_micros.serialize(writer)
}
Value::Facet(ref facet) => {
HIERARCHICAL_FACET_CODE.serialize(writer)?;
@@ -391,8 +393,10 @@ mod binary_serialize {
Ok(Value::Bool(value))
}
DATE_CODE => {
let unix_timestamp = i64::deserialize(reader)?;
Ok(Value::Date(DateTime::from_unix_timestamp(unix_timestamp)))
let timestamp_micros = i64::deserialize(reader)?;
Ok(Value::Date(DateTime::from_timestamp_micros(
timestamp_micros,
)))
}
HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),