WIP: date field (#487)

* initial version, still a work in progress

* remove redundant or

* add chrono::DateTime and index i64

* add more tests

* fix tests

* pass DateTime by ptr

* remove println!

* document query_parser rfc 3339 date support

* added some more docs about implementation to schema.rs

* enforce DateTime is UTC, and re-export chrono

* added DateField to changelog

* fixed conflict

* use INDEXED instead of INT_INDEXED for date fields
This commit is contained in:
barrotsteindev
2019-03-15 15:10:37 +02:00
committed by Paul Masurel
parent 94f1885334
commit a934577168
14 changed files with 259 additions and 10 deletions

View File

@@ -11,6 +11,7 @@ previous index format.*
In addition, adds are guaranteed to happen on the same segment. (@elbow-jason)
- Removed `INT_STORED` and `INT_INDEXED`. It is now possible to use `STORED` and `INDEXED`
for int fields. (@fulmicoton)
- Added DateTime field (@barrotsteindev)
Tantivy 0.8.2

View File

@@ -50,6 +50,7 @@ htmlescape = "0.3.1"
fail = "0.2"
scoped-pool = "1.0"
murmurhash32 = "0.2"
chrono = "0.4"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -58,6 +59,7 @@ winapi = "0.2"
rand = "0.6"
maplit = "1"
matches = "0.1.8"
time = "0.1.42"
[profile.release]
opt-level = 3

View File

@@ -7,11 +7,16 @@ pub use self::writer::MultiValueIntFastFieldWriter;
#[cfg(test)]
mod tests {
extern crate time;
use query::QueryParser;
use collector::TopDocs;
use schema::Cardinality;
use schema::Facet;
use schema::IntOptions;
use schema::Schema;
use Index;
use self::time::Duration;
#[test]
fn test_multivalued_u64() {
@@ -48,6 +53,104 @@ mod tests {
}
}
#[test]
fn test_multivalued_date() {
    // Schema: a multi-valued, indexed and stored date field, plus a stored
    // i64 marker that lets the assertions tell the documents apart.
    let mut schema_builder = Schema::builder();
    let date_options = IntOptions::default()
        .set_fast(Cardinality::MultiValues)
        .set_indexed()
        .set_stored();
    let date_field = schema_builder.add_date_field("multi_date_field", date_options);
    let marker_options = IntOptions::default().set_stored();
    let time_i = schema_builder.add_i64_field("time_stamp_i", marker_options);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();

    let first_time_stamp = chrono::Utc::now();
    let one_sec_ahead = first_time_stamp + Duration::seconds(1);
    let two_secs_ahead = first_time_stamp + Duration::seconds(2);

    // Four documents: two date values, no date at all, one date value, three date values.
    index_writer.add_document(doc!(
        date_field => first_time_stamp,
        date_field => first_time_stamp,
        time_i => 1i64
    ));
    index_writer.add_document(doc!(time_i => 0i64));
    index_writer.add_document(doc!(date_field => one_sec_ahead, time_i => 2i64));
    index_writer.add_document(doc!(
        date_field => two_secs_ahead,
        date_field => two_secs_ahead,
        date_field => two_secs_ahead,
        time_i => 3i64
    ));
    assert!(index_writer.commit().is_ok());

    index.load_searchers().unwrap();
    let searcher = index.searcher();
    let reader = searcher.segment_reader(0);
    assert_eq!(reader.num_docs(), 4);

    // Each (queried date, expected marker) pair must match exactly one document.
    let expectations = [(first_time_stamp, 1i64), (two_secs_ahead, 3i64)];
    for &(queried_date, expected_marker) in expectations.iter() {
        let parser = QueryParser::for_index(&index, vec![date_field]);
        let quoted_date = format!("\"{}\"", queried_date.to_rfc3339());
        let query = parser
            .parse_query(&quoted_date)
            .expect("could not parse query");
        let results = searcher
            .search(&query, &TopDocs::with_limit(5))
            .expect("could not query index");
        assert_eq!(results.len(), 1);
        for (_score, doc_address) in results {
            let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
            let stored_date = retrieved_doc
                .get_first(date_field)
                .expect("cannot find value")
                .date_value();
            assert_eq!(stored_date.timestamp(), queried_date.timestamp());
            let stored_marker = retrieved_doc
                .get_first(time_i)
                .expect("cannot find value")
                .i64_value();
            assert_eq!(stored_marker, expected_marker);
        }
    }

    // TODO: support Date range queries.
    // Once they are supported, extend this test with a range query of the form
    // `"<first_time_stamp + 1s>".."<first_time_stamp + 3s>"` (RFC 3339 bounds)
    // and assert that it returns the two documents whose `time_i` is 2 and 3.
}
#[test]
fn test_multivalued_i64() {
let mut schema_builder = Schema::builder();

View File

@@ -194,7 +194,7 @@ impl IndexMerger {
fast_field_serializer,
)?;
}
FieldType::U64(ref options) | FieldType::I64(ref options) => {
FieldType::U64(ref options) | FieldType::I64(ref options) | FieldType::Date(ref options) => {
match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
self.write_single_fast_field(field, fast_field_serializer)?;
@@ -671,10 +671,12 @@ mod tests {
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let date_field = schema_builder.add_date_field("date", INDEXED);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build());
let curr_time = chrono::Utc::now();
let add_score_bytes = |doc: &mut Document, score: u32| {
let mut bytes = Vec::new();
@@ -692,6 +694,7 @@ mod tests {
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_u64(score_field, 3);
doc.add_date(date_field, &curr_time);
add_score_bytes(&mut doc, 3);
index_writer.add_document(doc);
}
@@ -717,6 +720,7 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_date(date_field, &curr_time);
doc.add_u64(score_field, 11);
add_score_bytes(&mut doc, 11);
index_writer.add_document(doc);
@@ -774,6 +778,13 @@ mod tests {
DocAddress(0, 4)
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)]),
vec![
DocAddress(0, 0),
DocAddress(0, 3)
]
);
}
{
let doc = searcher.doc(DocAddress(0, 0)).unwrap();

View File

@@ -171,6 +171,17 @@ impl SegmentWriter {
}
}
}
FieldType::Date(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(
field_value.field(),
field_value.value().date_value().timestamp(),
);
self.multifield_postings.subscribe(doc_id, &term);
}
}
}
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {

View File

@@ -187,10 +187,14 @@ pub use error::TantivyError as Error;
extern crate census;
extern crate owned_read;
pub extern crate chrono;
/// Tantivy result.
pub type Result<T> = std::result::Result<T, error::TantivyError>;
/// Tantivy DateTime
pub type DateTime = chrono::DateTime<chrono::Utc>;
mod common;
mod core;
mod indexer;

View File

@@ -33,7 +33,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
}
})
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) | FieldType::HierarchicalFacet => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
FieldType::Bytes => {
@@ -148,7 +148,7 @@ impl MultiFieldPostingsWriter {
.collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_) | FieldType::I64(_) => {}
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => {}
FieldType::Bytes => {}
}

View File

@@ -50,6 +50,8 @@ pub enum QueryParserError {
/// The query contains a range query with a phrase as one of the bounds.
/// Only terms can be used as bounds.
RangeMustNotHavePhrase,
/// The format for the date field is not RFC 3339 compliant.
DateFormatError(chrono::ParseError),
}
impl From<ParseIntError> for QueryParserError {
@@ -58,6 +60,12 @@ impl From<ParseIntError> for QueryParserError {
}
}
/// Wraps a `chrono::ParseError` into `QueryParserError::DateFormatError`,
/// which also lets `?` perform the conversion.
impl From<chrono::ParseError> for QueryParserError {
    fn from(parse_error: chrono::ParseError) -> Self {
        QueryParserError::DateFormatError(parse_error)
    }
}
/// Recursively remove empty clause from the AST
///
/// Returns `None` iff the `logical_ast` ended up being empty.
@@ -127,6 +135,8 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
/// a word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound).
/// Inclusive bounds are `[]`, exclusive are `{}`.
///
/// * date values: The query parser supports RFC 3339 formatted dates, for example `"2002-10-02T15:00:00.05Z"`.
///
/// * all docs query: A plain `*` will match all documents in the index.
///
#[derive(Clone)]
@@ -229,6 +239,12 @@ impl QueryParser {
let term = Term::from_field_i64(field, val);
Ok(vec![(0, term)])
}
FieldType::Date(_) => {
match chrono::DateTime::parse_from_rfc3339(phrase) {
Ok(x) => Ok(vec![(0, Term::from_field_date(field, &x.with_timezone(&chrono::Utc)))]),
Err(e) => Err(QueryParserError::DateFormatError(e))
}
}
FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
let term = Term::from_field_u64(field, val);
@@ -508,6 +524,7 @@ mod test {
schema_builder.add_text_field("notindexed_i64", STORED);
schema_builder.add_text_field("nottokenized", STRING);
schema_builder.add_text_field("with_stop_words", text_options);
schema_builder.add_date_field("date", INDEXED);
let schema = schema_builder.build();
let default_fields = vec![title, text];
let tokenizer_manager = TokenizerManager::default();
@@ -767,6 +784,16 @@ mod test {
);
}
#[test]
pub fn test_query_parser_expected_date() {
    let query_parser = make_query_parser();

    // Something that is not an RFC 3339 date must yield a date format error.
    let malformed = query_parser.parse_query("date:18a");
    assert_matches!(malformed, Err(QueryParserError::DateFormatError(_)));

    // A quoted RFC 3339 timestamp is accepted.
    let well_formed = query_parser.parse_query("date:\"1985-04-12T23:20:50.52Z\"");
    assert!(well_formed.is_ok());
}
#[test]
pub fn test_query_parser_not_empty_but_no_tokens() {
let query_parser = make_query_parser();

View File

@@ -3,6 +3,7 @@ use common::BinarySerializable;
use common::VInt;
use itertools::Itertools;
use std::io::{self, Read, Write};
use DateTime;
/// Tantivy's Document is the object that can
/// be indexed and then searched for.
@@ -82,11 +83,16 @@ impl Document {
self.add(FieldValue::new(field, Value::U64(value)));
}
/// Add a u64 field
/// Add a i64 field
pub fn add_i64(&mut self, field: Field, value: i64) {
self.add(FieldValue::new(field, Value::I64(value)));
}
/// Add a date field.
///
/// The referenced `DateTime` is copied into the document.
pub fn add_date(&mut self, field: Field, value: &DateTime) {
    let date_value = Value::Date(*value);
    self.add(FieldValue::new(field, date_value));
}
/// Add a bytes field
pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
self.add(FieldValue::new(field, Value::Bytes(value)))

View File

@@ -48,6 +48,15 @@ impl FieldEntry {
}
}
/// Creates a new date field entry in the schema, given
/// a name, and some options.
///
/// The options are wrapped in `FieldType::Date`.
pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
    let field_type = FieldType::Date(field_type);
    FieldEntry {
        name: field_name,
        field_type,
    }
}
/// Creates a field entry for a facet.
pub fn new_facet(field_name: String) -> FieldEntry {
FieldEntry {
@@ -78,7 +87,7 @@ impl FieldEntry {
pub fn is_indexed(&self) -> bool {
match self.field_type {
FieldType::Str(ref options) => options.get_indexing_options().is_some(),
FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_indexed(),
FieldType::U64(ref options) | FieldType::I64(ref options) | FieldType::Date(ref options) => options.is_indexed(),
FieldType::HierarchicalFacet => true,
FieldType::Bytes => false,
}
@@ -95,7 +104,7 @@ impl FieldEntry {
/// Returns true iff the field is stored
pub fn is_stored(&self) -> bool {
match self.field_type {
FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_stored(),
FieldType::U64(ref options) | FieldType::I64(ref options) | FieldType::Date(ref options) => options.is_stored(),
FieldType::Str(ref options) => options.is_stored(),
// TODO make stored hierarchical facet optional
FieldType::HierarchicalFacet => true,
@@ -125,6 +134,10 @@ impl Serialize for FieldEntry {
s.serialize_field("type", "i64")?;
s.serialize_field("options", options)?;
}
FieldType::Date(ref options) => {
s.serialize_field("type", "date")?;
s.serialize_field("options", options)?;
}
FieldType::HierarchicalFacet => {
s.serialize_field("type", "hierarchical_facet")?;
}
@@ -188,7 +201,7 @@ impl<'de> Deserialize<'de> for FieldEntry {
"bytes" => {
field_type = Some(FieldType::Bytes);
}
"text" | "u64" | "i64" => {
"text" | "u64" | "i64" | "date" => {
// These types require additional options to create a field_type
}
_ => panic!("unhandled type"),
@@ -205,6 +218,7 @@ impl<'de> Deserialize<'de> for FieldEntry {
"text" => field_type = Some(FieldType::Str(map.next_value()?)),
"u64" => field_type = Some(FieldType::U64(map.next_value()?)),
"i64" => field_type = Some(FieldType::I64(map.next_value()?)),
"date" => field_type = Some(FieldType::Date(map.next_value()?)),
_ => {
let msg = format!("Unrecognised type {}", ty);
return Err(de::Error::custom(msg));

View File

@@ -34,6 +34,8 @@ pub enum Type {
U64,
/// `i64`
I64,
/// `date(i64) timestamp`
Date,
/// `tantivy::schema::Facet`. Passed as a string in JSON.
HierarchicalFacet,
/// `Vec<u8>`
@@ -50,6 +52,8 @@ pub enum FieldType {
U64(IntOptions),
/// Signed 64-bits integers 64 field type configuration
I64(IntOptions),
/// Date field type configuration (dates are stored as signed 64-bit timestamps)
Date(IntOptions),
/// Hierarchical Facet
HierarchicalFacet,
/// Bytes (one per document)
@@ -63,6 +67,7 @@ impl FieldType {
FieldType::Str(_) => Type::Str,
FieldType::U64(_) => Type::U64,
FieldType::I64(_) => Type::I64,
FieldType::Date(_) => Type::Date,
FieldType::HierarchicalFacet => Type::HierarchicalFacet,
FieldType::Bytes => Type::Bytes,
}
@@ -75,6 +80,7 @@ impl FieldType {
FieldType::U64(ref int_options) | FieldType::I64(ref int_options) => {
int_options.is_indexed()
}
FieldType::Date(ref date_options) => date_options.is_indexed(),
FieldType::HierarchicalFacet => true,
FieldType::Bytes => false,
}
@@ -89,7 +95,7 @@ impl FieldType {
FieldType::Str(ref text_options) => text_options
.get_indexing_options()
.map(|indexing_options| indexing_options.index_option()),
FieldType::U64(ref int_options) | FieldType::I64(ref int_options) => {
FieldType::U64(ref int_options) | FieldType::I64(ref int_options) | FieldType::Date(ref int_options) => {
if int_options.is_indexed() {
Some(IndexRecordOption::Basic)
} else {
@@ -110,7 +116,7 @@ impl FieldType {
match *json {
JsonValue::String(ref field_text) => match *self {
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
FieldType::U64(_) | FieldType::I64(_) => Err(ValueParsingError::TypeError(
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => Err(ValueParsingError::TypeError(
format!("Expected an integer, got {:?}", json),
)),
FieldType::HierarchicalFacet => Ok(Value::Facet(Facet::from(field_text))),
@@ -122,7 +128,7 @@ impl FieldType {
}),
},
JsonValue::Number(ref field_val_num) => match *self {
FieldType::I64(_) => {
FieldType::I64(_) | FieldType::Date(_) => {
if let Some(field_val_i64) = field_val_num.as_i64() {
Ok(Value::I64(field_val_i64))
} else {

View File

@@ -82,6 +82,28 @@ impl SchemaBuilder {
self.add_field(field_entry)
}
/// Adds a new date field.
/// Returns the associated field handle.
///
/// The user supplies `DateTime` values for convenience, while
/// internally Tantivy simply stores dates as i64 UTC timestamps.
///
/// # Caution
///
/// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_date_field<T: Into<IntOptions>>(
    &mut self,
    field_name_str: &str,
    field_options: T,
) -> Field {
    let date_entry = FieldEntry::new_date(field_name_str.to_owned(), field_options.into());
    self.add_field(date_entry)
}
/// Adds a new text field.
/// Returns the associated field handle
///

View File

@@ -5,6 +5,7 @@ use byteorder::{BigEndian, ByteOrder};
use common;
use schema::Facet;
use std::str;
use DateTime;
/// Size (in bytes) of the buffer of a int field.
const INT_TERM_LEN: usize = 4 + 8;
@@ -30,6 +31,18 @@ impl Term {
Term::from_field_u64(field, val_u64)
}
/// Builds a term given a field, and a `DateTime` value.
///
/// The date is reduced to its `i64` timestamp and handed to
/// `Term::from_field_i64`, so a date term is encoded like any other
/// int term, whose buffer is `INT_TERM_LEN` (4 + 8 = 12) bytes long:
///
/// * the first 4 bytes are dedicated to storing the field id;
/// * the following 8 bytes encode the timestamp.
///
/// Only the timestamp in whole seconds is kept, so two `DateTime` values
/// falling within the same second produce the same term.
pub fn from_field_date(field: Field, val: &DateTime) -> Term {
    Term::from_field_i64(field, val.timestamp())
}
/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let bytes = facet.encoded_str().as_bytes();

View File

@@ -2,6 +2,7 @@ use schema::Facet;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::fmt;
use DateTime;
/// Value represents the value of any field.
/// It is an enum over all of the possible field types.
@@ -13,6 +14,8 @@ pub enum Value {
U64(u64),
/// Signed 64-bits Integer `i64`
I64(i64),
/// UTC date and time (`DateTime`), serialized as a signed 64-bit timestamp
Date(DateTime),
/// Hierarchical Facet
Facet(Facet),
/// Arbitrarily sized byte array
@@ -28,6 +31,7 @@ impl Serialize for Value {
Value::Str(ref v) => serializer.serialize_str(v),
Value::U64(u) => serializer.serialize_u64(u),
Value::I64(u) => serializer.serialize_i64(u),
Value::Date(ref date) => serializer.serialize_i64(date.timestamp()),
Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
}
@@ -102,6 +106,17 @@ impl Value {
_ => panic!("This is not a text field."),
}
}
/// Returns a reference to the wrapped `DateTime`, provided the value
/// is of the `Date` type.
///
/// # Panics
/// If the value is not of type `Date`
pub fn date_value(&self) -> &DateTime {
    if let Value::Date(ref date_time) = *self {
        date_time
    } else {
        panic!("This is not a date field.")
    }
}
}
impl From<String> for Value {
@@ -122,6 +137,10 @@ impl From<i64> for Value {
}
}
/// Wraps a `DateTime` into the `Value::Date` variant.
impl From<DateTime> for Value {
    fn from(date_time: DateTime) -> Self {
        Value::Date(date_time)
    }
}
impl<'a> From<&'a str> for Value {
fn from(s: &'a str) -> Value {
Value::Str(s.to_string())
@@ -145,12 +164,14 @@ mod binary_serialize {
use common::BinarySerializable;
use schema::Facet;
use std::io::{self, Read, Write};
use chrono::{Utc, TimeZone};
const TEXT_CODE: u8 = 0;
const U64_CODE: u8 = 1;
const I64_CODE: u8 = 2;
const HIERARCHICAL_FACET_CODE: u8 = 3;
const BYTES_CODE: u8 = 4;
const DATE_CODE: u8 = 5;
impl BinarySerializable for Value {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
@@ -167,6 +188,10 @@ mod binary_serialize {
I64_CODE.serialize(writer)?;
val.serialize(writer)
}
Value::Date(ref val) => {
DATE_CODE.serialize(writer)?;
val.timestamp().serialize(writer)
}
Value::Facet(ref facet) => {
HIERARCHICAL_FACET_CODE.serialize(writer)?;
facet.serialize(writer)
@@ -192,6 +217,10 @@ mod binary_serialize {
let value = i64::deserialize(reader)?;
Ok(Value::I64(value))
}
DATE_CODE=> {
let timestamp = i64::deserialize(reader)?;
Ok(Value::Date(Utc.timestamp(timestamp, 0)))
}
HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),
_ => Err(io::Error::new(