add support for float (#603)

* add basic support for float

as with i64, values are mapped to u64 for indexing
the query parser doesn't work yet

* Update value.rs

* implement support for float in query parser

* Update README.md
This commit is contained in:
fdb-hiroshima
2019-07-27 10:57:33 +02:00
committed by Paul Masurel
parent c3231ca252
commit 6eb4e08636
25 changed files with 545 additions and 46 deletions

View File

@@ -1,3 +1,8 @@
Tantivy 0.11.0
=====================
- Added f64 field. Internally reuses the u64 code, the same way i64 does (@fdb-hiroshima)
Tantivy 0.10.0
=====================

View File

@@ -50,9 +50,9 @@ performance for different type of queries / collection.
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- Mmap directory
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
- Single valued and multivalued u64, i64 and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, dates and hierarchical facet fields
- Text, i64, u64, f64, dates and hierarchical facet fields
- LZ4 compressed document store
- Range queries
- Faceted search

View File

@@ -82,6 +82,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
let num_field_f64 = schema_builder.add_f64_field("num_f64", FAST);
let text_field = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
@@ -94,6 +95,7 @@ mod tests {
index_writer.add_document(doc!(
num_field_i64 => ((i as i64) % 3i64) as i64,
num_field_u64 => (i % 2u64) as u64,
num_field_f64 => (i % 4u64) as f64,
text_field => "text"
));
}
@@ -104,10 +106,11 @@ mod tests {
let searcher = index.reader().searcher();
let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);
let mut ffvf_f64: IntFacetCollector<F64FastFieldReader> = IntFacetCollector::new(num_field_f64);
{
// perform the query
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64).push(&mut ffvf_f64);
let mut query_parser = QueryParser::for_index(index, vec![text_field]);
let query = query_parser.parse_query("text:text").unwrap();
query.search(&searcher, &mut facet_collectors).unwrap();
@@ -117,6 +120,8 @@ mod tests {
assert_eq!(ffvf_u64.counters[&1], 5);
assert_eq!(ffvf_i64.counters[&0], 4);
assert_eq!(ffvf_i64.counters[&1], 3);
assert_eq!(ffvf_f64.counters[&0.0], 3);
assert_eq!(ffvf_f64.counters[&2.0], 2);
}
}

View File

@@ -160,6 +160,7 @@ impl TopDocs {
.fast_fields()
.u64(field)
.expect("Field requested is not a i64/u64 fast field.");
// TODO: the error message does not match the actual behavior for i64
move |doc: DocId| ff_reader.get(doc)
})
}

View File

@@ -99,15 +99,54 @@ pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
/// Converts a `f64` into the `u64` representation tantivy uses internally.
///
/// For simplicity, tantivy handles `f64` values as `u64`; this function
/// defines the mapping.
///
/// The mapping preserves numerical order: for non-NaN values, `a < b`
/// implies `f64_to_u64(a) < f64_to_u64(b)`.
///
/// This is more suited than a plain cast (`val as u64`), which would
/// truncate the value.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline(always)]
pub fn f64_to_u64(val: f64) -> u64 {
    // Sign bit clear: set the highest bit, which moves the value into the
    // upper half of the `u64` range.
    // Sign bit set: flip every bit, which moves the value into the lower
    // half and reverses the ordering of the magnitudes.
    let flip_mask = if val.is_sign_negative() {
        !0u64
    } else {
        HIGHEST_BIT
    };
    val.to_bits() ^ flip_mask
}
/// Reverse the mapping given by [`f64_to_u64`](./fn.f64_to_u64.html).
#[inline(always)]
pub fn u64_to_f64(val: u64) -> f64 {
    // Highest bit set: the encoded float had its sign bit clear, so only
    // that bit needs to be cleared again.
    // Highest bit clear: the encoded float had its sign bit set and was
    // stored with every bit flipped.
    let flip_mask = if val & HIGHEST_BIT == 0 {
        !0u64
    } else {
        HIGHEST_BIT
    };
    f64::from_bits(val ^ flip_mask)
}
#[cfg(test)]
pub(crate) mod test {
pub use super::serialize::test::fixed_size_test;
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
use super::{compute_num_bits, i64_to_u64, u64_to_i64, f64_to_u64, u64_to_f64};
use std::f64;
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
}
/// Asserts that `val` survives a `f64 -> u64 -> f64` round trip unchanged.
///
/// Bit patterns are compared rather than float values: `0.0 == -0.0` holds
/// for floats, so a value comparison could not detect a lost sign on zero.
fn test_f64_converter_helper(val: f64) {
    let round_tripped = u64_to_f64(f64_to_u64(val));
    assert_eq!(round_tripped.to_bits(), val.to_bits());
}
#[test]
fn test_i64_converter() {
@@ -121,6 +160,28 @@ pub(crate) mod test {
}
}
// Round-trips both infinities, both zeros and +/-1 through the `u64` mapping.
#[test]
fn test_f64_converter() {
    let samples = [f64::INFINITY, f64::NEG_INFINITY, 0.0, -0.0, 1.0, -1.0];
    for &sample in samples.iter() {
        test_f64_converter_helper(sample);
    }
}
// Checks that `f64_to_u64` keeps the numerical order of floats.
#[test]
fn test_f64_order() {
    // NaN is not a number: its image must fall outside the image of [-inf, +inf).
    let non_nan_image = f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY);
    assert!(!non_nan_image.contains(&f64_to_u64(f64::NAN)));

    // Each entry is a (smaller, larger) pair of floats.
    let ascending_pairs = [
        (1.0, 1.5),   // same exponent, different mantissa
        (1.0, 2.0),   // same mantissa, different exponent
        (1.5, 2.0),   // different exponent and mantissa
        (-1.0, 1.0),  // pos > neg
        (-1.5, -1.0),
        (-2.0, 1.0),
        (-2.0, -1.5),
    ];
    for &(smaller, larger) in ascending_pairs.iter() {
        assert!(f64_to_u64(smaller) < f64_to_u64(larger));
    }
}
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);

View File

@@ -102,6 +102,19 @@ impl FixedSize for i64 {
const SIZE_IN_BYTES: usize = 8;
}
// `f64` values are written and read back with the `write_f64` / `read_f64`
// calls below, parameterized by the `Endianness` byte order.
impl BinarySerializable for f64 {
// Writes `self` to `writer`; any I/O error is returned to the caller.
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_f64::<Endianness>(*self)
}
// Reads one `f64` from `reader`; any I/O error is returned to the caller.
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
reader.read_f64::<Endianness>()
}
}
impl FixedSize for f64 {
// A `f64` is 64 bits wide, i.e. 8 bytes.
const SIZE_IN_BYTES: usize = 8;
}
impl BinarySerializable for u8 {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u8(*self)
@@ -172,6 +185,11 @@ pub mod test {
fixed_size_test::<i64>();
}
// Runs the shared `fixed_size_test` helper for `f64`.
// NOTE(review): presumably the helper checks the serialized length against
// `SIZE_IN_BYTES` — confirm against its definition.
#[test]
fn test_serialize_f64() {
fixed_size_test::<f64>();
}
#[test]
fn test_serialize_u64() {
fixed_size_test::<u64>();

View File

@@ -48,7 +48,7 @@ mod readers;
mod serializer;
mod writer;
/// Trait for types that are allowed for fast fields: (u64 or i64).
/// Trait for types that are allowed for fast fields: (u64, i64 and f64).
pub trait FastValue: Default + Clone + Copy + Send + Sync + PartialOrd {
/// Converts a value from u64
///
@@ -114,11 +114,33 @@ impl FastValue for i64 {
}
}
impl FastValue for f64 {
    /// Decodes a float from its `u64` representation, using
    /// `common::u64_to_f64`.
    fn from_u64(val: u64) -> Self {
        common::u64_to_f64(val)
    }

    /// Encodes the float as a `u64`, using `common::f64_to_u64`.
    fn to_u64(&self) -> u64 {
        common::f64_to_u64(*self)
    }

    /// Returns the fast field cardinality configured for a `f64` field,
    /// or `None` if `field_type` is not a `f64` field.
    fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
        if let FieldType::F64(ref float_options) = *field_type {
            float_options.get_fastfield_cardinality()
        } else {
            None
        }
    }

    /// Returns the raw bits of the float (`f64::to_bits`). Unlike `to_u64`,
    /// this does not go through `common::f64_to_u64`.
    fn as_u64(&self) -> u64 {
        self.to_bits()
    }
}
fn value_to_u64(value: &Value) -> u64 {
match *value {
Value::U64(ref val) => *val,
Value::I64(ref val) => common::i64_to_u64(*val),
_ => panic!("Expected a u64/i64 field, got {:?} ", value),
Value::F64(ref val) => common::f64_to_u64(*val),
_ => panic!("Expected a u64/i64/f64 field, got {:?} ", value),
}
}

View File

@@ -14,8 +14,10 @@ use std::collections::HashMap;
pub struct FastFieldReaders {
// Single-valued readers, keyed by field, one map per value type.
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
fast_field_f64: HashMap<Field, FastFieldReader<f64>>,
// Multi-valued readers, keyed by field, one map per value type.
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
fast_field_f64s: HashMap<Field, MultiValueIntFastFieldReader<f64>>,
// Readers for bytes fast fields, keyed by field.
fast_bytes: HashMap<Field, BytesFastFieldReader>,
// NOTE(review): presumably the composite file the readers above are opened from — confirm.
fast_fields_composite: CompositeFile,
}
@@ -23,6 +25,7 @@ pub struct FastFieldReaders {
// Value type of a numeric fast field; it selects which reader map of
// `FastFieldReaders` a field's reader is inserted into.
enum FastType {
I64,
U64,
F64,
}
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
@@ -33,6 +36,9 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality
FieldType::I64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::I64, cardinality)),
FieldType::F64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::F64, cardinality)),
FieldType::HierarchicalFacet => Some((FastType::U64, Cardinality::MultiValues)),
_ => None,
}
@@ -46,8 +52,10 @@ impl FastFieldReaders {
let mut fast_field_readers = FastFieldReaders {
fast_field_i64: Default::default(),
fast_field_u64: Default::default(),
fast_field_f64: Default::default(),
fast_field_i64s: Default::default(),
fast_field_u64s: Default::default(),
fast_field_f64s: Default::default(),
fast_bytes: Default::default(),
fast_fields_composite: fast_fields_composite.clone(),
};
@@ -82,6 +90,12 @@ impl FastFieldReaders {
FastFieldReader::open(fast_field_data.clone()),
);
}
FastType::F64 => {
fast_field_readers.fast_field_f64.insert(
field,
FastFieldReader::open(fast_field_data.clone()),
);
}
}
} else {
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
@@ -109,6 +123,14 @@ impl FastFieldReaders {
.fast_field_u64s
.insert(field, multivalued_int_fast_field);
}
FastType::F64 => {
let vals_reader = FastFieldReader::open(fast_field_data);
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers
.fast_field_f64s
.insert(field, multivalued_int_fast_field);
}
}
} else {
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
@@ -135,6 +157,8 @@ impl FastFieldReaders {
/// If the field is a i64-fast field, return the associated u64 reader. Values are
/// mapped from i64 to u64 using a (well the, it is unique) monotonic mapping. ///
///
///TODO should it also be lenient with f64?
///
/// This method is useful when merging segment reader.
pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
if let Some(u64_ff_reader) = self.u64(field) {
@@ -153,6 +177,13 @@ impl FastFieldReaders {
self.fast_field_i64.get(&field).cloned()
}
/// Returns the `f64` fast field reader associated to `field`.
///
/// If `field` is not a f64 fast field, this method returns `None`.
pub fn f64(&self, field: Field) -> Option<FastFieldReader<f64>> {
    let reader = self.fast_field_f64.get(&field)?;
    Some(reader.clone())
}
/// Returns a `u64s` multi-valued fast field reader reader associated to `field`.
///
/// If `field` is not a u64 multi-valued fast field, this method returns `None`.
@@ -182,6 +213,13 @@ impl FastFieldReaders {
self.fast_field_i64s.get(&field).cloned()
}
/// Returns the `f64s` multi-valued fast field reader associated to `field`.
///
/// If `field` is not a f64 multi-valued fast field, this method returns `None`.
pub fn f64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<f64>> {
    let reader = self.fast_field_f64s.get(&field)?;
    Some(reader.clone())
}
/// Returns the `bytes` fast field reader associated to `field`.
///
/// If `field` is not a bytes fast field, returns `None`.

View File

@@ -25,13 +25,13 @@ impl FastFieldsWriter {
for (field_id, field_entry) in schema.fields().iter().enumerate() {
let field = Field(field_id as u32);
let default_value = if let FieldType::I64(_) = *field_entry.field_type() {
common::i64_to_u64(0i64)
} else {
0u64
let default_value = match *field_entry.field_type() {
FieldType::I64(_) => common::i64_to_u64(0i64),
FieldType::F64(_) => common::f64_to_u64(0.0f64),
_ => 0u64,
};
match *field_entry.field_type() {
FieldType::I64(ref int_options) | FieldType::U64(ref int_options) => {
FieldType::I64(ref int_options) | FieldType::U64(ref int_options) | FieldType::F64(ref int_options) => {
match int_options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer = IntFastFieldWriter::new(field);
@@ -142,9 +142,9 @@ impl FastFieldsWriter {
/// bitpacked and the number of bits required for bitpacking
/// can only been known once we have seen all of the values.
///
/// Both u64, and i64 use the same writer.
/// i64 are just remapped to the `0..2^64 - 1`
/// using `common::i64_to_u64`.
/// u64, i64 and f64 all use the same writer.
/// i64 and f64 are just remapped to the `0..2^64 - 1`
/// using `common::i64_to_u64` and `common::f64_to_u64`.
pub struct IntFastFieldWriter {
field: Field,
vals: Vec<u8>,
@@ -203,8 +203,8 @@ impl IntFastFieldWriter {
/// Extract the value associated to the fast field for
/// this document.
///
/// i64 are remapped to u64 using the logic
/// in `common::i64_to_u64`.
/// i64 and f64 are remapped to u64 using the logic
/// in `common::i64_to_u64` and `common::f64_to_u64`.
///
/// If the value is missing, then the default value is used
/// instead.

View File

@@ -207,6 +207,7 @@ impl IndexMerger {
}
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::F64(ref options)
| FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
self.write_single_fast_field(field, fast_field_serializer)?;

View File

@@ -214,6 +214,17 @@ impl SegmentWriter {
}
}
}
FieldType::F64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_f64(
field_value.field(),
field_value.value().f64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
}
}
}
FieldType::Bytes => {
// Do nothing. Bytes only supports fast fields.
}

View File

@@ -179,7 +179,7 @@ pub use crate::indexer::IndexWriter;
pub use crate::postings::Postings;
pub use crate::schema::{Document, Term};
pub use crate::common::{i64_to_u64, u64_to_i64};
pub use crate::common::{i64_to_u64, u64_to_i64, f64_to_u64, u64_to_f64};
/// Expose the current version of tantivy, as well
/// whether it was compiled with the simd compression.
@@ -625,6 +625,30 @@ mod tests {
assert!(!postings.advance());
}
// Indexes a single document with an indexed `f64` field and checks that the
// term built from the same float matches exactly that document.
#[test]
fn test_indexed_f64() {
    // Schema with one indexed f64 field.
    let mut schema_builder = Schema::builder();
    let value_field = schema_builder.add_f64_field("value", INDEXED);
    let schema = schema_builder.build();

    // Index one document holding PI.
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    let pi = std::f64::consts::PI;
    index_writer.add_document(doc!(value_field => pi));
    index_writer.commit().unwrap();

    // Look up the postings of the term for the same value.
    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    let pi_term = Term::from_field_f64(value_field, pi);
    let segment_reader = searcher.segment_reader(0);
    let inverted_index = segment_reader.inverted_index(pi_term.field());
    let mut postings = inverted_index
        .read_postings(&pi_term, IndexRecordOption::Basic)
        .unwrap();

    // Exactly one posting: doc 0.
    assert!(postings.advance());
    assert_eq!(postings.doc(), 0);
    assert!(!postings.advance());
}
#[test]
fn test_indexedfield_not_in_documents() {
let mut schema_builder = Schema::builder();
@@ -817,6 +841,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
let fast_field_float = schema_builder.add_f64_field("float", FAST);
let text_field = schema_builder.add_text_field("text", TEXT);
let stored_int_field = schema_builder.add_u64_field("text", STORED);
let schema = schema_builder.build();
@@ -824,7 +849,7 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
{
let document = doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64);
let document = doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
index_writer.add_document(document);
index_writer.commit().unwrap();
}
@@ -844,10 +869,14 @@ mod tests {
assert!(fast_field_reader_opt.is_none());
}
{
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_float);
assert!(fast_field_reader_opt.is_none());
}
{
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_unsigned);
assert!(fast_field_reader_opt.is_some());
let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)
assert_eq!(fast_field_reader.get(0), 4u64)
}
{
@@ -856,5 +885,12 @@ mod tests {
let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)
}
{
let fast_field_reader_opt = segment_reader.fast_fields().f64(fast_field_float);
assert!(fast_field_reader_opt.is_some());
let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4f64)
}
}
}

View File

@@ -35,6 +35,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_)
| FieldType::I64(_)
| FieldType::F64(_)
| FieldType::Date(_)
| FieldType::HierarchicalFacet => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
FieldType::Bytes => {
@@ -154,7 +155,7 @@ impl MultiFieldPostingsWriter {
.collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => {}
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {}
FieldType::Bytes => {}
}

View File

@@ -20,7 +20,7 @@ parser! {
parser! {
fn word[I]()(I) -> String
where [I: Stream<Item = char>] {
many1(satisfy(char::is_alphanumeric))
many1(satisfy(|c: char| c.is_alphanumeric() || c=='.'))
.and_then(|s: String| {
match s.as_str() {
"OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
@@ -266,6 +266,7 @@ mod test {
test_parse_query_to_ast_helper("(+a)", "+(\"a\")");
test_parse_query_to_ast_helper("(+a +b)", "(+(\"a\") +(\"b\"))");
test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
test_parse_query_to_ast_helper("abc:1.1", "abc:\"1.1\"");
test_parse_query_to_ast_helper("+abc:toto", "+(abc:\"toto\")");
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+(abc:\"toto\") -(\"titi\"))");
test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
@@ -277,6 +278,7 @@ mod test {
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:[\"*\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}");
test_is_parse_err("abc + ");
}
}

View File

@@ -18,7 +18,7 @@ use crate::schema::{FieldType, Term};
use crate::tokenizer::TokenizerManager;
use combine::Parser;
use std::borrow::Cow;
use std::num::ParseIntError;
use std::num::{ParseIntError, ParseFloatError};
use std::ops::Bound;
use std::str::FromStr;
@@ -30,9 +30,12 @@ pub enum QueryParserError {
/// `FieldDoesNotExist(field_name: String)`
/// The query references a field that is not in the schema
FieldDoesNotExist(String),
/// The query contains a term for a `u64`-field, but the value
/// is not a u64.
/// The query contains a term for a `u64` or `i64`-field, but the value
/// is neither.
ExpectedInt(ParseIntError),
/// The query contains a term for a `f64`-field, but the value
/// is not a f64.
ExpectedFloat(ParseFloatError),
/// It is forbidden queries that are only "excluding". (e.g. -title:pop)
AllButQueryForbidden,
/// If no default field is declared, running a query without any
@@ -60,6 +63,12 @@ impl From<ParseIntError> for QueryParserError {
}
}
/// Wraps a float parsing failure into `QueryParserError::ExpectedFloat`,
/// which lets `?` be used on `f64::from_str` results.
impl From<ParseFloatError> for QueryParserError {
    fn from(parse_error: ParseFloatError) -> Self {
        QueryParserError::ExpectedFloat(parse_error)
    }
}
impl From<chrono::ParseError> for QueryParserError {
fn from(err: chrono::ParseError) -> QueryParserError {
QueryParserError::DateFormatError(err)
@@ -239,6 +248,11 @@ impl QueryParser {
let term = Term::from_field_i64(field, val);
Ok(vec![(0, term)])
}
FieldType::F64(_) => {
let val: f64 = f64::from_str(phrase)?;
let term = Term::from_field_f64(field, val);
Ok(vec![(0, term)])
}
FieldType::Date(_) => match chrono::DateTime::parse_from_rfc3339(phrase) {
Ok(x) => Ok(vec![(
0,
@@ -529,6 +543,7 @@ mod test {
schema_builder.add_text_field("nottokenized", STRING);
schema_builder.add_text_field("with_stop_words", text_options);
schema_builder.add_date_field("date", INDEXED);
schema_builder.add_f64_field("float", INDEXED);
let schema = schema_builder.build();
let default_fields = vec![title, text];
let tokenizer_manager = TokenizerManager::default();
@@ -634,6 +649,13 @@ mod test {
assert!(query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok());
assert!(query_parser.parse_query("float:\"3.1\"").is_ok());
assert!(query_parser.parse_query("float:\"-2.4\"").is_ok());
assert!(query_parser.parse_query("float:\"2.1.2\"").is_err());
assert!(query_parser.parse_query("float:\"2.1a\"").is_err());
assert!(query_parser
.parse_query("float:\"18446744073709551615.0\"")
.is_ok());
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
@@ -645,6 +667,12 @@ mod test {
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
false,
);
test_parse_query_to_logical_ast_helper(
"float:2.5",
&format!("{:?}", Term::from_field_f64(Field(10u32), 2.5)),
false,
);
}
#[test]
@@ -786,6 +814,11 @@ mod test {
query_parser.parse_query("signed:18b"),
Err(QueryParserError::ExpectedInt(_))
);
assert!(query_parser.parse_query("float:\"1.8\"").is_ok());
assert_matches!(
query_parser.parse_query("float:1.8a"),
Err(QueryParserError::ExpectedFloat(_))
);
}
#[test]

View File

@@ -142,6 +142,39 @@ impl RangeQuery {
}
}
/// Creates a new `RangeQuery` over a `f64` field.
///
/// If the field is not of the type `f64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_f64(field: Field, range: Range<f64>) -> RangeQuery {
RangeQuery::new_f64_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `f64` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `f64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_f64_bounds(
    field: Field,
    left_bound: Bound<f64>,
    right_bound: Bound<f64>,
) -> RangeQuery {
    // Each bound is stored as the value bytes of the matching `f64` term.
    let to_term_bytes = |val: &f64| Term::from_field_f64(field, *val).value_bytes().to_owned();
    let left_bound = map_bound(&left_bound, &to_term_bytes);
    let right_bound = map_bound(&right_bound, &to_term_bytes);
    RangeQuery {
        field,
        value_type: Type::F64,
        left_bound,
        right_bound,
    }
}
/// Create a new `RangeQuery` over a `u64` field.
///
/// The two `Bound` arguments make it possible to create more complex
@@ -397,4 +430,61 @@ mod tests {
);
}
// Range queries over a multi-valued indexed `f64` field.
//
// Documents are created for i in 1..100; document `i` holds every j in
// 1..100 that divides i, stored as a float. A document therefore matches
// a range iff one of its divisors falls inside it.
#[test]
fn test_range_float() {
let float_field: Field;
let schema = {
let mut schema_builder = Schema::builder();
float_field = schema_builder.add_f64_field("floatfield", INDEXED);
schema_builder.build()
};
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
for i in 1..100 {
let mut doc = Document::new();
for j in 1..100 {
if i % j == 0 {
doc.add_f64(float_field, j as f64);
}
}
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
// Number of documents matched by a range query.
let count_multiples =
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
// [10, 11): the multiples of 10 below 100, i.e. 10, 20, ..., 90.
assert_eq!(count_multiples(RangeQuery::new_f64(float_field, 10.0..11.0)), 9);
// [10, 11]: the 9 multiples of 10 plus the 9 multiples of 11 (no overlap below 100).
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
float_field,
Bound::Included(10.0),
Bound::Included(11.0)
)),
18
);
// (9, 10]: only the value 10 is in range, so again the 9 multiples of 10.
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
float_field,
Bound::Excluded(9.0),
Bound::Included(10.0)
)),
9
);
// [9, +inf): every document i >= 9 holds i itself, i.e. documents 9..=99.
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
float_field,
Bound::Included(9.0),
Bound::Unbounded
)),
91
);
}
}

View File

@@ -88,6 +88,11 @@ impl Document {
self.add(FieldValue::new(field, Value::I64(value)));
}
/// Adds a `f64` value for `field` to the document.
pub fn add_f64(&mut self, field: Field, value: f64) {
    let field_value = FieldValue::new(field, Value::F64(value));
    self.add(field_value);
}
/// Add a date field
pub fn add_date(&mut self, field: Field, value: &DateTime) {
self.add(FieldValue::new(field, Value::Date(*value)));

View File

@@ -48,6 +48,15 @@ impl FieldEntry {
}
}
/// Creates a new f64 field entry in the schema, given
/// a name, and some options.
pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry {
    // The options passed in are wrapped into the `F64` field type variant.
    let field_type = FieldType::F64(field_type);
    FieldEntry {
        name: field_name,
        field_type,
    }
}
/// Creates a new date field entry in the schema, given
/// a name, and some options.
pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
@@ -89,6 +98,7 @@ impl FieldEntry {
FieldType::Str(ref options) => options.get_indexing_options().is_some(),
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::F64(ref options)
| FieldType::Date(ref options) => options.is_indexed(),
FieldType::HierarchicalFacet => true,
FieldType::Bytes => false,
@@ -98,7 +108,7 @@ impl FieldEntry {
/// Returns true iff the field is a int (signed or unsigned) fast field
pub fn is_int_fast(&self) -> bool {
match self.field_type {
FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_fast(),
FieldType::U64(ref options) | FieldType::I64(ref options) | FieldType::F64(ref options) => options.is_fast(),
_ => false,
}
}
@@ -108,6 +118,7 @@ impl FieldEntry {
match self.field_type {
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::F64(ref options)
| FieldType::Date(ref options) => options.is_stored(),
FieldType::Str(ref options) => options.is_stored(),
// TODO make stored hierarchical facet optional
@@ -138,6 +149,10 @@ impl Serialize for FieldEntry {
s.serialize_field("type", "i64")?;
s.serialize_field("options", options)?;
}
FieldType::F64(ref options) => {
s.serialize_field("type", "f64")?;
s.serialize_field("options", options)?;
}
FieldType::Date(ref options) => {
s.serialize_field("type", "date")?;
s.serialize_field("options", options)?;
@@ -205,7 +220,7 @@ impl<'de> Deserialize<'de> for FieldEntry {
"bytes" => {
field_type = Some(FieldType::Bytes);
}
"text" | "u64" | "i64" | "date" => {
"text" | "u64" | "i64" | "f64" | "date" => {
// These types require additional options to create a field_type
}
_ => panic!("unhandled type"),
@@ -222,6 +237,7 @@ impl<'de> Deserialize<'de> for FieldEntry {
"text" => field_type = Some(FieldType::Str(map.next_value()?)),
"u64" => field_type = Some(FieldType::U64(map.next_value()?)),
"i64" => field_type = Some(FieldType::I64(map.next_value()?)),
"f64" => field_type = Some(FieldType::F64(map.next_value()?)),
"date" => field_type = Some(FieldType::Date(map.next_value()?)),
_ => {
let msg = format!("Unrecognised type {}", ty);

View File

@@ -35,6 +35,8 @@ pub enum Type {
U64,
/// `i64`
I64,
/// `f64`
F64,
/// `date(i64) timestamp`
Date,
/// `tantivy::schema::Facet`. Passed as a string in JSON.
@@ -53,6 +55,8 @@ pub enum FieldType {
U64(IntOptions),
/// Signed 64-bits integers 64 field type configuration
I64(IntOptions),
/// 64-bit float field type configuration
F64(IntOptions),
/// Signed 64-bits Date 64 field type configuration,
Date(IntOptions),
/// Hierachical Facet
@@ -68,6 +72,7 @@ impl FieldType {
FieldType::Str(_) => Type::Str,
FieldType::U64(_) => Type::U64,
FieldType::I64(_) => Type::I64,
FieldType::F64(_) => Type::F64,
FieldType::Date(_) => Type::Date,
FieldType::HierarchicalFacet => Type::HierarchicalFacet,
FieldType::Bytes => Type::Bytes,
@@ -78,7 +83,7 @@ impl FieldType {
pub fn is_indexed(&self) -> bool {
match *self {
FieldType::Str(ref text_options) => text_options.get_indexing_options().is_some(),
FieldType::U64(ref int_options) | FieldType::I64(ref int_options) => {
FieldType::U64(ref int_options) | FieldType::I64(ref int_options) | FieldType::F64(ref int_options) => {
int_options.is_indexed()
}
FieldType::Date(ref date_options) => date_options.is_indexed(),
@@ -98,6 +103,7 @@ impl FieldType {
.map(TextFieldIndexing::index_option),
FieldType::U64(ref int_options)
| FieldType::I64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Date(ref int_options) => {
if int_options.is_indexed() {
Some(IndexRecordOption::Basic)
@@ -119,7 +125,7 @@ impl FieldType {
match *json {
JsonValue::String(ref field_text) => match *self {
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => Err(
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => Err(
ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)),
),
FieldType::HierarchicalFacet => Ok(Value::Facet(Facet::from(field_text))),
@@ -146,6 +152,14 @@ impl FieldType {
let msg = format!("Expected a u64 int, got {:?}", json);
Err(ValueParsingError::OverflowError(msg))
}
},
FieldType::F64(_) => {
if let Some(field_val_f64) = field_val_num.as_f64() {
Ok(Value::F64(field_val_f64))
} else {
let msg = format!("Expected a f64 int, got {:?}", json);
Err(ValueParsingError::OverflowError(msg))
}
}
FieldType::Str(_) | FieldType::HierarchicalFacet | FieldType::Bytes => {
let msg = format!("Expected a string, got {:?}", json);

View File

@@ -22,7 +22,7 @@ pub const STORED: SchemaFlagList<StoredFlag, ()> = SchemaFlagList {
pub struct IndexedFlag;
/// Flag to mark the field as indexed.
///
/// The `INDEXED` flag can only be used when building `IntOptions` (`u64` and `i64` fields)
/// The `INDEXED` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields)
/// Of course, text fields can also be indexed... But this is expressed by using either the
/// `STRING` (untokenized) or `TEXT` (tokenized with the english tokenizer) flags.
pub const INDEXED: SchemaFlagList<IndexedFlag, ()> = SchemaFlagList {
@@ -36,7 +36,7 @@ pub struct FastFlag;
///
/// Fast fields can be random-accessed rapidly. Fields useful for scoring, filtering
/// or collection should be mark as fast fields.
/// The `FAST` flag can only be used when building `IntOptions` (`u64` and `i64` fields)
/// The `FAST` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields)
pub const FAST: SchemaFlagList<FastFlag, ()> = SchemaFlagList {
head: FastFlag,
tail: (),

View File

@@ -54,7 +54,7 @@ On the other hand setting the field as stored or not determines whether the fiel
when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called.
## Setting a u64 or a i64 field
## Setting a u64, a i64 or a f64 field
### Example

View File

@@ -82,6 +82,26 @@ impl SchemaBuilder {
self.add_field(field_entry)
}
/// Adds a new f64 field.
/// Returns the associated field handle
///
/// # Caution
///
/// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_f64_field<T: Into<IntOptions>>(
    &mut self,
    field_name_str: &str,
    field_options: T,
) -> Field {
    let owned_name = field_name_str.to_string();
    let options: IntOptions = field_options.into();
    self.add_field(FieldEntry::new_f64(owned_name, options))
}
/// Adds a new date field.
/// Returns the associated field handle
/// Internally, Tantivy simply stores dates as i64 UTC timestamps,
@@ -376,10 +396,14 @@ mod tests {
let popularity_options = IntOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let score_options = IntOptions::default()
.set_indexed()
.set_fast(Cardinality::SingleValue);
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("author", STRING);
schema_builder.add_u64_field("count", count_options);
schema_builder.add_i64_field("popularity", popularity_options);
schema_builder.add_f64_field("score", score_options);
let schema = schema_builder.build();
let schema_json = serde_json::to_string_pretty(&schema).unwrap();
let expected = r#"[
@@ -422,6 +446,15 @@ mod tests {
"fast": "single",
"stored": true
}
},
{
"name": "score",
"type": "f64",
"options": {
"indexed": true,
"fast": "single",
"stored": false
}
}
]"#;
assert_eq!(schema_json, expected);
@@ -434,6 +467,8 @@ mod tests {
assert_eq!("author", fields.next().unwrap().name());
assert_eq!("count", fields.next().unwrap().name());
assert_eq!("popularity", fields.next().unwrap().name());
assert_eq!("score", fields.next().unwrap().name());
assert!(fields.next().is_none());
}
#[test]
@@ -466,10 +501,14 @@ mod tests {
let popularity_options = IntOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let score_options = IntOptions::default()
.set_indexed()
.set_fast(Cardinality::SingleValue);
let title_field = schema_builder.add_text_field("title", TEXT);
let author_field = schema_builder.add_text_field("author", STRING);
let count_field = schema_builder.add_u64_field("count", count_options);
let popularity_field = schema_builder.add_i64_field("popularity", popularity_options);
let score_field = schema_builder.add_f64_field("score", score_options);
let schema = schema_builder.build();
{
let doc = schema.parse_document("{}").unwrap();
@@ -482,7 +521,8 @@ mod tests {
"title": "my title",
"author": "fulmicoton",
"count": 4,
"popularity": 10
"popularity": 10,
"score": 80.5
}"#,
)
.unwrap();
@@ -493,6 +533,7 @@ mod tests {
);
assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
assert_eq!(doc.get_first(score_field).unwrap().f64_value(), 80.5);
}
{
let json_err = schema.parse_document(
@@ -501,6 +542,7 @@ mod tests {
"author": "fulmicoton",
"count": 4,
"popularity": 10,
"score": 80.5,
"jambon": "bayonne"
}"#,
);
@@ -513,6 +555,7 @@ mod tests {
"author": "fulmicoton",
"count": "5",
"popularity": "10",
"score": "80.5",
"jambon": "bayonne"
}"#,
);
@@ -527,7 +570,8 @@ mod tests {
"title": "my title",
"author": "fulmicoton",
"count": -5,
"popularity": 10
"popularity": 10,
"score": 80.5
}"#,
);
assert_matches!(
@@ -541,7 +585,8 @@ mod tests {
"title": "my title",
"author": "fulmicoton",
"count": 9223372036854775808,
"popularity": 10
"popularity": 10,
"score": 80.5
}"#,
);
assert!(!matches!(
@@ -555,7 +600,8 @@ mod tests {
"title": "my title",
"author": "fulmicoton",
"count": 50,
"popularity": 9223372036854775808
"popularity": 9223372036854775808,
"score": 80.5
}"#,
);
assert_matches!(

View File

@@ -19,9 +19,9 @@ where
B: AsRef<[u8]>;
impl Term {
/// Builds a term given a field, and a u64-value
/// Builds a term given a field, and a i64-value
///
/// Assuming the term has a field id of 1, and a u64 value of 3234,
/// Assuming the term has a field id of 1, and a i64 value of 3234,
/// the Term will have 8 bytes.
///
/// The first four byte are dedicated to storing the field id as a u64.
@@ -31,6 +31,18 @@ impl Term {
Term::from_field_u64(field, val_u64)
}
/// Builds a term given a field, and a f64-value
///
/// The float is converted to a `u64` with `common::f64_to_u64`
/// and the term is then built by `Term::from_field_u64`.
///
/// NOTE(review): the previous text described an 8-byte term made of a
/// 4-byte field id and a 4-byte value. The accessors of this type read a
/// big-endian `u64` starting at byte offset 4, so the layout is presumably
/// 4 bytes of field id followed by 8 bytes of value — confirm against
/// `from_field_u64`.
pub fn from_field_f64(field: Field, val: f64) -> Term {
    Term::from_field_u64(field, common::f64_to_u64(val))
}
/// Builds a term given a field, and a DateTime value
///
/// Assuming the term has a field id of 1, and a timestamp i64 value of 3234,
@@ -112,6 +124,11 @@ impl Term {
self.set_u64(common::i64_to_u64(val));
}
/// Sets a `f64` value in the term.
///
/// The float is mapped to a `u64` through `common::f64_to_u64`
/// and then stored with `set_u64`.
pub fn set_f64(&mut self, val: f64) {
    let encoded = common::f64_to_u64(val);
    self.set_u64(encoded);
}
fn set_bytes(&mut self, bytes: &[u8]) {
self.0.resize(4, 0u8);
self.0.extend(bytes);
@@ -161,6 +178,15 @@ where
common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..]))
}
/// Returns the `f64` value stored in a term.
///
/// The bytes from offset 4 onward are read as a big-endian `u64`,
/// which is then converted back to a float with `common::u64_to_f64`.
///
/// # Panics
/// ... or returns an invalid value
/// if the term is not a `f64` field.
///
/// In particular, this panics if the term holds fewer than 4 bytes
/// (slicing) or fewer than 8 bytes after offset 4 (`read_u64`).
pub fn get_f64(&self) -> f64 {
    let encoded = BigEndian::read_u64(&self.0.as_ref()[4..]);
    common::u64_to_f64(encoded)
}
/// Returns the text associated with the term.
///
/// # Panics

View File

@@ -2,11 +2,11 @@ use crate::schema::Facet;
use crate::DateTime;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::fmt;
use std::{fmt, cmp::Ordering};
/// Value represents the value of any field.
/// It is an enum over all of the possible field types.
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub enum Value {
/// The str type is used for any text information.
Str(String),
@@ -14,6 +14,8 @@ pub enum Value {
U64(u64),
/// Signed 64-bits Integer `i64`
I64(i64),
/// 64-bits Float `f64`
F64(f64),
/// Signed 64-bits Date time stamp `date`
Date(DateTime),
/// Hierarchical Facet
@@ -22,6 +24,40 @@ pub enum Value {
Bytes(Vec<u8>),
}
// Marker impl: `Eq` cannot be derived for an enum holding an `f64`,
// so it is asserted by hand here.
// NOTE(review): `PartialEq` is still derived, so `Value::F64(NaN) != Value::F64(NaN)`,
// whereas `Ord::cmp` reports two NaNs as `Ordering::Equal` — confirm this mismatch is acceptable.
impl Eq for Value {}
impl Ord for Value {
    /// Total order over values.
    ///
    /// Two values of the same variant are compared through their payload.
    /// Values of different variants are ordered by variant:
    /// `Str < U64 < I64 < F64 < Date < Facet < Bytes`.
    ///
    /// For `F64`, `NaN` compares equal to `NaN` and is smaller than
    /// any other float (we define NaN as less than -∞).
    fn cmp(&self, other: &Self) -> Ordering {
        // Position of each variant in the cross-variant ordering.
        fn variant_rank(value: &Value) -> u8 {
            match value {
                Value::Str(_) => 0,
                Value::U64(_) => 1,
                Value::I64(_) => 2,
                Value::F64(_) => 3,
                Value::Date(_) => 4,
                Value::Facet(_) => 5,
                Value::Bytes(_) => 6,
            }
        }

        match (self, other) {
            (Value::Str(lhs), Value::Str(rhs)) => lhs.cmp(rhs),
            (Value::U64(lhs), Value::U64(rhs)) => lhs.cmp(rhs),
            (Value::I64(lhs), Value::I64(rhs)) => lhs.cmp(rhs),
            (Value::Date(lhs), Value::Date(rhs)) => lhs.cmp(rhs),
            (Value::Facet(lhs), Value::Facet(rhs)) => lhs.cmp(rhs),
            (Value::Bytes(lhs), Value::Bytes(rhs)) => lhs.cmp(rhs),
            (Value::F64(lhs), Value::F64(rhs)) => lhs
                .partial_cmp(rhs)
                // `partial_cmp` only fails when a NaN is involved.
                // Comparing the NaN flags in reverse puts NaN first:
                // (NaN, NaN) => Equal, (NaN, x) => Less, (x, NaN) => Greater.
                .unwrap_or_else(|| rhs.is_nan().cmp(&lhs.is_nan())),
            // Different variants: the variant rank alone decides.
            _ => variant_rank(self).cmp(&variant_rank(other)),
        }
    }
}
impl Serialize for Value {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
@@ -31,6 +67,7 @@ impl Serialize for Value {
Value::Str(ref v) => serializer.serialize_str(v),
Value::U64(u) => serializer.serialize_u64(u),
Value::I64(u) => serializer.serialize_i64(u),
Value::F64(u) => serializer.serialize_f64(u),
Value::Date(ref date) => serializer.serialize_i64(date.timestamp()),
Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
@@ -60,6 +97,10 @@ impl<'de> Deserialize<'de> for Value {
Ok(Value::I64(v))
}
/// Wraps a visited `f64` into `Value::F64`; never fails.
fn visit_f64<E>(self, v: f64) -> Result<Self::Value, E> {
    let value = Value::F64(v);
    Ok(value)
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
Ok(Value::Str(v.to_owned()))
}
@@ -75,9 +116,7 @@ impl<'de> Deserialize<'de> for Value {
impl Value {
/// Returns the text value, provided the value is of the `Str` type.
///
/// # Panics
/// If the value is not of type `Str`
/// (Returns None if the value is not of the `Str` type).
pub fn text(&self) -> Option<&str> {
match *self {
Value::Str(ref text) => Some(text),
@@ -92,7 +131,7 @@ impl Value {
pub fn u64_value(&self) -> u64 {
    match *self {
        Value::U64(ref value) => *value,
        // Any other variant is a caller error; the message names the
        // expected type (it used to wrongly mention a text field).
        _ => panic!("This is not a u64 field."),
    }
}
@@ -103,10 +142,21 @@ impl Value {
pub fn i64_value(&self) -> i64 {
    match *self {
        Value::I64(ref value) => *value,
        // Any other variant is a caller error; the message names the
        // expected type (it used to wrongly mention a text field).
        _ => panic!("This is not a i64 field."),
    }
}
/// Returns the inner float when the value is of the `F64` type.
///
/// # Panics
/// If the value is of any type other than `F64`
pub fn f64_value(&self) -> f64 {
    if let Value::F64(float) = *self {
        float
    } else {
        panic!("This is not a f64 field.")
    }
}
/// Returns the Date-value, provided the value is of the `Date` type.
///
/// # Panics
@@ -137,6 +187,12 @@ impl From<i64> for Value {
}
}
impl From<f64> for Value {
    /// Wraps a float into the `F64` variant.
    fn from(float: f64) -> Self {
        Value::F64(float)
    }
}
impl From<DateTime> for Value {
fn from(date_time: DateTime) -> Value {
Value::Date(date_time)
@@ -163,7 +219,7 @@ impl From<Vec<u8>> for Value {
mod binary_serialize {
use super::Value;
use crate::common::BinarySerializable;
use crate::common::{BinarySerializable, f64_to_u64, u64_to_f64};
use crate::schema::Facet;
use chrono::{TimeZone, Utc};
use std::io::{self, Read, Write};
@@ -174,6 +230,7 @@ mod binary_serialize {
const HIERARCHICAL_FACET_CODE: u8 = 3;
const BYTES_CODE: u8 = 4;
const DATE_CODE: u8 = 5;
const F64_CODE: u8 = 6;
impl BinarySerializable for Value {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
@@ -190,6 +247,10 @@ mod binary_serialize {
I64_CODE.serialize(writer)?;
val.serialize(writer)
}
Value::F64(ref val) => {
F64_CODE.serialize(writer)?;
f64_to_u64(*val).serialize(writer)
}
Value::Date(ref val) => {
DATE_CODE.serialize(writer)?;
val.timestamp().serialize(writer)
@@ -219,6 +280,10 @@ mod binary_serialize {
let value = i64::deserialize(reader)?;
Ok(Value::I64(value))
}
F64_CODE => {
let value = u64_to_f64(u64::deserialize(reader)?);
Ok(Value::F64(value))
}
DATE_CODE => {
let timestamp = i64::deserialize(reader)?;
Ok(Value::Date(Utc.timestamp(timestamp, 0)))

View File

@@ -14,6 +14,9 @@ lexicographical order matches the natural order of integers.
`i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()`
and then treated as a `u64`.
`f64`-terms are transformed to `u64` using a mapping that preserves order, and are then treated
as `u64`.
A second datastructure makes it possible to access a [`TermInfo`](../postings/struct.TermInfo.html).
*/