From 74f9eafefccc1524cc8e413b0f90dd9f23a53a16 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 20 Apr 2023 21:31:43 +0800 Subject: [PATCH] refactor Term (#2006) * refactor Term add ValueBytes for serialized term values add missing debug for ip skip unnecessary json path validation remove code duplication add DATE_TIME_PRECISION_INDEXED constant add missing Term clarification remove weird value_bytes_mut() API * fix naming --- bitpacker/src/filter_vec/avx2.rs | 2 +- src/core/inverted_index_reader.rs | 4 +- src/core/json_utils.rs | 47 ++- src/fastfield/mod.rs | 2 +- src/indexer/segment_writer.rs | 67 +++- src/postings/json_postings_writer.rs | 5 +- src/postings/postings_writer.rs | 6 +- src/query/fuzzy_query.rs | 3 +- .../phrase_prefix_query.rs | 17 +- .../phrase_prefix_weight.rs | 7 +- src/query/query_parser/query_parser.rs | 112 +++--- src/query/range_query/range_query.rs | 12 +- src/query/set_query.rs | 8 +- src/query/term_query/mod.rs | 2 +- src/schema/date_time_options.rs | 6 +- src/schema/mod.rs | 4 +- src/schema/term.rs | 360 +++++++++++------- src/snippet/mod.rs | 3 +- 18 files changed, 394 insertions(+), 273 deletions(-) diff --git a/bitpacker/src/filter_vec/avx2.rs b/bitpacker/src/filter_vec/avx2.rs index fa1400d2b..1a3edb21d 100644 --- a/bitpacker/src/filter_vec/avx2.rs +++ b/bitpacker/src/filter_vec/avx2.rs @@ -1,5 +1,5 @@ //! SIMD filtering of a vector as described in the following blog post. -//! https://quickwit.io/blog/filtering%20a%20vector%20with%20simd%20instructions%20avx-2%20and%20avx-512 +//! use std::arch::x86_64::{ __m256i as DataType, _mm256_add_epi32 as op_add, _mm256_cmpgt_epi32 as op_greater, _mm256_lddqu_si256 as load_unaligned, _mm256_or_si256 as op_or, _mm256_set1_epi32 as set1, diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index d9fe31266..87773c1f7 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -61,7 +61,7 @@ impl InvertedIndexReader { /// Returns the term info associated with the term. pub fn get_term_info(&self, term: &Term) -> io::Result> { - self.termdict.get(term.value_bytes()) + self.termdict.get(term.serialized_value_bytes()) } /// Return the term dictionary datastructure. @@ -203,7 +203,7 @@ impl InvertedIndexReader { #[cfg(feature = "quickwit")] impl InvertedIndexReader { pub(crate) async fn get_term_info_async(&self, term: &Term) -> io::Result> { - self.termdict.get_async(term.value_bytes()).await + self.termdict.get_async(term.serialized_value_bytes()).await } /// Returns a block postings given a `Term`. diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index 7f3cb2c8f..9432bbc46 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -5,12 +5,12 @@ use rustc_hash::FxHashMap; use crate::fastfield::FastValue; use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; -use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR}; -use crate::schema::{Field, Type}; +use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR}; +use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED}; use crate::time::format_description::well_known::Rfc3339; use crate::time::{OffsetDateTime, UtcOffset}; use crate::tokenizer::TextAnalyzer; -use crate::{DatePrecision, DateTime, DocId, Term}; +use crate::{DateTime, DocId, Term}; /// This object is a map storing the last position for a given path for the current document /// being indexed. 
@@ -59,7 +59,7 @@ struct IndexingPositionsPerPath { impl IndexingPositionsPerPath { fn get_position(&mut self, term: &Term) -> &mut IndexingPosition { self.positions_per_path - .entry(murmurhash2(term.as_slice())) + .entry(murmurhash2(term.serialized_term())) .or_insert_with(Default::default) } } @@ -257,6 +257,9 @@ pub(crate) fn set_string_and_get_terms( positions_and_terms } +/// Writes a value of a JSON field to a `Term`. +/// The Term format is as follows: +/// [JSON_TYPE][JSON_PATH][JSON_END_OF_PATH][VALUE_BYTES] pub struct JsonTermWriter<'a> { term_buffer: &'a mut Term, path_stack: Vec, @@ -355,27 +358,23 @@ impl<'a> JsonTermWriter<'a> { pub fn close_path_and_set_type(&mut self, typ: Type) { self.trim_to_end_of_path(); - let buffer = self.term_buffer.value_bytes_mut(); - let buffer_len = buffer.len(); - buffer[buffer_len - 1] = JSON_END_OF_PATH; + self.term_buffer.set_json_path_end(); self.term_buffer.append_bytes(&[typ.to_code()]); } pub fn push_path_segment(&mut self, segment: &str) { // the path stack should never be empty. self.trim_to_end_of_path(); - let buffer = self.term_buffer.value_bytes_mut(); - let buffer_len = buffer.len(); if self.path_stack.len() > 1 { - buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP; + self.term_buffer.set_json_path_separator(); } let appended_segment = self.term_buffer.append_bytes(segment.as_bytes()); if self.expand_dots_enabled { // We need to replace `.` by JSON_PATH_SEGMENT_SEP. replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment); } - self.term_buffer.push_byte(JSON_PATH_SEGMENT_SEP); + self.term_buffer.add_json_path_separator(); self.path_stack.push(self.term_buffer.len_bytes()); } @@ -389,14 +388,14 @@ impl<'a> JsonTermWriter<'a> { #[cfg(test)] pub(crate) fn path(&self) -> &[u8] { let end_of_path = self.path_stack.last().cloned().unwrap_or(1); - &self.term().value_bytes()[..end_of_path - 1] + &self.term().serialized_value_bytes()[..end_of_path - 1] } pub(crate) fn set_fast_value(&mut self, val: T) { self.close_path_and_set_type(T::to_type()); let value = if T::to_type() == Type::Date { DateTime::from_u64(val.to_u64()) - .truncate(DatePrecision::Seconds) + .truncate(DATE_TIME_PRECISION_INDEXED) .to_u64() } else { val.to_u64() @@ -431,12 +430,12 @@ mod tests { json_writer.set_str("red"); assert_eq!( format!("{:?}", json_writer.term()), - "Term(type=Json, field=1, path=attributes.color, vtype=Str, \"red\")" + "Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")" ); json_writer.set_str("blue"); assert_eq!( format!("{:?}", json_writer.term()), - "Term(type=Json, field=1, path=attributes.color, vtype=Str, \"blue\")" + "Term(field=1, type=Json, path=attributes.color, type=Str, \"blue\")" ); json_writer.pop_path_segment(); json_writer.push_path_segment("dimensions"); @@ -444,14 +443,14 @@ mod tests { json_writer.set_fast_value(400i64); assert_eq!( format!("{:?}", json_writer.term()), - "Term(type=Json, field=1, path=attributes.dimensions.width, vtype=I64, 400)" + "Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)" ); json_writer.pop_path_segment(); json_writer.push_path_segment("height"); json_writer.set_fast_value(300i64); assert_eq!( format!("{:?}", json_writer.term()), - "Term(type=Json, field=1, path=attributes.dimensions.height, vtype=I64, 300)" + "Term(field=1, type=Json, path=attributes.dimensions.height, type=I64, 300)" ); } @@ -463,7 +462,7 @@ mod tests { json_writer.push_path_segment("color"); json_writer.set_str("red"); assert_eq!( - json_writer.term().as_slice(), + 
json_writer.term().serialized_term(), b"\x00\x00\x00\x01jcolor\x00sred" ) } @@ -476,7 +475,7 @@ mod tests { json_writer.push_path_segment("color"); json_writer.set_fast_value(-4i64); assert_eq!( - json_writer.term().as_slice(), + json_writer.term().serialized_term(), b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc" ) } @@ -489,7 +488,7 @@ mod tests { json_writer.push_path_segment("color"); json_writer.set_fast_value(4u64); assert_eq!( - json_writer.term().as_slice(), + json_writer.term().serialized_term(), b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04" ) } @@ -502,7 +501,7 @@ mod tests { json_writer.push_path_segment("color"); json_writer.set_fast_value(4.0f64); assert_eq!( - json_writer.term().as_slice(), + json_writer.term().serialized_term(), b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00" ) } @@ -515,7 +514,7 @@ mod tests { json_writer.push_path_segment("color"); json_writer.set_fast_value(true); assert_eq!( - json_writer.term().as_slice(), + json_writer.term().serialized_term(), b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01" ) } @@ -530,7 +529,7 @@ mod tests { json_writer.push_path_segment("color"); json_writer.set_str("red"); assert_eq!( - json_writer.term().as_slice(), + json_writer.term().serialized_term(), b"\x00\x00\x00\x01jattribute\x01color\x00sred" ) } @@ -545,7 +544,7 @@ mod tests { json_writer.pop_path_segment(); json_writer.set_str("red"); assert_eq!( - json_writer.term().as_slice(), + json_writer.term().serialized_term(), b"\x00\x00\x00\x01jcolor\x00sred" ) } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 93d16d1bb..32fff6ec3 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -14,7 +14,7 @@ //! Fields have to be declared as `FAST` in the schema. //! Currently supported fields are: u64, i64, f64, bytes, ip and text. //! -//! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected +//! Fast fields are stored in with [different codecs](columnar). The best codec is detected //! automatically, when serializing. //! //! Read access performance is comparable to that of an array lookup. diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 82e786b48..1c8cd0ce6 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -12,10 +12,10 @@ use crate::postings::{ compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition, PerFieldPostingsWriter, PostingsWriter, }; -use crate::schema::{FieldEntry, FieldType, Schema, Term, Value}; +use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED}; use crate::store::{StoreReader, StoreWriter}; use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer}; -use crate::{DatePrecision, DocId, Document, Opstamp, SegmentComponent}; +use crate::{DocId, Document, Opstamp, SegmentComponent}; /// Computes the initial size of the hash table. 
/// @@ -246,7 +246,8 @@ impl SegmentWriter { for value in values { num_vals += 1; let date_val = value.as_date().ok_or_else(make_schema_error)?; - term_buffer.set_u64(date_val.truncate(DatePrecision::Seconds).to_u64()); + term_buffer + .set_u64(date_val.truncate(DATE_TIME_PRECISION_INDEXED).to_u64()); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); } if field_entry.has_fieldnorms() { @@ -551,14 +552,20 @@ mod tests { json_term_writer.push_path_segment("bool"); json_term_writer.set_fast_value(true); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.pop_path_segment(); json_term_writer.push_path_segment("complexobject"); json_term_writer.push_path_segment("field.with.dot"); json_term_writer.set_fast_value(1i64); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.pop_path_segment(); json_term_writer.pop_path_segment(); @@ -567,55 +574,85 @@ mod tests { OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(), )); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.pop_path_segment(); json_term_writer.push_path_segment("float"); json_term_writer.set_fast_value(-0.2f64); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.pop_path_segment(); json_term_writer.push_path_segment("my_arr"); json_term_writer.set_fast_value(2i64); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.set_fast_value(3i64); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.set_fast_value(4i64); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.push_path_segment("my_key"); json_term_writer.set_str("tokens"); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.set_str("two"); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.pop_path_segment(); json_term_writer.pop_path_segment(); json_term_writer.push_path_segment("signed"); json_term_writer.set_fast_value(-2i64); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.pop_path_segment(); json_term_writer.push_path_segment("toto"); json_term_writer.set_str("titi"); 
assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); json_term_writer.pop_path_segment(); json_term_writer.push_path_segment("unsigned"); json_term_writer.set_fast_value(1i64); assert!(term_stream.advance()); - assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert_eq!( + term_stream.key(), + json_term_writer.term().serialized_value_bytes() + ); assert!(!term_stream.advance()); } diff --git a/src/postings/json_postings_writer.rs b/src/postings/json_postings_writer.rs index 11315955a..05c9e3d95 100644 --- a/src/postings/json_postings_writer.rs +++ b/src/postings/json_postings_writer.rs @@ -6,7 +6,6 @@ use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::postings_writer::SpecializedPostingsWriter; use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder}; use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter}; -use crate::schema::term::as_json_path_type_value_bytes; use crate::schema::Type; use crate::tokenizer::TokenStream; use crate::{DocId, Term}; @@ -61,8 +60,8 @@ impl PostingsWriter for JsonPostingsWriter { ) -> io::Result<()> { let mut buffer_lender = BufferLender::default(); for (term, addr) in term_addrs { - // TODO optimization opportunity here. - if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) { + if let Some(json_value) = term.value().as_json_value_bytes() { + let typ = json_value.typ(); if typ == Type::Str { SpecializedPostingsWriter::::serialize_one_term( term, diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 611e62dfe..8233de37a 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -171,7 +171,7 @@ impl SpecializedPostingsWriter { ) -> io::Result<()> { let recorder: Rec = ctx.term_index.read(addr); let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32); - serializer.new_term(term.value_bytes(), term_doc_freq)?; + serializer.new_term(term.serialized_value_bytes(), term_doc_freq)?; recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender); serializer.close_term()?; Ok(()) @@ -180,10 +180,10 @@ impl SpecializedPostingsWriter { impl PostingsWriter for SpecializedPostingsWriter { fn subscribe(&mut self, doc: DocId, position: u32, term: &Term, ctx: &mut IndexingContext) { - debug_assert!(term.as_slice().len() >= 4); + debug_assert!(term.serialized_term().len() >= 4); self.total_num_tokens += 1; let (term_index, arena) = (&mut ctx.term_index, &mut ctx.arena); - term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option| { + term_index.mutate_or_create(term.serialized_term(), |opt_recorder: Option| { if let Some(mut recorder) = opt_recorder { let current_doc = recorder.current_doc(); if current_doc != doc { diff --git a/src/query/fuzzy_query.rs b/src/query/fuzzy_query.rs index e61201bb8..1c6b1f479 100644 --- a/src/query/fuzzy_query.rs +++ b/src/query/fuzzy_query.rs @@ -131,7 +131,8 @@ impl FuzzyTermQuery { LevenshteinAutomatonBuilder::new(self.distance, self.transposition_cost_one) }); - let term_text = self.term.as_str().ok_or_else(|| { + let term_value = self.term.value(); + let term_text = term_value.as_str().ok_or_else(|| { InvalidArgument("The fuzzy term query requires a string term.".to_string()) })?; let automaton = if self.prefix { diff --git a/src/query/phrase_prefix_query/phrase_prefix_query.rs 
b/src/query/phrase_prefix_query/phrase_prefix_query.rs index 0c24c9312..4c0e7b9f3 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_query.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_query.rs @@ -138,14 +138,15 @@ impl Query for PhrasePrefixQuery { Ok(Box::new(phrase_weight)) } else { // There are no prefix. Let's just match the suffix. - let end_term = if let Some(end_value) = prefix_end(self.prefix.1.value_bytes()) { - let mut end_term = Term::with_capacity(end_value.len()); - end_term.set_field_and_type(self.field, self.prefix.1.typ()); - end_term.append_bytes(&end_value); - Bound::Excluded(end_term) - } else { - Bound::Unbounded - }; + let end_term = + if let Some(end_value) = prefix_end(self.prefix.1.serialized_value_bytes()) { + let mut end_term = Term::with_capacity(end_value.len()); + end_term.set_field_and_type(self.field, self.prefix.1.typ()); + end_term.append_bytes(&end_value); + Bound::Excluded(end_term) + } else { + Bound::Unbounded + }; let mut range_query = RangeQuery::new_term_bounds( enable_scoring diff --git a/src/query/phrase_prefix_query/phrase_prefix_weight.rs b/src/query/phrase_prefix_query/phrase_prefix_weight.rs index acf964468..2eb99506b 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_weight.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_weight.rs @@ -78,8 +78,11 @@ impl PhrasePrefixWeight { } let inv_index = reader.inverted_index(self.prefix.1.field())?; - let mut stream = inv_index.terms().range().ge(self.prefix.1.value_bytes()); - if let Some(end) = prefix_end(self.prefix.1.value_bytes()) { + let mut stream = inv_index + .terms() + .range() + .ge(self.prefix.1.serialized_value_bytes()); + if let Some(end) = prefix_end(self.prefix.1.serialized_value_bytes()) { stream = stream.lt(&end); } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 57939fdad..a3ca5f2ca 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -952,7 +952,7 @@ mod test { let query = query_parser.parse_query("facet:/root/branch/leaf").unwrap(); assert_eq!( format!("{:?}", query), - r#"TermQuery(Term(type=Facet, field=11, "/root/branch/leaf"))"# + r#"TermQuery(Term(field=11, type=Facet, Facet(/root/branch/leaf)))"# ); } @@ -965,7 +965,7 @@ mod test { let query = query_parser.parse_query("text:hello").unwrap(); assert_eq!( format!("{:?}", query), - r#"Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2)"# + r#"Boost(query=TermQuery(Term(field=1, type=Str, "hello")), boost=2)"# ); } @@ -988,7 +988,7 @@ mod test { let query = query_parser.parse_query("text:hello^2").unwrap(); assert_eq!( format!("{:?}", query), - r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2), boost=2)"# + r#"Boost(query=Boost(query=TermQuery(Term(field=1, type=Str, "hello")), boost=2), boost=2)"# ); } @@ -1027,7 +1027,7 @@ mod test { pub fn test_parse_query_untokenized() { test_parse_query_to_logical_ast_helper( "nottokenized:\"wordone wordtwo\"", - r#"Term(type=Str, field=7, "wordone wordtwo")"#, + r#"Term(field=7, type=Str, "wordone wordtwo")"#, false, ); } @@ -1070,7 +1070,7 @@ mod test { .is_ok()); test_parse_query_to_logical_ast_helper( "unsigned:2324", - "Term(type=U64, field=3, 2324)", + "Term(field=3, type=U64, 2324)", false, ); @@ -1097,7 +1097,7 @@ mod test { fn test_parse_bytes() { test_parse_query_to_logical_ast_helper( "bytes:YnVidQ==", - "Term(type=Bytes, field=12, [98, 117, 98, 117])", + "Term(field=12, type=Bytes, [98, 117, 98, 117])", false, ); } 
@@ -1124,7 +1124,7 @@ mod test { fn test_json_field() { test_parse_query_to_logical_ast_helper( "json.titi:hello", - "Term(type=Json, field=14, path=titi, vtype=Str, \"hello\")", + "Term(field=14, type=Json, path=titi, type=Str, \"hello\")", false, ); } @@ -1136,7 +1136,9 @@ mod test { let LogicalLiteral::Term(term) = *literal else { panic!(); }; - std::str::from_utf8(term.value_bytes()).unwrap().to_string() + std::str::from_utf8(term.serialized_value_bytes()) + .unwrap() + .to_string() } #[test] @@ -1155,17 +1157,17 @@ mod test { fn test_json_field_possibly_a_number() { test_parse_query_to_logical_ast_helper( "json.titi:5", - r#"(Term(type=Json, field=14, path=titi, vtype=U64, 5) Term(type=Json, field=14, path=titi, vtype=Str, "5"))"#, + r#"(Term(field=14, type=Json, path=titi, type=U64, 5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#, true, ); test_parse_query_to_logical_ast_helper( "json.titi:-5", - r#"(Term(type=Json, field=14, path=titi, vtype=I64, -5) Term(type=Json, field=14, path=titi, vtype=Str, "5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-". + r#"(Term(field=14, type=Json, path=titi, type=I64, -5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-". true, ); test_parse_query_to_logical_ast_helper( "json.titi:-5.2", - r#"(Term(type=Json, field=14, path=titi, vtype=F64, -5.2) "[(0, Term(type=Json, field=14, path=titi, vtype=Str, "5")), (1, Term(type=Json, field=14, path=titi, vtype=Str, "2"))]")"#, + r#"(Term(field=14, type=Json, path=titi, type=F64, -5.2) "[(0, Term(field=14, type=Json, path=titi, type=Str, "5")), (1, Term(field=14, type=Json, path=titi, type=Str, "2"))]")"#, true, ); } @@ -1174,7 +1176,7 @@ mod test { fn test_json_field_possibly_a_date() { test_parse_query_to_logical_ast_helper( r#"json.date:"2019-10-12T07:20:50.52Z""#, - r#"(Term(type=Json, field=14, path=date, vtype=Date, 2019-10-12T07:20:50Z) "[(0, Term(type=Json, field=14, path=date, vtype=Str, "2019")), (1, Term(type=Json, field=14, path=date, vtype=Str, "10")), (2, Term(type=Json, field=14, path=date, vtype=Str, "12t07")), (3, Term(type=Json, field=14, path=date, vtype=Str, "20")), (4, Term(type=Json, field=14, path=date, vtype=Str, "50")), (5, Term(type=Json, field=14, path=date, vtype=Str, "52z"))]")"#, + r#"(Term(field=14, type=Json, path=date, type=Date, 2019-10-12T07:20:50Z) "[(0, Term(field=14, type=Json, path=date, type=Str, "2019")), (1, Term(field=14, type=Json, path=date, type=Str, "10")), (2, Term(field=14, type=Json, path=date, type=Str, "12t07")), (3, Term(field=14, type=Json, path=date, type=Str, "20")), (4, Term(field=14, type=Json, path=date, type=Str, "50")), (5, Term(field=14, type=Json, path=date, type=Str, "52z"))]")"#, true, ); } @@ -1183,7 +1185,7 @@ mod test { fn test_json_field_possibly_a_bool() { test_parse_query_to_logical_ast_helper( "json.titi:true", - r#"(Term(type=Json, field=14, path=titi, vtype=Bool, true) Term(type=Json, field=14, path=titi, vtype=Str, "true"))"#, + r#"(Term(field=14, type=Json, path=titi, type=Bool, true) Term(field=14, type=Json, path=titi, type=Str, "true"))"#, true, ); } @@ -1212,8 +1214,8 @@ mod test { fn test_json_default() { test_query_to_logical_ast_with_default_json( "titi:4", - "(Term(type=Json, field=14, path=titi, vtype=U64, 4) Term(type=Json, field=14, \ - path=titi, vtype=Str, \"4\"))", + "(Term(field=14, type=Json, path=titi, type=U64, 4) Term(field=14, type=Json, \ + path=titi, type=Str, \"4\"))", false, ); } @@ 
-1223,7 +1225,7 @@ mod test { for conjunction in [false, true] { test_query_to_logical_ast_with_default_json( "text:4", - r#"Term(type=Str, field=1, "4")"#, + r#"Term(field=1, type=Str, "4")"#, conjunction, ); } @@ -1234,7 +1236,7 @@ mod test { for conjunction in [false, true] { test_query_to_logical_ast_with_default_json( "json:4", - r#"(Term(type=Json, field=14, path=, vtype=U64, 4) Term(type=Json, field=14, path=, vtype=Str, "4"))"#, + r#"(Term(field=14, type=Json, path=, type=U64, 4) Term(field=14, type=Json, path=, type=Str, "4"))"#, conjunction, ); } @@ -1244,7 +1246,7 @@ mod test { fn test_parse_bytes_phrase() { test_parse_query_to_logical_ast_helper( "bytes:\"YnVidQ==\"", - "Term(type=Bytes, field=12, [98, 117, 98, 117])", + "Term(field=12, type=Bytes, [98, 117, 98, 117])", false, ); } @@ -1260,12 +1262,12 @@ mod test { fn test_parse_query_to_ast_ab_c() { test_parse_query_to_logical_ast_helper( "(+title:a +title:b) title:c", - r#"((+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) Term(type=Str, field=0, "c"))"#, + r#"((+Term(field=0, type=Str, "a") +Term(field=0, type=Str, "b")) Term(field=0, type=Str, "c"))"#, false, ); test_parse_query_to_logical_ast_helper( "(+title:a +title:b) title:c", - r#"(+(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) +Term(type=Str, field=0, "c"))"#, + r#"(+(+Term(field=0, type=Str, "a") +Term(field=0, type=Str, "b")) +Term(field=0, type=Str, "c"))"#, true, ); } @@ -1274,17 +1276,17 @@ mod test { pub fn test_parse_query_to_ast_single_term() { test_parse_query_to_logical_ast_helper( "title:toto", - r#"Term(type=Str, field=0, "toto")"#, + r#"Term(field=0, type=Str, "toto")"#, false, ); test_parse_query_to_logical_ast_helper( "+title:toto", - r#"Term(type=Str, field=0, "toto")"#, + r#"Term(field=0, type=Str, "toto")"#, false, ); test_parse_query_to_logical_ast_helper( "+title:toto -titi", - r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#, + r#"(+Term(field=0, type=Str, "toto") -(Term(field=0, type=Str, "titi") Term(field=1, type=Str, "titi")))"#, false, ); } @@ -1301,12 +1303,12 @@ mod test { pub fn test_parse_query_to_ast_two_terms() { test_parse_query_to_logical_ast_helper( "title:a b", - r#"(Term(type=Str, field=0, "a") (Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#, + r#"(Term(field=0, type=Str, "a") (Term(field=0, type=Str, "b") Term(field=1, type=Str, "b")))"#, false, ); test_parse_query_to_logical_ast_helper( r#"title:"a b""#, - r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#, + r#""[(0, Term(field=0, type=Str, "a")), (1, Term(field=0, type=Str, "b"))]""#, false, ); } @@ -1329,37 +1331,37 @@ mod test { pub fn test_parse_query_to_ast_ranges() { test_parse_query_to_logical_ast_helper( "title:[a TO b]", - r#"(Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b")))"#, + r#"(Included(Term(field=0, type=Str, "a")) TO Included(Term(field=0, type=Str, "b")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{titi TO toto}", - r#"(Excluded(Term(type=Str, field=0, "titi")) TO Excluded(Term(type=Str, field=0, "toto")))"#, + r#"(Excluded(Term(field=0, type=Str, "titi")) TO Excluded(Term(field=0, type=Str, "toto")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{* TO toto}", - r#"(Unbounded TO Excluded(Term(type=Str, field=0, "toto")))"#, + r#"(Unbounded TO Excluded(Term(field=0, type=Str, "toto")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{titi TO *}", - 
r#"(Excluded(Term(type=Str, field=0, "titi")) TO Unbounded)"#, + r#"(Excluded(Term(field=0, type=Str, "titi")) TO Unbounded)"#, false, ); test_parse_query_to_logical_ast_helper( "signed:{-5 TO 3}", - r#"(Excluded(Term(type=I64, field=2, -5)) TO Excluded(Term(type=I64, field=2, 3)))"#, + r#"(Excluded(Term(field=2, type=I64, -5)) TO Excluded(Term(field=2, type=I64, 3)))"#, false, ); test_parse_query_to_logical_ast_helper( "float:{-1.5 TO 1.5}", - r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#, + r#"(Excluded(Term(field=10, type=F64, -1.5)) TO Excluded(Term(field=10, type=F64, 1.5)))"#, false, ); test_parse_query_to_logical_ast_helper( "u64_ff:[7 TO 77]", - r#"(Included(Term(type=U64, field=18, 7)) TO Included(Term(type=U64, field=18, 77)))"#, + r#"(Included(Term(field=18, type=U64, 7)) TO Included(Term(field=18, type=U64, 77)))"#, false, ); } @@ -1462,12 +1464,12 @@ mod test { ); test_parse_query_to_logical_ast_helper( r#"date:"2010-11-21T09:55:06.000000000+02:00""#, - r#"Term(type=Date, field=9, 2010-11-21T07:55:06Z)"#, + r#"Term(field=9, type=Date, 2010-11-21T07:55:06Z)"#, true, ); test_parse_query_to_logical_ast_helper( r#"date:"1985-04-12T23:20:50.52Z""#, - r#"Term(type=Date, field=9, 1985-04-12T23:20:50Z)"#, + r#"Term(field=9, type=Date, 1985-04-12T23:20:50Z)"#, true, ); } @@ -1508,27 +1510,27 @@ mod test { pub fn test_parse_query_to_ast_conjunction() { test_parse_query_to_logical_ast_helper( "title:toto", - r#"Term(type=Str, field=0, "toto")"#, + r#"Term(field=0, type=Str, "toto")"#, true, ); test_parse_query_to_logical_ast_helper( "+title:toto", - r#"Term(type=Str, field=0, "toto")"#, + r#"Term(field=0, type=Str, "toto")"#, true, ); test_parse_query_to_logical_ast_helper( "+title:toto -titi", - r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#, + r#"(+Term(field=0, type=Str, "toto") -(Term(field=0, type=Str, "titi") Term(field=1, type=Str, "titi")))"#, true, ); test_parse_query_to_logical_ast_helper( "title:a b", - r#"(+Term(type=Str, field=0, "a") +(Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#, + r#"(+Term(field=0, type=Str, "a") +(Term(field=0, type=Str, "b") Term(field=1, type=Str, "b")))"#, true, ); test_parse_query_to_logical_ast_helper( "title:\"a b\"", - r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#, + r#""[(0, Term(field=0, type=Str, "a")), (1, Term(field=0, type=Str, "b"))]""#, true, ); } @@ -1537,7 +1539,7 @@ mod test { pub fn test_query_parser_hyphen() { test_parse_query_to_logical_ast_helper( "title:www-form-encoded", - r#""[(0, Term(type=Str, field=0, "www")), (1, Term(type=Str, field=0, "form")), (2, Term(type=Str, field=0, "encoded"))]""#, + r#""[(0, Term(field=0, type=Str, "www")), (1, Term(field=0, type=Str, "form")), (2, Term(field=0, type=Str, "encoded"))]""#, false, ); } @@ -1547,7 +1549,7 @@ mod test { for &default_conjunction in &[false, true] { test_parse_query_to_logical_ast_helper( "title:a AND title:b", - r#"(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b"))"#, + r#"(+Term(field=0, type=Str, "a") +Term(field=0, type=Str, "b"))"#, default_conjunction, ); } @@ -1558,7 +1560,7 @@ mod test { for &default_conjunction in &[false, true] { test_parse_query_to_logical_ast_helper( "title:a OR title:b", - r#"(Term(type=Str, field=0, "a") Term(type=Str, field=0, "b"))"#, + r#"(Term(field=0, type=Str, "a") Term(field=0, type=Str, "b"))"#, default_conjunction, ); } @@ -1573,7 +1575,7 @@ mod test { let query = 
query_parser.parse_query(r#"a\.b:hello"#).unwrap(); assert_eq!( format!("{:?}", query), - "TermQuery(Term(type=Str, field=0, \"hello\"))" + "TermQuery(Term(field=0, type=Str, \"hello\"))" ); } @@ -1614,17 +1616,17 @@ mod test { pub fn test_phrase_slop() { test_parse_query_to_logical_ast_helper( "\"a b\"~0", - r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]" "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]")"#, + r#"("[(0, Term(field=0, type=Str, "a")), (1, Term(field=0, type=Str, "b"))]" "[(0, Term(field=1, type=Str, "a")), (1, Term(field=1, type=Str, "b"))]")"#, false, ); test_parse_query_to_logical_ast_helper( "\"a b\"~2", - r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]"~2 "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]"~2)"#, + r#"("[(0, Term(field=0, type=Str, "a")), (1, Term(field=0, type=Str, "b"))]"~2 "[(0, Term(field=1, type=Str, "a")), (1, Term(field=1, type=Str, "b"))]"~2)"#, false, ); test_parse_query_to_logical_ast_helper( "title:\"a b~4\"~2", - r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b")), (2, Term(type=Str, field=0, "4"))]"~2"#, + r#""[(0, Term(field=0, type=Str, "a")), (1, Term(field=0, type=Str, "b")), (2, Term(field=0, type=Str, "4"))]"~2"#, false, ); } @@ -1633,23 +1635,23 @@ mod test { pub fn test_term_set_query() { test_parse_query_to_logical_ast_helper( "title: IN [a b cd]", - r#"IN [Term(type=Str, field=0, "a"), Term(type=Str, field=0, "b"), Term(type=Str, field=0, "cd")]"#, + r#"IN [Term(field=0, type=Str, "a"), Term(field=0, type=Str, "b"), Term(field=0, type=Str, "cd")]"#, false, ); test_parse_query_to_logical_ast_helper( "bytes: IN [AA== ABA= ABCD]", - r#"IN [Term(type=Bytes, field=12, [0]), Term(type=Bytes, field=12, [0, 16]), Term(type=Bytes, field=12, [0, 16, 131])]"#, + r#"IN [Term(field=12, type=Bytes, [0]), Term(field=12, type=Bytes, [0, 16]), Term(field=12, type=Bytes, [0, 16, 131])]"#, false, ); test_parse_query_to_logical_ast_helper( "signed: IN [1 2 -3]", - r#"IN [Term(type=I64, field=2, 1), Term(type=I64, field=2, 2), Term(type=I64, field=2, -3)]"#, + r#"IN [Term(field=2, type=I64, 1), Term(field=2, type=I64, 2), Term(field=2, type=I64, -3)]"#, false, ); test_parse_query_to_logical_ast_helper( "float: IN [1.1 2.2 -3.3]", - r#"IN [Term(type=F64, field=10, 1.1), Term(type=F64, field=10, 2.2), Term(type=F64, field=10, -3.3)]"#, + r#"IN [Term(field=10, type=F64, 1.1), Term(field=10, type=F64, 2.2), Term(field=10, type=F64, -3.3)]"#, false, ); } @@ -1667,9 +1669,9 @@ mod test { let query = query_parser.parse_query("abc").unwrap(); assert_eq!( format!("{:?}", query), - "BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(type=Str, \ - field=0, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \ - (Should, TermQuery(Term(type=Str, field=1, \"abc\")))] }" + "BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(field=0, \ + type=Str, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \ + (Should, TermQuery(Term(field=1, type=Str, \"abc\")))] }" ); } @@ -1684,8 +1686,8 @@ mod test { let query = query_parser.parse_query("abc").unwrap(); assert_eq!( format!("{:?}", query), - "BooleanQuery { subqueries: [(Should, TermQuery(Term(type=Str, field=0, \ - \"abc\"))), (Should, FuzzyTermQuery { term: Term(type=Str, field=1, \"abc\"), \ + "BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \ + \"abc\"))), (Should, FuzzyTermQuery { term: Term(field=1, 
type=Str, \"abc\"), \ distance: 2, transposition_cost_one: false, prefix: true })] }" ); } diff --git a/src/query/range_query/range_query.rs b/src/query/range_query/range_query.rs index 61b65f34c..8e080368f 100644 --- a/src/query/range_query/range_query.rs +++ b/src/query/range_query/range_query.rs @@ -85,7 +85,7 @@ impl RangeQuery { left_bound: &Bound, right_bound: &Bound, ) -> RangeQuery { - let verify_and_unwrap_term = |val: &Term| val.value_bytes().to_owned(); + let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned(); RangeQuery { field, value_type, @@ -121,7 +121,7 @@ impl RangeQuery { ) -> RangeQuery { let make_term_val = |val: &i64| { Term::from_field_i64(Field::from_field_id(0), *val) - .value_bytes() + .serialized_value_bytes() .to_owned() }; RangeQuery { @@ -159,7 +159,7 @@ impl RangeQuery { ) -> RangeQuery { let make_term_val = |val: &f64| { Term::from_field_f64(Field::from_field_id(0), *val) - .value_bytes() + .serialized_value_bytes() .to_owned() }; RangeQuery { @@ -185,7 +185,7 @@ impl RangeQuery { ) -> RangeQuery { let make_term_val = |val: &u64| { Term::from_field_u64(Field::from_field_id(0), *val) - .value_bytes() + .serialized_value_bytes() .to_owned() }; RangeQuery { @@ -208,7 +208,7 @@ impl RangeQuery { ) -> RangeQuery { let make_term_val = |val: &Ipv6Addr| { Term::from_field_ip_addr(Field::from_field_id(0), *val) - .value_bytes() + .serialized_value_bytes() .to_owned() }; RangeQuery { @@ -246,7 +246,7 @@ impl RangeQuery { ) -> RangeQuery { let make_term_val = |val: &DateTime| { Term::from_field_date(Field::from_field_id(0), *val) - .value_bytes() + .serialized_value_bytes() .to_owned() }; RangeQuery { diff --git a/src/query/set_query.rs b/src/query/set_query.rs index 65f97fd6c..ab62c5044 100644 --- a/src/query/set_query.rs +++ b/src/query/set_query.rs @@ -47,8 +47,12 @@ impl TermSetQuery { // In practice this won't fail because: // - we are writing to memory, so no IoError // - Terms are ordered - let map = Map::from_iter(sorted_terms.iter().map(|key| (key.value_bytes(), 0))) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + let map = Map::from_iter( + sorted_terms + .iter() + .map(|key| (key.serialized_value_bytes(), 0)), + ) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; sub_queries.push(( Occur::Should, diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 8ea4b1ce2..a017b6035 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -175,7 +175,7 @@ mod tests { ); assert_eq!( format!("{:?}", term_query), - r#"TermQuery(Term(type=Str, field=1, "hello"))"# + r#"TermQuery(Term(field=1, type=Str, "hello"))"# ); } diff --git a/src/schema/date_time_options.rs b/src/schema/date_time_options.rs index fb73ad808..dc6b28dce 100644 --- a/src/schema/date_time_options.rs +++ b/src/schema/date_time_options.rs @@ -5,6 +5,9 @@ use serde::{Deserialize, Serialize}; use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; +/// The precision of the indexed date/time values in the inverted index. +pub const DATE_TIME_PRECISION_INDEXED: DatePrecision = DatePrecision::Seconds; + /// Defines how DateTime field should be handled by tantivy. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct DateOptions { @@ -85,7 +88,8 @@ impl DateOptions { self } - /// Sets the precision for this DateTime field. + /// Sets the precision for this DateTime field on the fast field. + /// Indexed precision is always [`DATE_TIME_PRECISION_INDEXED`]. 
/// /// Internal storage precision, used to optimize storage /// compression on fast fields. diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 5fc1159bd..3e63249df 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -129,7 +129,7 @@ mod value; use columnar::ColumnType; pub use self::bytes_options::BytesOptions; -pub use self::date_time_options::{DateOptions, DatePrecision}; +pub use self::date_time_options::{DateOptions, DatePrecision, DATE_TIME_PRECISION_INDEXED}; pub use self::document::Document; pub(crate) use self::facet::FACET_SEP_BYTE; pub use self::facet::{Facet, FacetParseError}; @@ -147,7 +147,7 @@ pub use self::named_field_document::NamedFieldDocument; pub use self::numeric_options::IntOptions; pub use self::numeric_options::NumericOptions; pub use self::schema::{DocParsingError, Schema, SchemaBuilder}; -pub use self::term::Term; +pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH}; pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT}; pub use self::value::Value; diff --git a/src/schema/term.rs b/src/schema/term.rs index da5d236e9..e1411e7c0 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -5,10 +5,11 @@ use std::{fmt, str}; use columnar::MonotonicallyMappableToU128; +use super::date_time_options::DATE_TIME_PRECISION_INDEXED; use super::Field; use crate::fastfield::FastValue; use crate::schema::{Facet, Type}; -use crate::{DatePrecision, DateTime}; +use crate::DateTime; /// Separates the different segments of a json path. pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8; @@ -20,8 +21,12 @@ pub const JSON_PATH_SEGMENT_SEP_STR: &str = pub const JSON_END_OF_PATH: u8 = 0u8; /// Term represents the value that the token can take. +/// It's a serialized representation over different types. /// -/// It actually wraps a `Vec`. +/// It actually wraps a `Vec`. The first 5 bytes are metadata. +/// 4 bytes are the field id, and the last byte is the type. +/// +/// The serialized value `ValueBytes` is considered everything after the 4 first bytes (term id). #[derive(Clone)] pub struct Term>(B) where B: AsRef<[u8]>; @@ -100,7 +105,7 @@ impl Term { /// Builds a term given a field, and a `DateTime` value pub fn from_field_date(field: Field, val: DateTime) -> Term { - Term::from_fast_value(field, &val.truncate(DatePrecision::Seconds)) + Term::from_fast_value(field, &val.truncate(DATE_TIME_PRECISION_INDEXED)) } /// Creates a `Term` given a facet. @@ -186,11 +191,6 @@ impl Term { self.0.truncate(len + TERM_METADATA_LENGTH); } - /// Returns the value bytes as mutable slice - pub fn value_bytes_mut(&mut self) -> &mut [u8] { - &mut self.0[TERM_METADATA_LENGTH..] - } - /// The length of the bytes. pub fn len_bytes(&self) -> usize { self.0.len() - TERM_METADATA_LENGTH @@ -206,44 +206,25 @@ impl Term { &mut self.0[len_before..] } - /// Appends a single byte to the term. + /// Appends a JSON_PATH_SEGMENT_SEP to the term. + /// Only used for JSON type. #[inline] - pub fn push_byte(&mut self, byte: u8) { - self.0.push(byte); + pub fn add_json_path_separator(&mut self) { + self.0.push(JSON_PATH_SEGMENT_SEP); } -} - -impl Ord for Term -where B: AsRef<[u8]> -{ - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.as_slice().cmp(other.as_slice()) + /// Sets the current end to JSON_END_OF_PATH. + /// Only used for JSON type. 
+ #[inline] + pub fn set_json_path_end(&mut self) { + let buffer_len = self.0.len(); + self.0[buffer_len - 1] = JSON_END_OF_PATH; } -} - -impl PartialOrd for Term -where B: AsRef<[u8]> -{ - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for Term -where B: AsRef<[u8]> -{ - fn eq(&self, other: &Self) -> bool { - self.as_slice() == other.as_slice() - } -} - -impl Eq for Term where B: AsRef<[u8]> {} - -impl Hash for Term -where B: AsRef<[u8]> -{ - fn hash(&self, state: &mut H) { - self.0.as_ref().hash(state) + /// Sets the current end to JSON_PATH_SEGMENT_SEP. + /// Only used for JSON type. + #[inline] + pub fn set_json_path_separator(&mut self) { + let buffer_len = self.0.len(); + self.0[buffer_len - 1] = JSON_PATH_SEGMENT_SEP; } } @@ -255,11 +236,68 @@ where B: AsRef<[u8]> Term(data) } + /// Return the type of the term. + pub fn typ(&self) -> Type { + self.value().typ() + } + + /// Returns the field. + pub fn field(&self) -> Field { + let field_id_bytes: [u8; 4] = (&self.0.as_ref()[..4]).try_into().unwrap(); + Field::from_field_id(u32::from_be_bytes(field_id_bytes)) + } + + /// Returns the serialized representation of the value. + /// (this does neither include the field id nor the value type.) + /// + /// If the term is a string, its value is utf-8 encoded. + /// If the term is a u64, its value is encoded according + /// to `byteorder::BigEndian`. + pub fn serialized_value_bytes(&self) -> &[u8] { + &self.0.as_ref()[TERM_METADATA_LENGTH..] + } + + /// Returns the value of the term. + /// address or JSON path + value. (this does not include the field.) + pub fn value(&self) -> ValueBytes<&[u8]> { + ValueBytes::wrap(&self.0.as_ref()[4..]) + } + + /// Returns the serialized representation of Term. + /// This includes field_id, value type and value. + /// + /// Do NOT rely on this byte representation in the index. + /// This value is likely to change in the future. + pub fn serialized_term(&self) -> &[u8] { + self.0.as_ref() + } +} + +/// ValueBytes represents a serialized value. +/// The value can be of any type of [`Type`] (e.g. string, u64, f64, bool, date, JSON). +/// The serialized representation matches the lexographical order of the type. +/// +/// The `ValueBytes` format is as follow: +/// `[type code: u8][serialized value]` +/// +/// For JSON `ValueBytes` equals to: +/// `[type code=JSON][JSON path][JSON_END_OF_PATH][ValueBytes]` +/// +/// The nested ValueBytes in JSON is never of type JSON. (there's no recursion) +#[derive(Clone)] +pub struct ValueBytes(B) +where B: AsRef<[u8]>; + +impl ValueBytes +where B: AsRef<[u8]> +{ + /// Wraps a object holding bytes + pub fn wrap(data: B) -> ValueBytes { + ValueBytes(data) + } + fn typ_code(&self) -> u8 { - *self - .as_slice() - .get(4) - .expect("the byte representation is too short") + self.0.as_ref()[0] } /// Return the type of the term. @@ -267,13 +305,6 @@ where B: AsRef<[u8]> Type::from_code(self.typ_code()).expect("The term has an invalid type code") } - /// Returns the field. - pub fn field(&self) -> Field { - let mut field_id_bytes = [0u8; 4]; - field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]); - Field::from_field_id(u32::from_be_bytes(field_id_bytes)) - } - /// Returns the `u64` value stored in a term. 
/// /// Returns `None` if the term is not of the u64 type, or if the term byte representation @@ -286,13 +317,8 @@ where B: AsRef<[u8]> if self.typ() != T::to_type() { return None; } - let mut value_bytes = [0u8; 8]; - let bytes = self.value_bytes(); - if bytes.len() != 8 { - return None; - } - value_bytes.copy_from_slice(self.value_bytes()); - let value_u64 = u64::from_be_bytes(value_bytes); + let value_bytes = self.value_bytes(); + let value_u64 = u64::from_be_bytes(value_bytes.try_into().ok()?); Some(T::from_u64(value_u64)) } @@ -361,23 +387,133 @@ where B: AsRef<[u8]> Some(self.value_bytes()) } - /// Returns the serialized value of the term. - /// (this does not include the field.) - /// - /// If the term is a string, its value is utf-8 encoded. - /// If the term is a u64, its value is encoded according - /// to `byteorder::BigEndian`. - pub fn value_bytes(&self) -> &[u8] { - &self.0.as_ref()[TERM_METADATA_LENGTH..] + /// Returns a `Ipv6Addr` value from the term. + pub fn as_ip_addr(&self) -> Option { + if self.typ() != Type::IpAddr { + return None; + } + let ip_u128 = u128::from_be_bytes(self.value_bytes().try_into().ok()?); + Some(Ipv6Addr::from_u128(ip_u128)) } - /// Returns the underlying `&[u8]`. + /// Returns the json path (without non-human friendly separators), + /// and the encoded ValueBytes after the json path. + /// + /// Returns `None` if the value is not JSON. + pub(crate) fn as_json(&self) -> Option<(&str, ValueBytes<&[u8]>)> { + if self.typ() != Type::Json { + return None; + } + let bytes = self.value_bytes(); + + let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?; + let (json_path_bytes, term) = bytes.split_at(pos); + let json_path = str::from_utf8(json_path_bytes).ok()?; + Some((json_path, ValueBytes::wrap(&term[1..]))) + } + + /// Returns the encoded ValueBytes after the json path. + /// + /// Returns `None` if the value is not JSON. + pub(crate) fn as_json_value_bytes(&self) -> Option> { + if self.typ() != Type::Json { + return None; + } + let bytes = self.value_bytes(); + let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?; + Some(ValueBytes::wrap(&bytes[pos + 1..])) + } + + /// Returns the serialized value of ValueBytes without the type. + fn value_bytes(&self) -> &[u8] { + &self.0.as_ref()[1..] + } + + /// Returns the serialized representation of Term. /// /// Do NOT rely on this byte representation in the index. /// This value is likely to change in the future. - pub fn as_slice(&self) -> &[u8] { + pub fn as_serialized(&self) -> &[u8] { self.0.as_ref() } + + fn debug_value_bytes(&self, f: &mut fmt::Formatter) -> fmt::Result { + let typ = self.typ(); + write!(f, "type={typ:?}, ")?; + match typ { + Type::Str => { + let s = self.as_str(); + write_opt(f, s)?; + } + Type::U64 => { + write_opt(f, self.as_u64())?; + } + Type::I64 => { + write_opt(f, self.as_i64())?; + } + Type::F64 => { + write_opt(f, self.as_f64())?; + } + Type::Bool => { + write_opt(f, self.as_bool())?; + } + // TODO pretty print these types too. 
+ Type::Date => { + write_opt(f, self.as_date())?; + } + Type::Facet => { + write_opt(f, self.as_facet())?; + } + Type::Bytes => { + write_opt(f, self.as_bytes())?; + } + Type::Json => { + if let Some((path, sub_value_bytes)) = self.as_json() { + let path_pretty = path.replace(JSON_PATH_SEGMENT_SEP_STR, "."); + write!(f, "path={path_pretty}, ")?; + sub_value_bytes.debug_value_bytes(f)?; + } + } + Type::IpAddr => { + write_opt(f, self.as_ip_addr())?; + } + } + Ok(()) + } +} + +impl Ord for Term +where B: AsRef<[u8]> +{ + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.serialized_term().cmp(other.serialized_term()) + } +} + +impl PartialOrd for Term +where B: AsRef<[u8]> +{ + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for Term +where B: AsRef<[u8]> +{ + fn eq(&self, other: &Self) -> bool { + self.serialized_term() == other.serialized_term() + } +} + +impl Eq for Term where B: AsRef<[u8]> {} + +impl Hash for Term +where B: AsRef<[u8]> +{ + fn hash(&self, state: &mut H) { + self.0.as_ref().hash(state) + } } fn write_opt(f: &mut fmt::Formatter, val_opt: Option) -> fmt::Result { @@ -387,80 +523,14 @@ fn write_opt(f: &mut fmt::Formatter, val_opt: Option) -> Ok(()) } -fn as_str(value_bytes: &[u8]) -> Option<&str> { - std::str::from_utf8(value_bytes).ok() -} - -fn get_fast_type(bytes: &[u8]) -> Option { - let value_u64 = u64::from_be_bytes(bytes.try_into().ok()?); - Some(T::from_u64(value_u64)) -} - -/// Returns the json path (without non-human friendly separators, the type of the value, and the -/// value bytes). Returns `None` if the value is not JSON or is not valid. -pub(crate) fn as_json_path_type_value_bytes(bytes: &[u8]) -> Option<(&str, Type, &[u8])> { - let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?; - let json_path = str::from_utf8(&bytes[..pos]).ok()?; - let type_code = *bytes.get(pos + 1)?; - let typ = Type::from_code(type_code)?; - Some((json_path, typ, &bytes[pos + 2..])) -} - -fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Result { - match typ { - Type::Str => { - let s = as_str(bytes); - write_opt(f, s)?; - } - Type::U64 => { - write_opt(f, get_fast_type::(bytes))?; - } - Type::I64 => { - write_opt(f, get_fast_type::(bytes))?; - } - Type::F64 => { - write_opt(f, get_fast_type::(bytes))?; - } - Type::Bool => { - write_opt(f, get_fast_type::(bytes))?; - } - // TODO pretty print these types too. - Type::Date => { - write_opt(f, get_fast_type::(bytes))?; - } - Type::Facet => { - let facet_str = str::from_utf8(bytes) - .ok() - .map(ToString::to_string) - .map(Facet::from_encoded_string) - .map(|facet| facet.to_path_string()); - write_opt(f, facet_str)?; - } - Type::Bytes => { - write_opt(f, Some(bytes))?; - } - Type::Json => { - if let Some((path, typ, bytes)) = as_json_path_type_value_bytes(bytes) { - let path_pretty = path.replace(JSON_PATH_SEGMENT_SEP_STR, "."); - write!(f, "path={path_pretty}, vtype={typ:?}, ")?; - debug_value_bytes(typ, bytes, f)?; - } - } - Type::IpAddr => { - write!(f, "")?; // TODO change once we actually have IP address terms. 
- } - } - Ok(()) -} - impl fmt::Debug for Term where B: AsRef<[u8]> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let field_id = self.field().field_id(); - let typ = self.typ(); - write!(f, "Term(type={typ:?}, field={field_id}, ")?; - debug_value_bytes(typ, self.value_bytes(), f)?; + write!(f, "Term(field={field_id}, ")?; + let value_bytes = ValueBytes::wrap(&self.0.as_ref()[4..]); + value_bytes.debug_value_bytes(f)?; write!(f, ")",)?; Ok(()) } @@ -479,7 +549,7 @@ mod tests { let term = Term::from_field_text(title_field, "test"); assert_eq!(term.field(), title_field); assert_eq!(term.typ(), Type::Str); - assert_eq!(term.as_str(), Some("test")) + assert_eq!(term.value().as_str(), Some("test")) } /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term. @@ -501,8 +571,8 @@ mod tests { let term = Term::from_field_u64(count_field, 983u64); assert_eq!(term.field(), count_field); assert_eq!(term.typ(), Type::U64); - assert_eq!(term.as_slice().len(), FAST_VALUE_TERM_LEN); - assert_eq!(term.as_u64(), Some(983u64)) + assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN); + assert_eq!(term.value().as_u64(), Some(983u64)) } #[test] @@ -512,7 +582,7 @@ mod tests { let term = Term::from_field_bool(bool_field, true); assert_eq!(term.field(), bool_field); assert_eq!(term.typ(), Type::Bool); - assert_eq!(term.as_slice().len(), FAST_VALUE_TERM_LEN); - assert_eq!(term.as_bool(), Some(true)) + assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN); + assert_eq!(term.value().as_bool(), Some(true)) } } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index a63527849..09664968e 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -310,7 +310,8 @@ impl SnippetGenerator { }); let mut terms_text: BTreeMap = Default::default(); for term in terms { - let term_str = if let Some(term_str) = term.as_str() { + let term_value = term.value(); + let term_str = if let Some(term_str) = term_value.as_str() { term_str } else { continue;