diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 34b7b7daa..d8a1ea9b8 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -307,16 +307,16 @@ impl IndexMerger {
                     }
                     None => {}
                 },
-                FieldType::Str(_) => {
-                    // We don't handle str fast field for the moment
-                    // They can be implemented using what is done
-                    // for facets in the future.
-                }
                 FieldType::Bytes(byte_options) => {
                     if byte_options.is_fast() {
                         self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
                     }
                 }
+                FieldType::Str(_) | FieldType::JsonObject(_) => {
+                    // We don't handle json / string fast fields for the moment.
+                    // They can be implemented using what is done
+                    // for facets in the future.
+                }
             }
         }
         Ok(())
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index 746747e5a..0dccc3ef7 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -276,6 +276,9 @@ impl SegmentWriter {
                         postings_writer.subscribe(doc_id, 0u32, term_buffer, indexing_context);
                     }
                 }
+                FieldType::JsonObject(_) => {
+                    unimplemented!()
+                }
             }
         }
         Ok(())
diff --git a/src/postings/json_postings_writer.rs b/src/postings/json_postings_writer.rs
new file mode 100644
index 000000000..9698b0ec3
--- /dev/null
+++ b/src/postings/json_postings_writer.rs
@@ -0,0 +1,52 @@
+use std::io;
+
+use crate::indexer::doc_id_mapping::DocIdMapping;
+use crate::postings::postings_writer::SpecializedPostingsWriter;
+use crate::postings::recorder::{NothingRecorder, Recorder};
+use crate::postings::stacker::Addr;
+use crate::postings::{FieldSerializer, IndexingContext, PostingsWriter, UnorderedTermId};
+use crate::Term;
+
+pub struct JsonPostingsWriter {
+    str_posting_writer: Box<dyn PostingsWriter>,
+    non_str_posting_writer: Box<dyn PostingsWriter>,
+}
+
+impl JsonPostingsWriter {
+    pub(crate) fn new<Rec: Recorder>() -> Self {
+        JsonPostingsWriter {
+            str_posting_writer: SpecializedPostingsWriter::<Rec>::new_boxed(),
+            non_str_posting_writer: SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
+        }
+    }
+}
+
+impl PostingsWriter for JsonPostingsWriter {
+    fn subscribe(
+        &mut self,
+        doc: crate::DocId,
+        pos: u32,
+        term: &crate::Term,
+        ctx: &mut IndexingContext,
+    ) -> UnorderedTermId {
+        let term_type = term.typ();
+        todo!()
+    }
+
+    /// The actual serialization format is handled by the `PostingsSerializer`.
+    fn serialize(
+        &self,
+        term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
+        doc_id_map: Option<&DocIdMapping>,
+        indexing_context: &IndexingContext,
+        serializer: &mut FieldSerializer,
+    ) -> io::Result<()> {
+        todo!()
+    }
+
+    fn total_num_tokens(&self) -> u64 {
+        todo!()
+    }
+}
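The two `todo!()`s above are placeholders. A plausible body for `subscribe` — a sketch only, mirroring the `JsonPostingsWriter` added to `postings_writer.rs` further down in this diff, and assuming a `use crate::schema::Type;` import — would dispatch on the leaf type of the JSON term:

```rust
impl PostingsWriter for JsonPostingsWriter {
    fn subscribe(
        &mut self,
        doc: crate::DocId,
        pos: u32,
        term: &crate::Term,
        ctx: &mut IndexingContext,
    ) -> UnorderedTermId {
        debug_assert!(term.is_json());
        if term.typ() == Type::Str {
            // String leaves keep the configured recorder (freqs/positions).
            self.str_posting_writer.subscribe(doc, pos, term, ctx)
        } else {
            // Non-string leaves only record doc ids (NothingRecorder).
            self.non_str_posting_writer.subscribe(doc, pos, term, ctx)
        }
    }
    // serialize / total_num_tokens omitted
}
```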
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 4df8beee7..f9ba7e731 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -15,6 +15,7 @@ mod segment_postings;
 mod serializer;
 mod skip;
 mod stacker;
+mod json_postings_writer;
 mod term_info;
 
 pub use self::block_segment_postings::BlockSegmentPostings;
diff --git a/src/postings/per_field_postings_writer.rs b/src/postings/per_field_postings_writer.rs
index 4c32cb51e..8b17dd483 100644
--- a/src/postings/per_field_postings_writer.rs
+++ b/src/postings/per_field_postings_writer.rs
@@ -1,3 +1,4 @@
+use crate::postings::json_postings_writer::JsonPostingsWriter;
 use crate::postings::postings_writer::SpecializedPostingsWriter;
 use crate::postings::recorder::{NothingRecorder, TermFrequencyRecorder, TfAndPositionRecorder};
 use crate::postings::PostingsWriter;
@@ -49,5 +50,20 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
             SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
+        FieldType::JsonObject(ref json_object_options) => {
+            Box::new(if let Some(text_indexing_option) =
+                json_object_options.get_text_indexing_option()
+            {
+                match text_indexing_option.index_option() {
+                    IndexRecordOption::Basic => JsonPostingsWriter::new::<NothingRecorder>(),
+                    IndexRecordOption::WithFreqs => {
+                        JsonPostingsWriter::new::<TermFrequencyRecorder>()
+                    }
+                    IndexRecordOption::WithFreqsAndPositions => {
+                        JsonPostingsWriter::new::<TfAndPositionRecorder>()
+                    }
+                }
+            } else {
+                JsonPostingsWriter::new::<NothingRecorder>()
+            })
+        },
     }
 }
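For context, `JsonObjectOptions` (introduced later in this diff) derives `Serialize`/`Deserialize`, so the configuration matched above can come straight from a schema definition. A sketch, assuming `TextFieldIndexing` keeps its existing serde layout (`record`/`fieldnorms`/`tokenizer`, with `record` values such as `"position"`):

```rust
use tantivy::schema::{IndexRecordOption, JsonObjectOptions};

fn main() {
    let opts: JsonObjectOptions = serde_json::from_str(
        r#"{
            "stored": true,
            "indexing": { "record": "position", "fieldnorms": true, "tokenizer": "default" }
        }"#,
    )
    .unwrap();
    assert!(opts.is_stored());
    // "position" maps to WithFreqsAndPositions -> TfAndPositionRecorder above.
    assert_eq!(
        opts.get_text_indexing_option().map(|opt| opt.index_option()),
        Some(IndexRecordOption::WithFreqsAndPositions)
    );
}
```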
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index bee4c301c..2b52ea913 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -2,13 +2,12 @@ use std::collections::HashMap;
 use std::io;
 use std::marker::PhantomData;
 use std::ops::Range;
-
 use fnv::FnvHashMap;
 
 use super::stacker::Addr;
 use crate::fieldnorm::FieldNormReaders;
 use crate::indexer::doc_id_mapping::DocIdMapping;
-use crate::postings::recorder::{BufferLender, Recorder};
+use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder};
 use crate::postings::{
     FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
     UnorderedTermId,
@@ -85,6 +84,7 @@ pub(crate) fn serialize_postings(
         }
         FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {}
         FieldType::Bytes(_) => {}
+        FieldType::JsonObject(_) => {}
     }
 
     let postings_writer = per_field_postings_writers.get_for_field(field);
@@ -175,29 +175,121 @@ pub(crate) trait PostingsWriter {
     fn total_num_tokens(&self) -> u64;
 }
 
+pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
+    text_postings_writer: SpecializedPostingsWriter<Rec>,
+    other_postings_writer: SpecializedPostingsWriter<NothingRecorder>,
+}
+
+impl<Rec: Recorder> JsonPostingsWriter<Rec> {
+    pub fn new_boxed() -> Box<dyn PostingsWriter> {
+        let text_postings_writer: SpecializedPostingsWriter<Rec> = SpecializedPostingsWriter {
+            total_num_tokens: 0u64,
+            _recorder_type: PhantomData,
+        };
+        let other_postings_writer: SpecializedPostingsWriter<NothingRecorder> =
+            SpecializedPostingsWriter {
+                total_num_tokens: 0u64,
+                _recorder_type: PhantomData,
+            };
+        Box::new(JsonPostingsWriter {
+            text_postings_writer,
+            other_postings_writer,
+        })
+    }
+}
+
+impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
+    fn subscribe(
+        &mut self,
+        doc: DocId,
+        pos: u32,
+        term: &Term,
+        ctx: &mut IndexingContext,
+    ) -> UnorderedTermId {
+        // TODO will the unordered term id be correct!?
+        debug_assert!(term.is_json());
+        if term.typ() == Type::Str {
+            self.text_postings_writer
+                .subscribe(doc, pos, term, ctx)
+        } else {
+            self.other_postings_writer
+                .subscribe(doc, pos, term, ctx)
+        }
+    }
+
+    fn serialize(
+        &self,
+        term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
+        doc_id_map: Option<&DocIdMapping>,
+        ctx: &IndexingContext,
+        serializer: &mut FieldSerializer,
+    ) -> io::Result<()> {
+        let mut buffer_lender = BufferLender::default();
+        for (term, addr, _) in term_addrs {
+            if term.typ() == Type::Str {
+                SpecializedPostingsWriter::<Rec>::serialize_one_term(
+                    term,
+                    *addr,
+                    doc_id_map,
+                    &mut buffer_lender,
+                    ctx,
+                    serializer,
+                )?;
+            } else {
+                SpecializedPostingsWriter::<NothingRecorder>::serialize_one_term(
+                    term,
+                    *addr,
+                    doc_id_map,
+                    &mut buffer_lender,
+                    ctx,
+                    serializer,
+                )?;
+            }
+        }
+        Ok(())
+    }
+
+    fn total_num_tokens(&self) -> u64 {
+        self.text_postings_writer.total_num_tokens()
+            + self.other_postings_writer.total_num_tokens()
+    }
+}
+
 /// The `SpecializedPostingsWriter` is just here to remove dynamic
 /// dispatch to the recorder information.
-pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
+#[derive(Default)]
+pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
     total_num_tokens: u64,
     _recorder_type: PhantomData<Rec>,
 }
 
-impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
-    /// constructor
-    pub fn new() -> SpecializedPostingsWriter<Rec> {
-        SpecializedPostingsWriter {
+impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
+    pub fn new_boxed() -> Box<dyn PostingsWriter> {
+        let new_specialized_posting_writer: Self = Self {
             total_num_tokens: 0u64,
             _recorder_type: PhantomData,
-        }
+        };
+        Box::new(new_specialized_posting_writer)
     }
 
-    /// Builds a `SpecializedPostingsWriter` storing its data in a memory arena.
-    pub fn new_boxed() -> Box<dyn PostingsWriter> {
-        Box::new(SpecializedPostingsWriter::<Rec>::new())
+    #[inline]
+    fn serialize_one_term(
+        term: &Term<&[u8]>,
+        addr: Addr,
+        doc_id_map: Option<&DocIdMapping>,
+        buffer_lender: &mut BufferLender,
+        ctx: &IndexingContext,
+        serializer: &mut FieldSerializer,
+    ) -> io::Result<()> {
+        let recorder: Rec = ctx.arena.read(addr);
+        let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
+        serializer.new_term(term.value_bytes(), term_doc_freq)?;
+        recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
+        serializer.close_term()?;
+        Ok(())
     }
 }
 
-impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
+impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
     fn subscribe(
         &mut self,
         doc: DocId,
@@ -233,21 +325,19 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
         &self,
         term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
         doc_id_map: Option<&DocIdMapping>,
-        indexing_context: &IndexingContext,
+        ctx: &IndexingContext,
         serializer: &mut FieldSerializer,
     ) -> io::Result<()> {
         let mut buffer_lender = BufferLender::default();
         for (term, addr, _) in term_addrs {
-            let recorder: Rec = indexing_context.term_index.read(*addr);
-            let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
-            serializer.new_term(term.value_bytes(), term_doc_freq)?;
-            recorder.serialize(
-                &indexing_context.arena,
+            Self::serialize_one_term(
+                term,
+                *addr,
                 doc_id_map,
-                serializer,
                 &mut buffer_lender,
-            );
-            serializer.close_term()?;
+                ctx,
+                serializer,
+            )?;
         }
         Ok(())
     }
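The dispatch above relies on `Term::typ()` still returning the *leaf* type for JSON terms. That works because the JSON marker only occupies the most significant bit of the type byte (see the `term.rs` changes below). A standalone sketch of the bit arithmetic, assuming `Type::Str`'s code is `b's'` as in the existing `Type` enum:

```rust
fn main() {
    const JSON_MARKER_BIT: u8 = 128;
    let type_byte = JSON_MARKER_BIT | b's'; // a json term with a Str leaf
    assert_eq!(type_byte & JSON_MARKER_BIT, JSON_MARKER_BIT); // -> is_json() == true
    assert_eq!(type_byte & (JSON_MARKER_BIT - 1), b's'); // -> typ() == Type::Str
}
```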
diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs
index 90c37b654..c8fb49ce8 100644
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -76,7 +76,7 @@ impl InvertedIndexSerializer {
         field: Field,
         total_num_tokens: u64,
         fieldnorm_reader: Option<FieldNormReader>,
-    ) -> io::Result<FieldSerializer<'_>> {
+    ) -> io::Result<FieldSerializer> {
         let field_entry: &FieldEntry = self.schema.get_field_entry(field);
         let term_dictionary_write = self.terms_write.for_field(field);
         let postings_write = self.postings_write.for_field(field);
@@ -203,6 +203,7 @@ impl<'a> FieldSerializer<'a> {
         self.current_term_info.doc_freq += 1;
         self.postings_serializer.write_doc(doc_id, term_freq);
         if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
+            assert_eq!(term_freq as usize, position_deltas.len());
             positions_serializer.write_positions_delta(position_deltas);
        }
    }
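The new assertion pins down an invariant of the postings format: a document posting with term frequency `k` must be followed by exactly `k` position deltas. A tiny self-contained illustration with made-up numbers:

```rust
fn main() {
    // Positions 2, 7, 8 in a document are delta-encoded as gaps
    // from the previous position.
    let term_freq: u32 = 3;
    let position_deltas: &[u32] = &[2, 5, 1];
    assert_eq!(term_freq as usize, position_deltas.len());
}
```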
field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, false, ); test_parse_query_to_logical_ast_helper( "+title:toto", - r#"Term(type=Str, field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, false, ); test_parse_query_to_logical_ast_helper( "+title:toto -titi", - r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#, + r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#, false, ); } @@ -878,12 +881,12 @@ mod test { pub fn test_parse_query_to_ast_two_terms() { test_parse_query_to_logical_ast_helper( "title:a b", - r#"(Term(type=Str, field=0, val="a") (Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#, + r#"(Term(type=Str, field=0, "a") (Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#, false, ); test_parse_query_to_logical_ast_helper( r#"title:"a b""#, - r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#, + r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#, false, ); } @@ -892,37 +895,37 @@ mod test { pub fn test_parse_query_to_ast_ranges() { test_parse_query_to_logical_ast_helper( "title:[a TO b]", - r#"(Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b")))"#, + r#"(Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b")))"#, false, ); test_parse_query_to_logical_ast_helper( "[a TO b]", - r#"((Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b"))) (Included(Term(type=Str, field=1, val="a")) TO Included(Term(type=Str, field=1, val="b"))))"#, + r#"((Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b"))) (Included(Term(type=Str, field=1, "a")) TO Included(Term(type=Str, field=1, "b"))))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{titi TO toto}", - r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Excluded(Term(type=Str, field=0, val="toto")))"#, + r#"(Excluded(Term(type=Str, field=0, "titi")) TO Excluded(Term(type=Str, field=0, "toto")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{* TO toto}", - r#"(Unbounded TO Excluded(Term(type=Str, field=0, val="toto")))"#, + r#"(Unbounded TO Excluded(Term(type=Str, field=0, "toto")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{titi TO *}", - r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Unbounded)"#, + r#"(Excluded(Term(type=Str, field=0, "titi")) TO Unbounded)"#, false, ); test_parse_query_to_logical_ast_helper( "signed:{-5 TO 3}", - r#"(Excluded(Term(type=I64, field=2, val=-5)) TO Excluded(Term(type=I64, field=2, val=3)))"#, + r#"(Excluded(Term(type=I64, field=2, -5)) TO Excluded(Term(type=I64, field=2, 3)))"#, false, ); test_parse_query_to_logical_ast_helper( "float:{-1.5 TO 1.5}", - r#"(Excluded(Term(type=F64, field=10, val=-1.5)) TO Excluded(Term(type=F64, field=10, val=1.5)))"#, + r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#, false, ); test_parse_query_to_logical_ast_helper("*", "*", false); @@ -1051,27 +1054,27 @@ mod test { pub fn test_parse_query_to_ast_conjunction() { test_parse_query_to_logical_ast_helper( "title:toto", - r#"Term(type=Str, field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, true, ); test_parse_query_to_logical_ast_helper( "+title:toto", - r#"Term(type=Str, field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, true, ); 
@@ -1051,27 +1054,27 @@ mod test {
     pub fn test_parse_query_to_ast_conjunction() {
         test_parse_query_to_logical_ast_helper(
             "title:toto",
-            r#"Term(type=Str, field=0, val="toto")"#,
+            r#"Term(type=Str, field=0, "toto")"#,
             true,
         );
         test_parse_query_to_logical_ast_helper(
             "+title:toto",
-            r#"Term(type=Str, field=0, val="toto")"#,
+            r#"Term(type=Str, field=0, "toto")"#,
             true,
         );
         test_parse_query_to_logical_ast_helper(
             "+title:toto -titi",
-            r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#,
+            r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
             true,
         );
         test_parse_query_to_logical_ast_helper(
             "title:a b",
-            r#"(+Term(type=Str, field=0, val="a") +(Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#,
+            r#"(+Term(type=Str, field=0, "a") +(Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
             true,
         );
         test_parse_query_to_logical_ast_helper(
             "title:\"a b\"",
-            r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#,
+            r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
             true,
         );
     }
@@ -1080,7 +1083,7 @@ mod test {
     pub fn test_query_parser_hyphen() {
         test_parse_query_to_logical_ast_helper(
             "title:www-form-encoded",
-            r#""[(0, Term(type=Str, field=0, val="www")), (1, Term(type=Str, field=0, val="form")), (2, Term(type=Str, field=0, val="encoded"))]""#,
+            r#""[(0, Term(type=Str, field=0, "www")), (1, Term(type=Str, field=0, "form")), (2, Term(type=Str, field=0, "encoded"))]""#,
             false,
         );
     }
@@ -1090,7 +1093,7 @@ mod test {
         for &default_conjunction in &[false, true] {
             test_parse_query_to_logical_ast_helper(
                 "title:a AND title:b",
-                r#"(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b"))"#,
+                r#"(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b"))"#,
                 default_conjunction,
             );
         }
@@ -1101,7 +1104,7 @@ mod test {
         for &default_conjunction in &[false, true] {
             test_parse_query_to_logical_ast_helper(
                 "title:a OR title:b",
-                r#"(Term(type=Str, field=0, val="a") Term(type=Str, field=0, val="b"))"#,
+                r#"(Term(type=Str, field=0, "a") Term(type=Str, field=0, "b"))"#,
                 default_conjunction,
             );
         }
diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs
index f997cdc51..b6c106721 100644
--- a/src/query/term_query/mod.rs
+++ b/src/query/term_query/mod.rs
@@ -174,7 +174,7 @@ mod tests {
         );
         assert_eq!(
             format!("{:?}", term_query),
-            r#"TermQuery(Term(type=Str, field=1, val="hello"))"#
+            r#"TermQuery(Term(type=Str, field=1, "hello"))"#
         );
     }
diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs
index 099f0ed27..eb835c47a 100644
--- a/src/schema/field_entry.rs
+++ b/src/schema/field_entry.rs
@@ -137,6 +137,7 @@ impl FieldEntry {
             FieldType::Str(ref options) => options.is_stored(),
             FieldType::Facet(ref options) => options.is_stored(),
             FieldType::Bytes(ref options) => options.is_stored(),
+            FieldType::JsonObject(ref options) => options.is_stored(),
         }
     }
 }
diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs
index 6b995773a..9b2d51286 100644
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -1,25 +1,33 @@
+use crate::schema::bytes_options::BytesOptions;
+use crate::schema::facet_options::FacetOptions;
+use crate::schema::Facet;
+use crate::schema::IndexRecordOption;
+use crate::schema::JsonObjectOptions;
+use crate::schema::TextFieldIndexing;
+use crate::schema::Value;
+use crate::schema::{IntOptions, TextOptions};
+use crate::tokenizer::PreTokenizedString;
 use chrono::{FixedOffset, Utc};
 use serde::{Deserialize, Serialize};
 use serde_json::Value as JsonValue;
-
-use crate::schema::bytes_options::BytesOptions;
-use crate::schema::facet_options::FacetOptions;
-use crate::schema::{Facet, IndexRecordOption, IntOptions, TextFieldIndexing, TextOptions, Value};
-use crate::tokenizer::PreTokenizedString;
+use thiserror::Error;
 
 /// Possible error that may occur while parsing a field value
 /// At this point the JSON is known to be valid.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Error)]
 pub enum ValueParsingError {
-    /// Encountered a numerical value that overflows or underflow its integer type.
-    OverflowError(String),
-    /// The json node is not of the correct type.
-    /// (e.g. 3 for a `Str` type or `"abc"` for a u64 type)
-    /// Tantivy will try to autocast values.
-    TypeError(String),
-    /// The json node is a string but contains json that is
-    /// not valid base64.
-    InvalidBase64(String),
+    #[error("Overflow error. Expected {expected}, got {json}")]
+    OverflowError {
+        expected: &'static str,
+        json: serde_json::Value,
+    },
+    #[error("Type error. Expected {expected}, got {json}")]
+    TypeError {
+        expected: &'static str,
+        json: serde_json::Value,
+    },
+    #[error("Invalid base64: {base64}")]
+    InvalidBase64 { base64: String },
 }
 
 /// Type of the value that a field can take.
@@ -67,6 +75,18 @@ impl Type {
         *self as u8
     }
 
+    /// Returns a human readable name for this type.
+    pub fn name(&self) -> &'static str {
+        match self {
+            Type::Str => "Str",
+            Type::U64 => "U64",
+            Type::I64 => "I64",
+            Type::F64 => "F64",
+            Type::Date => "Date",
+            Type::Facet => "Facet",
+            Type::Bytes => "Bytes",
+        }
+    }
+
     /// Interprets a 1byte code as a type.
     /// Returns None if the code is invalid.
     pub fn from_code(code: u8) -> Option<Type> {
@@ -104,6 +124,8 @@ pub enum FieldType {
     Facet(FacetOptions),
     /// Bytes (one per document)
     Bytes(BytesOptions),
+    /// Json object
+    JsonObject(JsonObjectOptions),
 }
 
 impl FieldType {
@@ -117,6 +139,9 @@ impl FieldType {
             FieldType::Date(_) => Type::Date,
             FieldType::Facet(_) => Type::Facet,
             FieldType::Bytes(_) => Type::Bytes,
+            FieldType::JsonObject(_) => {
+                unimplemented!()
+            }
         }
     }
 
@@ -130,6 +155,7 @@ impl FieldType {
             FieldType::Date(ref date_options) => date_options.is_indexed(),
             FieldType::Facet(ref _facet_options) => true,
             FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
+            FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(),
         }
     }
 
@@ -146,12 +172,17 @@ impl FieldType {
             | FieldType::Date(ref int_options) => int_options.fieldnorms(),
             FieldType::Facet(_) => false,
             FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
+            FieldType::JsonObject(_) => false,
         }
     }
 
     /// Given a field configuration, return the maximal possible
     /// `IndexRecordOption` available.
     ///
+    /// For a JSON object, this does not necessarily mean that this index
+    /// record option is available for all of its terms:
+    /// non-string terms are indexed with the `Basic` option at most.
+    ///
     /// If the field is not indexed, then returns `None`.
     pub fn get_index_record_option(&self) -> Option<IndexRecordOption> {
         match *self {
@@ -176,6 +207,10 @@ impl FieldType {
                     None
                 }
             }
+            FieldType::JsonObject(ref json_obj_options) => json_obj_options
+                .indexing
+                .as_ref()
+                .map(TextFieldIndexing::index_option),
         }
     }
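With thiserror, each variant now carries a structured payload and formats itself via the `#[error(...)]` attribute. A sketch of the resulting message, assuming `ValueParsingError` keeps a public re-export under `tantivy::schema`:

```rust
use tantivy::schema::ValueParsingError;

fn main() {
    let err = ValueParsingError::OverflowError {
        expected: "u64",
        json: serde_json::json!(-12),
    };
    // Display is derived from the #[error(...)] attribute.
    assert_eq!(err.to_string(), "Overflow error. Expected u64, got -12");
}
```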
\ - {:?}", - field_text, err - )) + chrono::DateTime::parse_from_rfc3339(field_text).map_err(|_err| { + ValueParsingError::TypeError { + expected: "rfc3339 format", + json: json.clone(), + } })?; Ok(Value::Date(dt_with_fixed_tz.with_timezone(&Utc))) } FieldType::Str(_) => Ok(Value::Str(field_text.clone())), - FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => Err( - ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)), - ), + FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => { + Err(ValueParsingError::TypeError { + expected: "an integer", + json: json.clone(), + }) + } FieldType::Facet(_) => Ok(Value::Facet(Facet::from(field_text))), FieldType::Bytes(_) => base64::decode(field_text).map(Value::Bytes).map_err(|_| { - ValueParsingError::InvalidBase64(format!( - "Expected base64 string, got {:?}", - field_text - )) + ValueParsingError::InvalidBase64 { + base64: field_text.clone(), + } + }), + FieldType::JsonObject(_) => Err(ValueParsingError::TypeError { + expected: "a json object", + json: json.clone(), }), }, JsonValue::Number(ref field_val_num) => match *self { @@ -215,30 +255,42 @@ impl FieldType { if let Some(field_val_i64) = field_val_num.as_i64() { Ok(Value::I64(field_val_i64)) } else { - let msg = format!("Expected an i64 int, got {:?}", json); - Err(ValueParsingError::OverflowError(msg)) + Err(ValueParsingError::OverflowError { + expected: "an i64 int", + json: json.clone(), + }) } } FieldType::U64(_) => { if let Some(field_val_u64) = field_val_num.as_u64() { Ok(Value::U64(field_val_u64)) } else { - let msg = format!("Expected a u64 int, got {:?}", json); - Err(ValueParsingError::OverflowError(msg)) + Err(ValueParsingError::OverflowError { + expected: "u64", + json: json.clone(), + }) } } FieldType::F64(_) => { if let Some(field_val_f64) = field_val_num.as_f64() { Ok(Value::F64(field_val_f64)) } else { - let msg = format!("Expected a f64 int, got {:?}", json); - Err(ValueParsingError::OverflowError(msg)) + Err(ValueParsingError::OverflowError { + expected: "a f64", + json: json.clone(), + }) } } FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => { - let msg = format!("Expected a string, got {:?}", json); - Err(ValueParsingError::TypeError(msg)) + Err(ValueParsingError::TypeError { + expected: "a string", + json: json.clone(), + }) } + FieldType::JsonObject(_) => Err(ValueParsingError::TypeError { + expected: "a json object", + json: json.clone(), + }), }, JsonValue::Object(_) => match *self { FieldType::Str(_) => { @@ -247,28 +299,21 @@ impl FieldType { { Ok(Value::PreTokStr(tok_str_val)) } else { - let msg = format!( - "Json value {:?} cannot be translated to PreTokenizedString.", - json - ); - Err(ValueParsingError::TypeError(msg)) + Err(ValueParsingError::TypeError { + expected: "a string or an pretokenized string", + json: json.clone(), + }) } } - _ => { - let msg = format!( - "Json value not supported error {:?}. Expected {:?}", - json, self - ); - Err(ValueParsingError::TypeError(msg)) - } + _ => Err(ValueParsingError::TypeError { + expected: self.value_type().name(), + json: json.clone(), + }), }, - _ => { - let msg = format!( - "Json value not supported error {:?}. 
Expected {:?}", - json, self - ); - Err(ValueParsingError::TypeError(msg)) - } + _ => Err(ValueParsingError::TypeError { + expected: self.value_type().name(), + json: json.clone(), + }), } } } @@ -317,13 +362,13 @@ mod tests { let result = FieldType::Bytes(Default::default()).value_from_json(&json!(521)); match result { - Err(ValueParsingError::TypeError(_)) => {} + Err(ValueParsingError::TypeError { .. }) => {} _ => panic!("Expected parse failure for wrong type"), } let result = FieldType::Bytes(Default::default()).value_from_json(&json!("-")); match result { - Err(ValueParsingError::InvalidBase64(_)) => {} + Err(ValueParsingError::InvalidBase64 { .. }) => {} _ => panic!("Expected parse failure for invalid base64"), } } diff --git a/src/schema/json_object_options.rs b/src/schema/json_object_options.rs new file mode 100644 index 000000000..14d5680ec --- /dev/null +++ b/src/schema/json_object_options.rs @@ -0,0 +1,25 @@ +use serde::{Deserialize, Serialize}; + +use crate::schema::TextFieldIndexing; + +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct JsonObjectOptions { + stored: bool, + // If set to some, int, date, f64 and text will be indexed. + // Text will use the TextFieldIndexing setting for indexing. + pub(crate) indexing: Option, +} + +impl JsonObjectOptions { + pub fn is_stored(&self) -> bool { + self.stored + } + + pub fn is_indexed(&self) -> bool { + self.indexing.is_some() + } + + pub fn get_text_indexing_option(&self) -> Option<&TextFieldIndexing> { + self.indexing.as_ref() + } +} diff --git a/src/schema/json_object_utils.rs b/src/schema/json_object_utils.rs new file mode 100644 index 000000000..e43cc4f2c --- /dev/null +++ b/src/schema/json_object_utils.rs @@ -0,0 +1,25 @@ +use serde_json::Number; + +use crate::schema::Value; + +pub fn infer_type_from_string(string: &str) -> Value { + // TODO can we avoid the copy? + Value::Str(string.to_string()) +} + +pub fn infer_type_from_number(number: &Number) -> Value { + if let Some(num_val) = number.as_u64() { + return Value::U64(num_val); + } + if let Some(num_val) = number.as_i64() { + return Value::I64(num_val); + } + if let Some(num_val) = number.as_f64() { + return Value::F64(num_val); + } + // This should never happen as long as we + // don't use the serde_json feature = "arbitrary_precision". + Value::Str(number.to_string()) +} + +// TODO add unit tests diff --git a/src/schema/mod.rs b/src/schema/mod.rs index e64ccc438..94eb0555f 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -104,7 +104,7 @@ mod document; mod facet; mod facet_options; mod schema; -mod term; +pub(crate) mod term; mod field_entry; mod field_type; @@ -114,11 +114,14 @@ mod bytes_options; mod field; mod index_record_option; mod int_options; +mod json_object_options; mod named_field_document; mod text_options; mod value; mod flags; +mod json_object_utils; +mod term_writer; pub use self::bytes_options::BytesOptions; pub use self::document::Document; @@ -138,6 +141,8 @@ pub use self::term::Term; pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT}; pub use self::value::Value; +pub use self::json_object_options::JsonObjectOptions; + /// Validator for a potential `field_name`. /// Returns true if the name can be use for a field name. 
diff --git a/src/schema/mod.rs b/src/schema/mod.rs
index e64ccc438..94eb0555f 100644
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -104,7 +104,7 @@ mod document;
 mod facet;
 mod facet_options;
 mod schema;
-mod term;
+pub(crate) mod term;
 
 mod field_entry;
 mod field_type;
@@ -114,11 +114,14 @@ mod bytes_options;
 mod field;
 mod index_record_option;
 mod int_options;
+mod json_object_options;
 mod named_field_document;
 mod text_options;
 mod value;
 
 mod flags;
+mod json_object_utils;
+mod term_writer;
 
 pub use self::bytes_options::BytesOptions;
 pub use self::document::Document;
@@ -138,6 +141,8 @@ pub use self::term::Term;
 pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
 pub use self::value::Value;
 
+pub use self::json_object_options::JsonObjectOptions;
+
 /// Validator for a potential `field_name`.
 /// Returns true if the name can be use for a field name.
 ///
diff --git a/src/schema/schema.rs b/src/schema/schema.rs
index 4f62d57e6..b501ee70b 100644
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -666,7 +666,7 @@ mod tests {
             json_err,
             Err(DocParsingError::ValueError(
                 _,
-                ValueParsingError::TypeError(_)
+                ValueParsingError::TypeError { .. }
             ))
         );
     }
@@ -684,7 +684,7 @@ mod tests {
             json_err,
             Err(DocParsingError::ValueError(
                 _,
-                ValueParsingError::OverflowError(_)
+                ValueParsingError::OverflowError { .. }
             ))
         );
     }
@@ -702,7 +702,7 @@ mod tests {
             json_err,
             Err(DocParsingError::ValueError(
                 _,
-                ValueParsingError::OverflowError(_)
+                ValueParsingError::OverflowError { .. }
             ))
         ));
     }
@@ -720,7 +720,7 @@ mod tests {
             json_err,
             Err(DocParsingError::ValueError(
                 _,
-                ValueParsingError::OverflowError(_)
+                ValueParsingError::OverflowError { .. }
             ))
         );
     }
diff --git a/src/schema/term.rs b/src/schema/term.rs
index c6e3cfbb0..a7e9b3730 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -8,8 +8,27 @@ use crate::DateTime;
 
 /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
 ///
+/// A term is encoded as `<field><type_byte><value>`.
+///
+/// - `<field>` is a big endian encoded u32 field id
+/// - `<type_byte>`'s most significant bit expresses whether the term is a json term or not.
+///   The remaining 7 bits are used to encode the type of the value.
+///   If this is a JSON term, the type is the type of the leaf of the json.
+/// - `<value>` is, if this is not a json term, a binary representation specific to the type.
+///   If it is a JSON term, then it is prepended with the path that leads to this leaf value.
 const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
 
+/// The first bit of the `type_byte`
+/// is used to encode whether a Term is for
+/// json or not.
+pub const JSON_MARKER_BIT: u8 = 128u8;
+/// Separates the different segments of
+/// the json path.
+pub const JSON_PATH_SEGMENT_SEP: u8 = 0u8;
+/// Separates the json path and the value in
+/// a JSON term binary representation.
+pub const JSON_END_OF_PATH: u8 = 30u8;
+
 /// Term represents the value that the token can take.
 ///
 /// It actually wraps a `Vec<u8>`.
@@ -164,13 +183,23 @@ where B: AsRef<[u8]>
         Term(data)
     }
 
-    /// Return the type of the term.
-    pub fn typ(&self) -> Type {
+    /// Returns true iff the term is a JSON term.
+    pub fn is_json(&self) -> bool {
+        self.typ_code() & JSON_MARKER_BIT == JSON_MARKER_BIT
+    }
+
+    fn typ_code(&self) -> u8 {
         assert!(
             self.as_slice().len() >= 5,
-            "the type does byte representation is too short"
+            "the byte representation is too short"
         );
-        Type::from_code(self.as_slice()[4]).expect("The term has an invalid type code")
+        self.as_slice()[4]
+    }
+
+    /// Return the type of the term.
+    pub fn typ(&self) -> Type {
+        Type::from_code(self.typ_code() & (JSON_MARKER_BIT - 1))
+            .expect("The term has an invalid type code")
     }
 
     /// Returns the field.
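Concretely, here is the byte layout this encoding yields for field 1, path `attributes.color`, value `"red"` — a crate-internal sketch using the `JsonTermWriter` added at the end of this diff, and assuming `Type::Str`'s code is `b's'` as in the existing `Type` enum (and that `Term::as_slice` is reachable from the test):

```rust
let mut w = JsonTermWriter::with_field(Field::from_field_id(1));
w.push_path_segment("attributes");
w.push_path_segment("color");
w.set_text("red");

let mut expected = vec![0u8, 0, 0, 1]; // <field>: big endian u32 field id = 1
expected.push(128u8 | b's');           // <type_byte>: JSON_MARKER_BIT | Str code
expected.extend_from_slice(b"attributes");
expected.push(0u8);                    // JSON_PATH_SEGMENT_SEP
expected.extend_from_slice(b"color");
expected.push(30u8);                   // JSON_END_OF_PATH
expected.extend_from_slice(b"red");    // <value>
assert_eq!(w.term().as_slice(), &expected[..]);
```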
@@ -189,11 +218,15 @@ where B: AsRef<[u8]>
     }
 
     fn get_fast_type<T: FastValue>(&self) -> Option<T> {
-        if self.typ() != T::to_type() || self.as_slice().len() != FAST_VALUE_TERM_LEN {
+        if self.typ() != T::to_type() {
             return None;
         }
         let mut value_bytes = [0u8; 8];
-        value_bytes.copy_from_slice(self.value_bytes());
+        let bytes = self.value_bytes_without_path();
+        if bytes.len() != 8 {
+            return None;
+        }
+        value_bytes.copy_from_slice(self.value_bytes_without_path());
         let value_u64 = u64::from_be_bytes(value_bytes);
         Some(FastValue::from_u64(value_u64))
     }
@@ -206,6 +239,35 @@ where B: AsRef<[u8]>
         self.get_fast_type::<i64>()
     }
 
+    fn json_path_value(&self) -> Option<&str> {
+        if !self.is_json() {
+            return None;
+        }
+        let value_bytes = self.value_bytes();
+        let pos = value_bytes
+            .iter()
+            .cloned()
+            .position(|b| b == JSON_END_OF_PATH)
+            .expect("Could not find end of path");
+        let json_path =
+            str::from_utf8(&value_bytes[..pos]).expect("JSON value path is invalid utf-8");
+        Some(json_path)
+    }
+
+    fn value_bytes_without_path(&self) -> &[u8] {
+        let value_bytes = self.value_bytes();
+        if self.is_json() {
+            let pos = value_bytes
+                .iter()
+                .cloned()
+                .position(|b| b == JSON_END_OF_PATH)
+                .expect("Could not find end of path");
+            &value_bytes[pos + 1..]
+        } else {
+            value_bytes
+        }
+    }
+
     /// Returns the `f64` value stored in a term.
     ///
     /// Returns None if the term is not of the f64 type, or if the term byte representation
@@ -233,7 +295,7 @@ where B: AsRef<[u8]>
         if self.typ() != Type::Str {
             return None;
         }
-        str::from_utf8(self.value_bytes()).ok()
+        str::from_utf8(self.value_bytes_without_path()).ok()
     }
 
     /// Returns the facet associated with the term.
@@ -247,7 +309,7 @@ where B: AsRef<[u8]>
         if self.typ() != Type::Facet {
             return None;
         }
-        let facet_encode_str = str::from_utf8(self.value_bytes()).ok()?;
+        let facet_encode_str = str::from_utf8(self.value_bytes_without_path()).ok()?;
         Some(Facet::from_encoded_string(facet_encode_str.to_string()))
     }
 
@@ -261,7 +323,7 @@ where B: AsRef<[u8]>
         if self.typ() != Type::Bytes {
             return None;
         }
-        Some(self.value_bytes())
+        Some(self.value_bytes_without_path())
     }
 
     /// Returns the serialized value of the term.
@@ -290,14 +352,24 @@ fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) -> fmt::Result
     Ok(())
 }
 
-impl fmt::Debug for Term {
+impl<B> fmt::Debug for Term<B>
+where
+    B: AsRef<[u8]>,
+{
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let field_id = self.field().field_id();
         let typ = self.typ();
-        write!(f, "Term(type={:?}, field={}, val=", typ, field_id,)?;
+        write!(f, "Term(type={:?}, field={}, ", typ, field_id)?;
+        if let Some(path) = self.json_path_value() {
+            write!(
+                f,
+                "path={}, ",
+                path.replace(std::str::from_utf8(&[JSON_PATH_SEGMENT_SEP]).unwrap(), ".")
+            )?;
+        }
         match typ {
             Type::Str => {
-                let s = str::from_utf8(self.value_bytes()).ok();
+                let s = self.as_str();
                 write_opt(f, s)?;
             }
             Type::U64 => {
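Why `get_fast_type` dropped the fixed `FAST_VALUE_TERM_LEN` check: a JSON fast-value term carries a variable-length path before its 8 value bytes, so only the suffix after `JSON_END_OF_PATH` has a fixed size. A crate-internal sketch using the `JsonTermWriter` below:

```rust
let mut w = JsonTermWriter::with_field(Field::from_field_id(2));
w.push_path_segment("dimensions");
w.push_path_segment("width");
w.set_i64(400);
// The full term is longer than the 13-byte fast-value layout, yet the
// value bytes after the path are still exactly 8, so as_i64() works.
assert_eq!(w.term().as_i64(), Some(400));
```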
diff --git a/src/schema/term_writer.rs b/src/schema/term_writer.rs
new file mode 100644
index 000000000..c9c657e46
--- /dev/null
+++ b/src/schema/term_writer.rs
@@ -0,0 +1,131 @@
+use crate::fastfield::FastValue;
+use crate::schema::{Field, Type};
+use crate::Term;
+
+// Copyright (C) 2021 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+use super::term::JSON_MARKER_BIT;
+
+pub struct JsonTermWriter {
+    buffer: Vec<u8>,
+    path_stack: Vec<usize>,
+}
+
+impl JsonTermWriter {
+    pub fn new() -> Self {
+        let mut buffer = Vec::with_capacity(100);
+        buffer.resize(5, 0);
+        let mut path_stack = Vec::with_capacity(10);
+        path_stack.push(5);
+        Self { buffer, path_stack }
+    }
+
+    pub fn with_field(field: Field) -> Self {
+        let mut json_term_writer: JsonTermWriter = Self::new();
+        json_term_writer.set_field(field);
+        json_term_writer
+    }
+
+    fn trim_to_end_of_path(&mut self) {
+        let end_of_path = *self.path_stack.last().unwrap();
+        self.buffer.resize(end_of_path, 0u8);
+        self.buffer[end_of_path - 1] = super::term::JSON_PATH_SEGMENT_SEP;
+    }
+
+    fn close_path_and_set_type(&mut self, typ: Type) {
+        self.trim_to_end_of_path();
+        self.buffer[4] = JSON_MARKER_BIT | typ.to_code();
+        let end_of_path = self.buffer.len();
+        self.buffer[end_of_path - 1] = super::term::JSON_END_OF_PATH;
+    }
+
+    pub fn push_path_segment(&mut self, segment: &str) {
+        // the path stack should never be empty.
+        self.trim_to_end_of_path();
+        self.buffer.extend_from_slice(segment.as_bytes());
+        self.buffer.push(super::term::JSON_PATH_SEGMENT_SEP);
+        self.path_stack.push(self.buffer.len());
+    }
+
+    pub fn pop_path_segment(&mut self) {
+        self.path_stack.pop();
+        assert!(!self.path_stack.is_empty());
+        self.trim_to_end_of_path();
+    }
+
+    pub fn set_text(&mut self, text: &str) {
+        self.close_path_and_set_type(Type::Str);
+        self.buffer.extend_from_slice(text.as_bytes());
+    }
+
+    pub fn set_i64(&mut self, val: i64) {
+        self.close_path_and_set_type(Type::I64);
+        self.buffer
+            .extend_from_slice(val.to_u64().to_be_bytes().as_slice());
+    }
+
+    pub fn set_field(&mut self, field: Field) {
+        self.buffer[0..4].copy_from_slice(field.field_id().to_be_bytes().as_ref());
+    }
+
+    pub fn term(&self) -> Term<&[u8]> {
+        Term::wrap(&self.buffer[..])
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::schema::Field;
+
+    use super::JsonTermWriter;
+
+    #[test]
+    fn test_json_writer() {
+        let field = Field::from_field_id(1);
+        let mut json_writer = JsonTermWriter::with_field(field);
+        json_writer.push_path_segment("attributes");
+        json_writer.push_path_segment("color");
+        json_writer.set_text("red");
+        assert_eq!(
+            format!("{:?}", json_writer.term()),
+            "Term(type=Str, field=1, path=attributes.color, \"red\")"
+        );
+        json_writer.set_text("blue");
+        assert_eq!(
+            format!("{:?}", json_writer.term()),
+            "Term(type=Str, field=1, path=attributes.color, \"blue\")"
+        );
+        json_writer.pop_path_segment();
+        json_writer.push_path_segment("dimensions");
+        json_writer.push_path_segment("width");
+        json_writer.set_i64(400);
+        assert_eq!(
+            format!("{:?}", json_writer.term()),
+            "Term(type=I64, field=1, path=attributes.dimensions.width, 400)"
+        );
+        json_writer.pop_path_segment();
+        json_writer.push_path_segment("height");
+        json_writer.set_i64(300);
+        assert_eq!(
+            format!("{:?}", json_writer.term()),
+            "Term(type=I64, field=1, path=attributes.dimensions.height, 300)"
+        );
+    }
+}
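Only `set_text` and `set_i64` exist so far. Other fast-value setters would presumably follow the same pattern — a hypothetical sketch, not part of the diff, relying on `FastValue::to_u64`'s order-preserving encoding (hence the `FastValue` import at the top of the file):

```rust
// Hypothetical addition to impl JsonTermWriter:
pub fn set_f64(&mut self, val: f64) {
    self.close_path_and_set_type(Type::F64);
    self.buffer
        .extend_from_slice(val.to_u64().to_be_bytes().as_slice());
}
```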
format!("{:?}", json_writer.term()), + "Term(type=I64, field=1, path=attributes.dimensions.height, 300)" + ); + } +} diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 83b2dd674..45fa3b488 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -46,7 +46,7 @@ impl TextOptions { /// Essentially, should we store the term frequency and/or the positions (See /// [`IndexRecordOption`](./enum.IndexRecordOption.html)). /// - the name of the `Tokenizer` that should be used to process the field. -#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] +#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)] pub struct TextFieldIndexing { record: IndexRecordOption, fieldnorms: bool, diff --git a/src/schema/value.rs b/src/schema/value.rs index df83930d6..495eab85c 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -27,6 +27,7 @@ pub enum Value { Facet(Facet), /// Arbitrarily sized byte array Bytes(Vec), + JsonObject(serde_json::Map), } impl Eq for Value {} @@ -43,6 +44,7 @@ impl Serialize for Value { Value::Date(ref date) => serializer.serialize_str(&date.to_rfc3339()), Value::Facet(ref facet) => facet.serialize(serializer), Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes), + Value::JsonObject(ref obj) => obj.serialize(serializer), } } } @@ -248,6 +250,7 @@ mod binary_serialize { const DATE_CODE: u8 = 5; const F64_CODE: u8 = 6; const EXT_CODE: u8 = 7; + const JSON_OBJ_CODE: u8 = 8; // extended types @@ -296,8 +299,14 @@ mod binary_serialize { BYTES_CODE.serialize(writer)?; bytes.serialize(writer) } + Value::JsonObject(ref map) => { + JSON_OBJ_CODE.serialize(writer)?; + serde_json::to_writer(writer, &map)?; + Ok(()) + } } } + fn deserialize(reader: &mut R) -> io::Result { let type_code = u8::deserialize(reader)?; match type_code { @@ -347,6 +356,10 @@ mod binary_serialize { )), } } + JSON_OBJ_CODE => { + let map = serde_json::from_reader(reader)?; + Ok(Value::JsonObject(map)) + } _ => Err(io::Error::new( io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code),