Added JSON type

JSON field
Paul Masurel
2022-01-05 11:28:00 +09:00
parent 9815067171
commit 61e955039d
19 changed files with 615 additions and 132 deletions

View File

@@ -307,16 +307,16 @@ impl IndexMerger {
}
None => {}
},
FieldType::Str(_) => {
// We don't handle str fast field for the moment
// They can be implemented using what is done
// for facets in the future.
}
FieldType::Bytes(byte_options) => {
if byte_options.is_fast() {
self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
}
}
FieldType::Str(_) | FieldType::JsonObject(_) => {
// We don't handle json / string fast field for the moment
// They can be implemented using what is done
// for facets in the future
}
}
}
Ok(())

View File

@@ -276,6 +276,9 @@ impl SegmentWriter {
postings_writer.subscribe(doc_id, 0u32, term_buffer, indexing_context);
}
}
FieldType::JsonObject(_) => {
unimplemented!()
}
}
}
Ok(())

View File

@@ -0,0 +1,52 @@
use std::io;
use crate::Term;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{Recorder, NothingRecorder};
use crate::postings::stacker::Addr;
use crate::postings::{PostingsWriter, IndexingContext, UnorderedTermId, FieldSerializer};
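/// Postings writer for a JSON field: string leaves are recorded with the
/// field's configured recorder, while every other leaf only records doc ids.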
pub struct JsonPostingsWriter {
str_posting_writer: Box<dyn PostingsWriter>,
non_str_posting_writer: Box<dyn PostingsWriter>,
}
impl JsonPostingsWriter {
pub(crate) fn new<R: Recorder>() -> Self {
JsonPostingsWriter {
str_posting_writer: SpecializedPostingsWriter::<R>::new_boxed(),
non_str_posting_writer: SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
}
}
}
impl PostingsWriter for JsonPostingsWriter {
fn subscribe(
&mut self,
doc: crate::DocId,
pos: u32,
term: &crate::Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId {
let term_type = term.typ();
todo!()
}
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
doc_id_map: Option<&DocIdMapping>,
indexing_context: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
todo!()
}
fn total_num_tokens(&self) -> u64 {
todo!()
}
}

View File

@@ -15,6 +15,7 @@ mod segment_postings;
mod serializer;
mod skip;
mod stacker;
mod json_postings_writer;
mod term_info;
pub use self::block_segment_postings::BlockSegmentPostings;

View File

@@ -1,3 +1,4 @@
use crate::postings::json_postings_writer::JsonPostingsWriter;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{NothingRecorder, TermFrequencyRecorder, TfAndPositionRecorder};
use crate::postings::PostingsWriter;
@@ -49,5 +50,20 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn Postings
| FieldType::Date(_)
| FieldType::Bytes(_)
| FieldType::Facet(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
FieldType::JsonObject(ref json_object_options) => {
Box::new(if let Some(text_indexing_option) = json_object_options.get_text_indexing_option() {
match text_indexing_option.index_option() {
IndexRecordOption::Basic => JsonPostingsWriter::new::<NothingRecorder>(),
IndexRecordOption::WithFreqs => {
JsonPostingsWriter::new::<TermFrequencyRecorder>()
}
IndexRecordOption::WithFreqsAndPositions => {
JsonPostingsWriter::new::<TfAndPositionRecorder>()
}
}
} else {
JsonPostingsWriter::new::<NothingRecorder>()
})
},
}
}

View File

@@ -2,13 +2,12 @@ use std::collections::HashMap;
use std::io;
use std::marker::PhantomData;
use std::ops::Range;
use fnv::FnvHashMap;
use super::stacker::Addr;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::recorder::{BufferLender, Recorder};
use crate::postings::recorder::{BufferLender, Recorder, NothingRecorder};
use crate::postings::{
FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
UnorderedTermId,
@@ -85,6 +84,7 @@ pub(crate) fn serialize_postings(
}
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {}
FieldType::Bytes(_) => {}
FieldType::JsonObject(_) => {}
}
let postings_writer = per_field_postings_writers.get_for_field(field);
@@ -175,29 +175,121 @@ pub(crate) trait PostingsWriter {
fn total_num_tokens(&self) -> u64;
}
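/// Routes each JSON term to one of two specialized writers, depending on the
/// type of the leaf: `Type::Str` terms use the recorder `Rec` dictated by the
/// field's index record option; everything else falls back to the
/// `NothingRecorder` (doc ids only).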
pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
text_postings_writer: SpecializedPostingsWriter<Rec>,
other_postings_writer: SpecializedPostingsWriter<NothingRecorder>,
}
impl<Rec: Recorder> JsonPostingsWriter<Rec> {
pub fn new_boxed() -> Box<dyn PostingsWriter> {
let text_postings_writer: SpecializedPostingsWriter<Rec> = SpecializedPostingsWriter {
total_num_tokens: 0u64,
_recorder_type: PhantomData,
};
let other_postings_writer: SpecializedPostingsWriter<NothingRecorder> =
SpecializedPostingsWriter {
total_num_tokens: 0u64,
_recorder_type: PhantomData,
};
Box::new(JsonPostingsWriter {
text_postings_writer,
other_postings_writer,
})
}
}
impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
fn subscribe(
&mut self,
doc: DocId,
pos: u32,
term: &Term,
ctx: &mut IndexingContext
) -> UnorderedTermId {
// TODO will the unordered term id be correct!?
debug_assert!(term.is_json());
if term.typ() == Type::Str {
self.text_postings_writer
.subscribe(doc, pos, term, ctx)
} else {
self.other_postings_writer
.subscribe(doc, pos, term, ctx)
}
}
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs {
if term.typ() == Type::Str {
SpecializedPostingsWriter::<Rec>::serialize_one_term(
term,
*addr,
doc_id_map,
&mut buffer_lender,
ctx,
serializer,
)?;
} else {
SpecializedPostingsWriter::<NothingRecorder>::serialize_one_term(
term,
*addr,
doc_id_map,
&mut buffer_lender,
ctx,
serializer,
)?;
}
}
Ok(())
}
fn total_num_tokens(&self) -> u64 {
self.text_postings_writer.total_num_tokens() + self.other_postings_writer.total_num_tokens()
}
}
/// The `SpecializedPostingsWriter` is just here to avoid dynamic
/// dispatch when accessing the recorder.
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
#[derive(Default)]
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
total_num_tokens: u64,
_recorder_type: PhantomData<Rec>,
}
impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
/// constructor
pub fn new() -> SpecializedPostingsWriter<Rec> {
SpecializedPostingsWriter {
impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
pub fn new_boxed() -> Box<dyn PostingsWriter> {
let new_specialized_posting_writer: Self = Self {
total_num_tokens: 0u64,
_recorder_type: PhantomData,
}
};
Box::new(new_specialized_posting_writer)
}
/// Builds a `SpecializedPostingsWriter` storing its data in a memory arena.
pub fn new_boxed() -> Box<dyn PostingsWriter> {
Box::new(SpecializedPostingsWriter::<Rec>::new())
#[inline]
fn serialize_one_term(
term: &Term<&[u8]>,
addr: Addr,
doc_id_map: Option<&DocIdMapping>,
buffer_lender: &mut BufferLender,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let recorder: Rec = ctx.arena.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
serializer.new_term(term.value_bytes(), term_doc_freq)?;
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
serializer.close_term()?;
Ok(())
}
}
impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
fn subscribe(
&mut self,
doc: DocId,
@@ -233,21 +325,19 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
doc_id_map: Option<&DocIdMapping>,
indexing_context: &IndexingContext,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs {
let recorder: Rec = indexing_context.term_index.read(*addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
serializer.new_term(term.value_bytes(), term_doc_freq)?;
recorder.serialize(
&indexing_context.arena,
Self::serialize_one_term(
term,
*addr,
doc_id_map,
serializer,
&mut buffer_lender,
);
serializer.close_term()?;
ctx,
serializer,
)?;
}
Ok(())
}

View File

@@ -76,7 +76,7 @@ impl InvertedIndexSerializer {
field: Field,
total_num_tokens: u64,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'_>> {
) -> io::Result<FieldSerializer> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
@@ -203,6 +203,7 @@ impl<'a> FieldSerializer<'a> {
self.current_term_info.doc_freq += 1;
self.postings_serializer.write_doc(doc_id, term_freq);
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
assert_eq!(term_freq as usize, position_deltas.len());
positions_serializer.write_positions_delta(position_deltas);
}
}

View File

@@ -372,6 +372,9 @@ impl QueryParser {
let term = Term::from_field_bytes(field, &bytes);
Ok(vec![(0, term)])
}
FieldType::JsonObject(_) => {
unimplemented!()
}
}
}
@@ -661,7 +664,7 @@ mod test {
let query = query_parser.parse_query("facet:/root/branch/leaf").unwrap();
assert_eq!(
format!("{:?}", query),
r#"TermQuery(Term(type=Facet, field=11, val="/root/branch/leaf"))"#
r#"TermQuery(Term(type=Facet, field=11, "/root/branch/leaf"))"#
);
}
@@ -674,7 +677,7 @@ mod test {
let query = query_parser.parse_query("text:hello").unwrap();
assert_eq!(
format!("{:?}", query),
r#"Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2)"#
r#"Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2)"#
);
}
@@ -701,7 +704,7 @@ mod test {
let query = query_parser.parse_query("text:hello^2").unwrap();
assert_eq!(
format!("{:?}", query),
r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2), boost=2)"#
r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2), boost=2)"#
);
}
@@ -736,7 +739,7 @@ mod test {
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper(
"nottokenized:\"wordone wordtwo\"",
r#"Term(type=Str, field=7, val="wordone wordtwo")"#,
r#"Term(type=Str, field=7, "wordone wordtwo")"#,
false,
);
}
@@ -779,7 +782,7 @@ mod test {
.is_ok());
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term(type=U64, field=3, val=2324)",
"Term(type=U64, field=3, 2324)",
false,
);
@@ -806,7 +809,7 @@ mod test {
fn test_parse_bytes() {
test_parse_query_to_logical_ast_helper(
"bytes:YnVidQ==",
"Term(type=Bytes, field=12, val=[98, 117, 98, 117])",
"Term(type=Bytes, field=12, [98, 117, 98, 117])",
false,
);
}
@@ -821,7 +824,7 @@ mod test {
fn test_parse_bytes_phrase() {
test_parse_query_to_logical_ast_helper(
"bytes:\"YnVidQ==\"",
"Term(type=Bytes, field=12, val=[98, 117, 98, 117])",
"Term(type=Bytes, field=12, [98, 117, 98, 117])",
false,
);
}
@@ -837,12 +840,12 @@ mod test {
fn test_parse_query_to_ast_ab_c() {
test_parse_query_to_logical_ast_helper(
"(+title:a +title:b) title:c",
r#"((+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) Term(type=Str, field=0, val="c"))"#,
r#"((+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) Term(type=Str, field=0, "c"))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"(+title:a +title:b) title:c",
r#"(+(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) +Term(type=Str, field=0, val="c"))"#,
r#"(+(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) +Term(type=Str, field=0, "c"))"#,
true,
);
}
@@ -851,17 +854,17 @@ mod test {
pub fn test_parse_query_to_ast_single_term() {
test_parse_query_to_logical_ast_helper(
"title:toto",
r#"Term(type=Str, field=0, val="toto")"#,
r#"Term(type=Str, field=0, "toto")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
r#"Term(type=Str, field=0, val="toto")"#,
r#"Term(type=Str, field=0, "toto")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#,
r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
false,
);
}
@@ -878,12 +881,12 @@ mod test {
pub fn test_parse_query_to_ast_two_terms() {
test_parse_query_to_logical_ast_helper(
"title:a b",
r#"(Term(type=Str, field=0, val="a") (Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#,
r#"(Term(type=Str, field=0, "a") (Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
r#"title:"a b""#,
r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#,
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
false,
);
}
@@ -892,37 +895,37 @@ mod test {
pub fn test_parse_query_to_ast_ranges() {
test_parse_query_to_logical_ast_helper(
"title:[a TO b]",
r#"(Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b")))"#,
r#"(Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"[a TO b]",
r#"((Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b"))) (Included(Term(type=Str, field=1, val="a")) TO Included(Term(type=Str, field=1, val="b"))))"#,
r#"((Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b"))) (Included(Term(type=Str, field=1, "a")) TO Included(Term(type=Str, field=1, "b"))))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO toto}",
r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Excluded(Term(type=Str, field=0, val="toto")))"#,
r#"(Excluded(Term(type=Str, field=0, "titi")) TO Excluded(Term(type=Str, field=0, "toto")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{* TO toto}",
r#"(Unbounded TO Excluded(Term(type=Str, field=0, val="toto")))"#,
r#"(Unbounded TO Excluded(Term(type=Str, field=0, "toto")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO *}",
r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Unbounded)"#,
r#"(Excluded(Term(type=Str, field=0, "titi")) TO Unbounded)"#,
false,
);
test_parse_query_to_logical_ast_helper(
"signed:{-5 TO 3}",
r#"(Excluded(Term(type=I64, field=2, val=-5)) TO Excluded(Term(type=I64, field=2, val=3)))"#,
r#"(Excluded(Term(type=I64, field=2, -5)) TO Excluded(Term(type=I64, field=2, 3)))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"float:{-1.5 TO 1.5}",
r#"(Excluded(Term(type=F64, field=10, val=-1.5)) TO Excluded(Term(type=F64, field=10, val=1.5)))"#,
r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#,
false,
);
test_parse_query_to_logical_ast_helper("*", "*", false);
@@ -1051,27 +1054,27 @@ mod test {
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper(
"title:toto",
r#"Term(type=Str, field=0, val="toto")"#,
r#"Term(type=Str, field=0, "toto")"#,
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
r#"Term(type=Str, field=0, val="toto")"#,
r#"Term(type=Str, field=0, "toto")"#,
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#,
r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"title:a b",
r#"(+Term(type=Str, field=0, val="a") +(Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#,
r#"(+Term(type=Str, field=0, "a") +(Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#,
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
true,
);
}
@@ -1080,7 +1083,7 @@ mod test {
pub fn test_query_parser_hyphen() {
test_parse_query_to_logical_ast_helper(
"title:www-form-encoded",
r#""[(0, Term(type=Str, field=0, val="www")), (1, Term(type=Str, field=0, val="form")), (2, Term(type=Str, field=0, val="encoded"))]""#,
r#""[(0, Term(type=Str, field=0, "www")), (1, Term(type=Str, field=0, "form")), (2, Term(type=Str, field=0, "encoded"))]""#,
false,
);
}
@@ -1090,7 +1093,7 @@ mod test {
for &default_conjunction in &[false, true] {
test_parse_query_to_logical_ast_helper(
"title:a AND title:b",
r#"(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b"))"#,
r#"(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b"))"#,
default_conjunction,
);
}
@@ -1101,7 +1104,7 @@ mod test {
for &default_conjunction in &[false, true] {
test_parse_query_to_logical_ast_helper(
"title:a OR title:b",
r#"(Term(type=Str, field=0, val="a") Term(type=Str, field=0, val="b"))"#,
r#"(Term(type=Str, field=0, "a") Term(type=Str, field=0, "b"))"#,
default_conjunction,
);
}

View File

@@ -174,7 +174,7 @@ mod tests {
);
assert_eq!(
format!("{:?}", term_query),
r#"TermQuery(Term(type=Str, field=1, val="hello"))"#
r#"TermQuery(Term(type=Str, field=1, "hello"))"#
);
}

View File

@@ -137,6 +137,7 @@ impl FieldEntry {
FieldType::Str(ref options) => options.is_stored(),
FieldType::Facet(ref options) => options.is_stored(),
FieldType::Bytes(ref options) => options.is_stored(),
FieldType::JsonObject(ref options) => options.is_stored(),
}
}
}

View File

@@ -1,25 +1,33 @@
use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::Facet;
use crate::schema::IndexRecordOption;
use crate::schema::JsonObjectOptions;
use crate::schema::TextFieldIndexing;
use crate::schema::Value;
use crate::schema::{IntOptions, TextOptions};
use crate::tokenizer::PreTokenizedString;
use chrono::{FixedOffset, Utc};
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::{Facet, IndexRecordOption, IntOptions, TextFieldIndexing, TextOptions, Value};
use crate::tokenizer::PreTokenizedString;
use thiserror::Error;
/// Possible error that may occur while parsing a field value
/// At this point the JSON is known to be valid.
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Error)]
pub enum ValueParsingError {
/// Encountered a numerical value that overflows or underflows its integer type.
OverflowError(String),
/// The json node is not of the correct type.
/// (e.g. 3 for a `Str` type or `"abc"` for a u64 type)
/// Tantivy will try to autocast values.
TypeError(String),
/// The json node is a string whose content is not valid base64.
InvalidBase64(String),
#[error("Overflow error. Expected {expected}, got {json}")]
OverflowError {
expected: &'static str,
json: serde_json::Value,
},
#[error("Type error. Expected {expected}, got {json}")]
TypeError {
expected: &'static str,
json: serde_json::Value,
},
#[error("Invalid base64: {base64}")]
InvalidBase64 { base64: String },
}
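Since the enum now derives `thiserror::Error`, the `#[error(...)]` attributes double as the `Display` implementation. A minimal sketch of the resulting message (the json is rendered through `serde_json::Value`'s compact `Display`):

let err = ValueParsingError::OverflowError {
    expected: "u64",
    json: serde_json::json!(-1),
};
assert_eq!(err.to_string(), "Overflow error. Expected u64, got -1");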
/// Type of the value that a field can take.
@@ -67,6 +75,18 @@ impl Type {
*self as u8
}
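/// Returns a human-readable name for this type.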
pub fn name(&self) -> &'static str {
match self {
Type::Str => "Str",
Type::U64 => "U64",
Type::I64 => "I64",
Type::F64 => "F64",
Type::Date => "Date",
Type::Facet => "Facet",
Type::Bytes => "Bytes",
}
}
/// Interprets a 1byte code as a type.
/// Returns None if the code is invalid.
pub fn from_code(code: u8) -> Option<Self> {
@@ -104,6 +124,8 @@ pub enum FieldType {
Facet(FacetOptions),
/// Bytes (one per document)
Bytes(BytesOptions),
/// Json object
JsonObject(JsonObjectOptions),
}
impl FieldType {
@@ -117,6 +139,9 @@ impl FieldType {
FieldType::Date(_) => Type::Date,
FieldType::Facet(_) => Type::Facet,
FieldType::Bytes(_) => Type::Bytes,
FieldType::JsonObject(_) => {
unimplemented!()
}
}
}
@@ -130,6 +155,7 @@ impl FieldType {
FieldType::Date(ref date_options) => date_options.is_indexed(),
FieldType::Facet(ref _facet_options) => true,
FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(),
}
}
@@ -146,12 +172,17 @@ impl FieldType {
| FieldType::Date(ref int_options) => int_options.fieldnorms(),
FieldType::Facet(_) => false,
FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
FieldType::JsonObject(_) => false,
}
}
/// Given a field configuration, return the maximal possible
/// `IndexRecordOption` available.
///
/// For a JSON object, this does not necessarily mean that this index record
/// option level is available for all terms.
/// (Non-string terms have the `Basic` indexing option at most.)
///
/// If the field is not indexed, then returns `None`.
pub fn get_index_record_option(&self) -> Option<IndexRecordOption> {
match *self {
@@ -176,6 +207,10 @@ impl FieldType {
None
}
}
FieldType::JsonObject(ref json_obj_options) => json_obj_options
.indexing
.as_ref()
.map(TextFieldIndexing::index_option),
}
}
@@ -189,25 +224,30 @@ impl FieldType {
JsonValue::String(ref field_text) => match *self {
FieldType::Date(_) => {
let dt_with_fixed_tz: chrono::DateTime<FixedOffset> =
chrono::DateTime::parse_from_rfc3339(field_text).map_err(|err| {
ValueParsingError::TypeError(format!(
"Failed to parse date from JSON. Expected rfc3339 format, got {}. \
{:?}",
field_text, err
))
chrono::DateTime::parse_from_rfc3339(field_text).map_err(|_err| {
ValueParsingError::TypeError {
expected: "rfc3339 format",
json: json.clone(),
}
})?;
Ok(Value::Date(dt_with_fixed_tz.with_timezone(&Utc)))
}
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => Err(
ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)),
),
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
Err(ValueParsingError::TypeError {
expected: "an integer",
json: json.clone(),
})
}
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(field_text))),
FieldType::Bytes(_) => base64::decode(field_text).map(Value::Bytes).map_err(|_| {
ValueParsingError::InvalidBase64(format!(
"Expected base64 string, got {:?}",
field_text
))
ValueParsingError::InvalidBase64 {
base64: field_text.clone(),
}
}),
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
expected: "a json object",
json: json.clone(),
}),
},
JsonValue::Number(ref field_val_num) => match *self {
@@ -215,30 +255,42 @@ impl FieldType {
if let Some(field_val_i64) = field_val_num.as_i64() {
Ok(Value::I64(field_val_i64))
} else {
let msg = format!("Expected an i64 int, got {:?}", json);
Err(ValueParsingError::OverflowError(msg))
Err(ValueParsingError::OverflowError {
expected: "an i64 int",
json: json.clone(),
})
}
}
FieldType::U64(_) => {
if let Some(field_val_u64) = field_val_num.as_u64() {
Ok(Value::U64(field_val_u64))
} else {
let msg = format!("Expected a u64 int, got {:?}", json);
Err(ValueParsingError::OverflowError(msg))
Err(ValueParsingError::OverflowError {
expected: "u64",
json: json.clone(),
})
}
}
FieldType::F64(_) => {
if let Some(field_val_f64) = field_val_num.as_f64() {
Ok(Value::F64(field_val_f64))
} else {
let msg = format!("Expected a f64 int, got {:?}", json);
Err(ValueParsingError::OverflowError(msg))
Err(ValueParsingError::OverflowError {
expected: "a f64",
json: json.clone(),
})
}
}
FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => {
let msg = format!("Expected a string, got {:?}", json);
Err(ValueParsingError::TypeError(msg))
Err(ValueParsingError::TypeError {
expected: "a string",
json: json.clone(),
})
}
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
expected: "a json object",
json: json.clone(),
}),
},
JsonValue::Object(_) => match *self {
FieldType::Str(_) => {
@@ -247,28 +299,21 @@ impl FieldType {
{
Ok(Value::PreTokStr(tok_str_val))
} else {
let msg = format!(
"Json value {:?} cannot be translated to PreTokenizedString.",
json
);
Err(ValueParsingError::TypeError(msg))
Err(ValueParsingError::TypeError {
expected: "a string or an pretokenized string",
json: json.clone(),
})
}
}
_ => {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json, self
);
Err(ValueParsingError::TypeError(msg))
}
_ => Err(ValueParsingError::TypeError {
expected: self.value_type().name(),
json: json.clone(),
}),
},
_ => {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json, self
);
Err(ValueParsingError::TypeError(msg))
}
_ => Err(ValueParsingError::TypeError {
expected: self.value_type().name(),
json: json.clone(),
}),
}
}
}
@@ -317,13 +362,13 @@ mod tests {
let result = FieldType::Bytes(Default::default()).value_from_json(&json!(521));
match result {
Err(ValueParsingError::TypeError(_)) => {}
Err(ValueParsingError::TypeError { .. }) => {}
_ => panic!("Expected parse failure for wrong type"),
}
let result = FieldType::Bytes(Default::default()).value_from_json(&json!("-"));
match result {
Err(ValueParsingError::InvalidBase64(_)) => {}
Err(ValueParsingError::InvalidBase64 { .. }) => {}
_ => panic!("Expected parse failure for invalid base64"),
}
}

View File

@@ -0,0 +1,25 @@
use serde::{Deserialize, Serialize};
use crate::schema::TextFieldIndexing;
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct JsonObjectOptions {
stored: bool,
// If set to `Some`, ints, dates, f64s and text will be indexed.
// Text will use the TextFieldIndexing setting for indexing.
pub(crate) indexing: Option<TextFieldIndexing>,
}
impl JsonObjectOptions {
pub fn is_stored(&self) -> bool {
self.stored
}
pub fn is_indexed(&self) -> bool {
self.indexing.is_some()
}
pub fn get_text_indexing_option(&self) -> Option<&TextFieldIndexing> {
self.indexing.as_ref()
}
}
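As the derives above suggest, the options are meant to be (de)serialized as part of the schema. A minimal sketch, assuming no serde renames (none appear in this commit):

let opts: JsonObjectOptions =
    serde_json::from_str(r#"{ "stored": true, "indexing": null }"#).unwrap();
assert!(opts.is_stored());
assert!(!opts.is_indexed());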

View File

@@ -0,0 +1,25 @@
use serde_json::Number;
use crate::schema::Value;
pub fn infer_type_from_string(string: &str) -> Value {
// TODO can we avoid the copy?
Value::Str(string.to_string())
}
pub fn infer_type_from_number(number: &Number) -> Value {
if let Some(num_val) = number.as_u64() {
return Value::U64(num_val);
}
if let Some(num_val) = number.as_i64() {
return Value::I64(num_val);
}
if let Some(num_val) = number.as_f64() {
return Value::F64(num_val);
}
// This should never happen as long as we
// don't use the serde_json feature = "arbitrary_precision".
Value::Str(number.to_string())
}
// TODO add unit tests
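A sketch of the kind of unit test the TODO above calls for, relying on `Value`'s existing `PartialEq`/`Debug` impls. The branch order matters: `as_u64()` is tried before `as_i64()`, so non-negative integers always come back as `U64`:

#[cfg(test)]
mod tests {
    use serde_json::Number;
    use super::infer_type_from_number;
    use crate::schema::Value;

    #[test]
    fn test_infer_type_from_number() {
        // Non-negative integers hit the as_u64() branch first.
        assert_eq!(infer_type_from_number(&Number::from(4u64)), Value::U64(4));
        // Negative integers fall through to as_i64().
        assert_eq!(infer_type_from_number(&Number::from(-4i64)), Value::I64(-4));
        // Non-integers fall through to as_f64().
        assert_eq!(
            infer_type_from_number(&Number::from_f64(4.5).unwrap()),
            Value::F64(4.5)
        );
    }
}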

View File

@@ -104,7 +104,7 @@ mod document;
mod facet;
mod facet_options;
mod schema;
mod term;
pub(crate) mod term;
mod field_entry;
mod field_type;
@@ -114,11 +114,14 @@ mod bytes_options;
mod field;
mod index_record_option;
mod int_options;
mod json_object_options;
mod named_field_document;
mod text_options;
mod value;
mod flags;
mod json_object_utils;
mod term_writer;
pub use self::bytes_options::BytesOptions;
pub use self::document::Document;
@@ -138,6 +141,8 @@ pub use self::term::Term;
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
pub use self::value::Value;
pub use self::json_object_options::JsonObjectOptions;
/// Validator for a potential `field_name`.
/// Returns true if the name can be used as a field name.
///

View File

@@ -666,7 +666,7 @@ mod tests {
json_err,
Err(DocParsingError::ValueError(
_,
ValueParsingError::TypeError(_)
ValueParsingError::TypeError { .. }
))
);
}
@@ -684,7 +684,7 @@ mod tests {
json_err,
Err(DocParsingError::ValueError(
_,
ValueParsingError::OverflowError(_)
ValueParsingError::OverflowError { .. }
))
);
}
@@ -702,7 +702,7 @@ mod tests {
json_err,
Err(DocParsingError::ValueError(
_,
ValueParsingError::OverflowError(_)
ValueParsingError::OverflowError { .. }
))
));
}
@@ -720,7 +720,7 @@ mod tests {
json_err,
Err(DocParsingError::ValueError(
_,
ValueParsingError::OverflowError(_)
ValueParsingError::OverflowError { .. }
))
);
}

View File

@@ -8,8 +8,27 @@ use crate::DateTime;
/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
/// <field> + <type_byte> + <value>
///
/// - <field> is a big endian encoded u32 field id
/// - <type_byte>'s most significant bit expresses whether the term is a json term or not.
/// The remaining 7 bits are used to encode the type of the value.
/// If this is a JSON term, the type is the type of the leaf of the json.
///
/// - <value> is, if this is not a json term, a binary representation specific to the type.
/// If it is a JSON term, then it is prepended with the path that leads to this leaf value.
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
/// The first bit of the `type_byte`
/// is used to encode whether a Term is for
/// json or not.
pub const JSON_MARKER_BIT: u8 = 128u8;
/// Separates the different segments of
/// the json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 0u8;
/// Separates the json path and the value in
/// a JSON term binary representation.
pub const JSON_END_OF_PATH: u8 = 30u8;
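To make the layout concrete, here is a sketch of the byte buffer for a JSON term encoding attributes.color = "red" on field 1 (the concrete value of `Type::Str`'s code is left symbolic, as it does not appear in this diff):

// [0..4]  0x00 0x00 0x00 0x01                        <field>: big endian u32 field id
// [4]     JSON_MARKER_BIT | Type::Str.to_code()      <type_byte>
// [5..]   b"attributes" 0x00 b"color" 0x1e b"red"    <value>
//         path segments separated by JSON_PATH_SEGMENT_SEP (0u8),
//         then JSON_END_OF_PATH (30u8 = 0x1e), then the raw value bytes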
/// Term represents the value that the token can take.
///
/// It actually wraps a `Vec<u8>`.
@@ -164,13 +183,23 @@ where B: AsRef<[u8]>
Term(data)
}
/// Return the type of the term.
pub fn typ(&self) -> Type {
/// Returns true iff the term is a JSON term.
pub fn is_json(&self) -> bool {
self.typ_code() & JSON_MARKER_BIT == JSON_MARKER_BIT
}
fn typ_code(&self) -> u8 {
assert!(
self.as_slice().len() >= 5,
"the type does byte representation is too short"
"the byte representation is too short"
);
Type::from_code(self.as_slice()[4]).expect("The term has an invalid type code")
self.as_slice()[4]
}
/// Return the type of the term.
pub fn typ(&self) -> Type {
Type::from_code(self.typ_code() & (JSON_MARKER_BIT - 1))
.expect("The term has an invalid type code")
}
/// Returns the field.
@@ -189,11 +218,15 @@ where B: AsRef<[u8]>
}
fn get_fast_type<T: FastValue>(&self) -> Option<T> {
if self.typ() != T::to_type() || self.as_slice().len() != FAST_VALUE_TERM_LEN {
if self.typ() != T::to_type() {
return None;
}
let mut value_bytes = [0u8; 8];
value_bytes.copy_from_slice(self.value_bytes());
let bytes = self.value_bytes_without_path();
if bytes.len() != 8 {
return None;
}
value_bytes.copy_from_slice(bytes);
let value_u64 = u64::from_be_bytes(value_bytes);
Some(FastValue::from_u64(value_u64))
}
@@ -206,6 +239,35 @@ where B: AsRef<[u8]>
self.get_fast_type::<i64>()
}
fn json_path_value(&self) -> Option<&str> {
if !self.is_json() {
return None;
}
let value_bytes = self.value_bytes();
let pos = value_bytes
.iter()
.cloned()
.position(|b| b == JSON_END_OF_PATH)
.expect("Could not find end of path");
let json_path =
str::from_utf8(&value_bytes[..pos]).expect("JSON value path is Invalid utf-8");
Some(json_path)
}
fn value_bytes_without_path(&self) -> &[u8] {
let value_bytes = self.value_bytes();
if self.is_json() {
let pos = value_bytes
.iter()
.cloned()
.position(|b| b == JSON_END_OF_PATH)
.expect("Could not find end of path");
&value_bytes[pos + 1..]
} else {
value_bytes
}
}
/// Returns the `f64` value stored in a term.
///
/// Returns None if the term is not of the f64 type, or if the term byte representation
@@ -233,7 +295,7 @@ where B: AsRef<[u8]>
if self.typ() != Type::Str {
return None;
}
str::from_utf8(self.value_bytes()).ok()
str::from_utf8(self.value_bytes_without_path()).ok()
}
/// Returns the facet associated with the term.
@@ -247,7 +309,7 @@ where B: AsRef<[u8]>
if self.typ() != Type::Facet {
return None;
}
let facet_encode_str = str::from_utf8(self.value_bytes()).ok()?;
let facet_encode_str = str::from_utf8(self.value_bytes_without_path()).ok()?;
Some(Facet::from_encoded_string(facet_encode_str.to_string()))
}
@@ -261,7 +323,7 @@ where B: AsRef<[u8]>
if self.typ() != Type::Bytes {
return None;
}
Some(self.value_bytes())
Some(self.value_bytes_without_path())
}
/// Returns the serialized value of the term.
@@ -290,14 +352,24 @@ fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) ->
Ok(())
}
impl fmt::Debug for Term {
impl<B> fmt::Debug for Term<B>
where
B: AsRef<[u8]>,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let field_id = self.field().field_id();
let typ = self.typ();
write!(f, "Term(type={:?}, field={}, val=", typ, field_id,)?;
write!(f, "Term(type={:?}, field={}, ", typ, field_id)?;
if let Some(path) = self.json_path_value() {
write!(
f,
"path={}, ",
path.replace(std::str::from_utf8(&[JSON_PATH_SEGMENT_SEP]).unwrap(), ".")
)?;
}
match typ {
Type::Str => {
let s = str::from_utf8(self.value_bytes()).ok();
let s = self.as_str();
write_opt(f, s)?;
}
Type::U64 => {
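As a sanity check that these accessors compose with the JSON layout, here is a sketch using the `JsonTermWriter` from the new `term_writer.rs` below (crate-internal code, since the writer is not re-exported in this commit):

let mut writer = JsonTermWriter::with_field(Field::from_field_id(1));
writer.push_path_segment("price");
writer.set_i64(42);
// typ() masks out JSON_MARKER_BIT, so the term still reads as an I64 term,
// and get_fast_type() only looks at the bytes after JSON_END_OF_PATH.
assert_eq!(writer.term().as_i64(), Some(42));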

src/schema/term_writer.rs Normal file
View File

@@ -0,0 +1,131 @@
// Copyright (C) 2021 Quickwit, Inc.
//
// Quickwit is offered under the AGPL v3.0 and as commercial software.
// For commercial licensing, contact us at hello@quickwit.io.
//
// AGPL:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
use crate::fastfield::FastValue;
use crate::schema::{Field, Type};
use crate::Term;
use super::term::JSON_MARKER_BIT;
pub struct JsonTermWriter {
buffer: Vec<u8>,
path_stack: Vec<usize>,
}
impl JsonTermWriter {
pub fn new() -> Self {
let mut buffer = Vec::with_capacity(100);
buffer.resize(5, 0);
let mut path_stack = Vec::with_capacity(10);
path_stack.push(5);
Self { buffer, path_stack }
}
pub fn with_field(field: Field) -> Self {
let mut json_term_writer: JsonTermWriter = Self::new();
json_term_writer.set_field(field);
json_term_writer
}
fn trim_to_end_of_path(&mut self) {
let end_of_path = *self.path_stack.last().unwrap();
self.buffer.resize(end_of_path, 0u8);
self.buffer[end_of_path - 1] = super::term::JSON_PATH_SEGMENT_SEP;
}
fn close_path_and_set_type(&mut self, typ: Type) {
self.trim_to_end_of_path();
self.buffer[4] = JSON_MARKER_BIT | typ.to_code();
let end_of_path = self.buffer.len();
self.buffer[end_of_path - 1] = super::term::JSON_END_OF_PATH;
}
pub fn push_path_segment(&mut self, segment: &str) {
// the path stack should never be empty.
self.trim_to_end_of_path();
self.buffer.extend_from_slice(segment.as_bytes());
self.buffer.push(super::term::JSON_PATH_SEGMENT_SEP);
self.path_stack.push(self.buffer.len());
}
pub fn pop_path_segment(&mut self) {
self.path_stack.pop();
assert!(!self.path_stack.is_empty());
self.trim_to_end_of_path();
}
pub fn set_text(&mut self, text: &str) {
self.close_path_and_set_type(Type::Str);
self.buffer.extend_from_slice(text.as_bytes());
}
pub fn set_i64(&mut self, val: i64) {
self.close_path_and_set_type(Type::I64);
self.buffer
.extend_from_slice(val.to_u64().to_be_bytes().as_slice());
}
pub fn set_field(&mut self, field: Field) {
self.buffer[0..4].copy_from_slice(field.field_id().to_be_bytes().as_ref());
}
pub fn term(&self) -> Term<&[u8]> {
Term::wrap(&self.buffer[..])
}
}
#[cfg(test)]
mod tests {
use crate::schema::Field;
use super::JsonTermWriter;
#[test]
fn test_json_writer() {
let field = Field::from_field_id(1);
let mut json_writer = JsonTermWriter::with_field(field);
json_writer.push_path_segment("attributes");
json_writer.push_path_segment("color");
json_writer.set_text("red");
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Str, field=1, path=attributes.color, \"red\")"
);
json_writer.set_text("blue");
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Str, field=1, path=attributes.color, \"blue\")"
);
json_writer.pop_path_segment();
json_writer.push_path_segment("dimensions");
json_writer.push_path_segment("width");
json_writer.set_i64(400);
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=I64, field=1, path=attributes.dimensions.width, 400)"
);
json_writer.pop_path_segment();
json_writer.push_path_segment("height");
json_writer.set_i64(300);
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=I64, field=1, path=attributes.dimensions.height, 300)"
);
}
}

View File

@@ -46,7 +46,7 @@ impl TextOptions {
/// Essentially, should we store the term frequency and/or the positions (See
/// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
/// - the name of the `Tokenizer` that should be used to process the field.
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
pub struct TextFieldIndexing {
record: IndexRecordOption,
fieldnorms: bool,

View File

@@ -27,6 +27,7 @@ pub enum Value {
Facet(Facet),
/// Arbitrarily sized byte array
Bytes(Vec<u8>),
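/// Json object value (`serde_json::Map<String, serde_json::Value>`)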
JsonObject(serde_json::Map<String, serde_json::Value>),
}
impl Eq for Value {}
@@ -43,6 +44,7 @@ impl Serialize for Value {
Value::Date(ref date) => serializer.serialize_str(&date.to_rfc3339()),
Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
Value::JsonObject(ref obj) => obj.serialize(serializer),
}
}
}
@@ -248,6 +250,7 @@ mod binary_serialize {
const DATE_CODE: u8 = 5;
const F64_CODE: u8 = 6;
const EXT_CODE: u8 = 7;
const JSON_OBJ_CODE: u8 = 8;
// extended types
@@ -296,8 +299,14 @@ mod binary_serialize {
BYTES_CODE.serialize(writer)?;
bytes.serialize(writer)
}
Value::JsonObject(ref map) => {
JSON_OBJ_CODE.serialize(writer)?;
serde_json::to_writer(writer, &map)?;
Ok(())
}
}
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let type_code = u8::deserialize(reader)?;
match type_code {
@@ -347,6 +356,10 @@ mod binary_serialize {
)),
}
}
JSON_OBJ_CODE => {
let map = serde_json::from_reader(reader)?;
Ok(Value::JsonObject(map))
}
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("No field type is associated with code {:?}", type_code),