Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-06 01:02:55 +00:00)

Commit: Added JSON type

JSON field
@@ -307,16 +307,16 @@ impl IndexMerger {
                 }
                 None => {}
             },
-            FieldType::Str(_) => {
-                // We don't handle str fast field for the moment
-                // They can be implemented using what is done
-                // for facets in the future.
-            }
             FieldType::Bytes(byte_options) => {
                 if byte_options.is_fast() {
                     self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
                 }
             }
+            FieldType::Str(_) | FieldType::JsonObject(_) => {
+                // We don't handle json / string fast field for the moment
+                // They can be implemented using what is done
+                // for facets in the future
+            }
         }
     }
     Ok(())
@@ -276,6 +276,9 @@ impl SegmentWriter {
                     postings_writer.subscribe(doc_id, 0u32, term_buffer, indexing_context);
                 }
             }
+            FieldType::JsonObject(_) => {
+                unimplemented!()
+            }
         }
     }
     Ok(())
src/postings/json_postings_writer.rs (new file, 52 lines)
@@ -0,0 +1,52 @@
+use std::io;
+
+use crate::Term;
+use crate::indexer::doc_id_mapping::DocIdMapping;
+use crate::postings::postings_writer::SpecializedPostingsWriter;
+use crate::postings::recorder::{Recorder, NothingRecorder};
+use crate::postings::stacker::Addr;
+use crate::postings::{PostingsWriter, IndexingContext, UnorderedTermId, FieldSerializer};
+
+pub struct JsonPostingsWriter {
+    str_posting_writer: Box<dyn PostingsWriter>,
+    non_str_posting_writer: Box<dyn PostingsWriter>,
+}
+
+impl JsonPostingsWriter {
+    pub(crate) fn new<R: Recorder>() -> Self {
+        JsonPostingsWriter {
+            str_posting_writer: SpecializedPostingsWriter::<R>::new_boxed(),
+            non_str_posting_writer: SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
+        }
+    }
+}
+
+impl PostingsWriter for JsonPostingsWriter {
+    fn subscribe(
+        &mut self,
+        doc: crate::DocId,
+        pos: u32,
+        term: &crate::Term,
+        ctx: &mut IndexingContext,
+    ) -> UnorderedTermId {
+        let term_type = term.typ();
+        todo!()
+    }
+
+    /// The actual serialization format is handled by the `PostingsSerializer`.
+    fn serialize(
+        &self,
+        term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
+        doc_id_map: Option<&DocIdMapping>,
+        indexing_context: &IndexingContext,
+        serializer: &mut FieldSerializer,
+    ) -> io::Result<()> {
+        todo!()
+    }
+
+    fn total_num_tokens(&self) -> u64 {
+        todo!()
+    }
+}
@@ -15,6 +15,7 @@ mod segment_postings;
 mod serializer;
 mod skip;
 mod stacker;
+mod json_postings_writer;
 mod term_info;
 
 pub use self::block_segment_postings::BlockSegmentPostings;
@@ -1,3 +1,4 @@
+use crate::postings::json_postings_writer::JsonPostingsWriter;
 use crate::postings::postings_writer::SpecializedPostingsWriter;
 use crate::postings::recorder::{NothingRecorder, TermFrequencyRecorder, TfAndPositionRecorder};
 use crate::postings::PostingsWriter;
@@ -49,5 +50,20 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn Postings
         | FieldType::Date(_)
         | FieldType::Bytes(_)
         | FieldType::Facet(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
+        FieldType::JsonObject(ref json_object_options) => {
+            Box::new(if let Some(text_indexing_option) = json_object_options.get_text_indexing_option() {
+                match text_indexing_option.index_option() {
+                    IndexRecordOption::Basic => JsonPostingsWriter::new::<NothingRecorder>(),
+                    IndexRecordOption::WithFreqs => {
+                        JsonPostingsWriter::new::<TermFrequencyRecorder>()
+                    }
+                    IndexRecordOption::WithFreqsAndPositions => {
+                        JsonPostingsWriter::new::<TfAndPositionRecorder>()
+                    }
+                }
+            } else {
+                JsonPostingsWriter::new::<NothingRecorder>()
+            })
+        },
     }
 }
@@ -2,13 +2,12 @@ use std::collections::HashMap;
 use std::io;
 use std::marker::PhantomData;
 use std::ops::Range;
 
 use fnv::FnvHashMap;
 
 use super::stacker::Addr;
 use crate::fieldnorm::FieldNormReaders;
 use crate::indexer::doc_id_mapping::DocIdMapping;
-use crate::postings::recorder::{BufferLender, Recorder};
+use crate::postings::recorder::{BufferLender, Recorder, NothingRecorder};
 use crate::postings::{
     FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
     UnorderedTermId,
@@ -85,6 +84,7 @@ pub(crate) fn serialize_postings(
         }
         FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {}
         FieldType::Bytes(_) => {}
+        FieldType::JsonObject(_) => {}
     }
 
     let postings_writer = per_field_postings_writers.get_for_field(field);
@@ -175,29 +175,121 @@ pub(crate) trait PostingsWriter {
     fn total_num_tokens(&self) -> u64;
 }
 
+pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
+    text_postings_writer: SpecializedPostingsWriter<Rec>,
+    other_postings_writer: SpecializedPostingsWriter<NothingRecorder>,
+}
+
+impl<Rec: Recorder> JsonPostingsWriter<Rec> {
+    pub fn new_boxed() -> Box<dyn PostingsWriter> {
+        let text_postings_writer: SpecializedPostingsWriter<Rec> = SpecializedPostingsWriter {
+            total_num_tokens: 0u64,
+            _recorder_type: PhantomData,
+        };
+        let other_postings_writer: SpecializedPostingsWriter<NothingRecorder> =
+            SpecializedPostingsWriter {
+                total_num_tokens: 0u64,
+                _recorder_type: PhantomData,
+            };
+        Box::new(JsonPostingsWriter {
+            text_postings_writer,
+            other_postings_writer,
+        })
+    }
+}
+
+impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
+    fn subscribe(
+        &mut self,
+        doc: DocId,
+        pos: u32,
+        term: &Term,
+        ctx: &mut IndexingContext,
+    ) -> UnorderedTermId {
+        // TODO will the unordered term id be correct!?
+        debug_assert!(term.is_json());
+        if term.typ() == Type::Str {
+            self.text_postings_writer
+                .subscribe(doc, pos, term, ctx)
+        } else {
+            self.other_postings_writer
+                .subscribe(doc, pos, term, ctx)
+        }
+    }
+
+    fn serialize(
+        &self,
+        term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
+        doc_id_map: Option<&DocIdMapping>,
+        ctx: &IndexingContext,
+        serializer: &mut FieldSerializer,
+    ) -> io::Result<()> {
+        let mut buffer_lender = BufferLender::default();
+        for (term, addr, _) in term_addrs {
+            if term.typ() == Type::Str {
+                SpecializedPostingsWriter::<Rec>::serialize_one_term(
+                    term,
+                    *addr,
+                    doc_id_map,
+                    &mut buffer_lender,
+                    ctx,
+                    serializer,
+                )?;
+            } else {
+                SpecializedPostingsWriter::<NothingRecorder>::serialize_one_term(
+                    term,
+                    *addr,
+                    doc_id_map,
+                    &mut buffer_lender,
+                    ctx,
+                    serializer,
+                )?;
+            }
+        }
+        Ok(())
+    }
+
+    fn total_num_tokens(&self) -> u64 {
+        self.text_postings_writer.total_num_tokens() + self.other_postings_writer.total_num_tokens()
+    }
+}
+
 /// The `SpecializedPostingsWriter` is just here to remove dynamic
 /// dispatch to the recorder information.
-pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
+#[derive(Default)]
+pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
     total_num_tokens: u64,
     _recorder_type: PhantomData<Rec>,
 }
 
-impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
-    /// constructor
-    pub fn new() -> SpecializedPostingsWriter<Rec> {
-        SpecializedPostingsWriter {
+impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
+    pub fn new_boxed() -> Box<dyn PostingsWriter> {
+        let new_specialized_posting_writer: Self = Self {
             total_num_tokens: 0u64,
             _recorder_type: PhantomData,
-        }
+        };
+        Box::new(new_specialized_posting_writer)
     }
 
-    /// Builds a `SpecializedPostingsWriter` storing its data in a memory arena.
-    pub fn new_boxed() -> Box<dyn PostingsWriter> {
-        Box::new(SpecializedPostingsWriter::<Rec>::new())
+    #[inline]
+    fn serialize_one_term(
+        term: &Term<&[u8]>,
+        addr: Addr,
+        doc_id_map: Option<&DocIdMapping>,
+        buffer_lender: &mut BufferLender,
+        ctx: &IndexingContext,
+        serializer: &mut FieldSerializer,
+    ) -> io::Result<()> {
+        let recorder: Rec = ctx.arena.read(addr);
+        let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
+        serializer.new_term(term.value_bytes(), term_doc_freq)?;
+        recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
+        serializer.close_term()?;
+        Ok(())
+    }
 }
 
-impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
+impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
     fn subscribe(
         &mut self,
         doc: DocId,
@@ -233,21 +325,19 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
         &self,
         term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
         doc_id_map: Option<&DocIdMapping>,
-        indexing_context: &IndexingContext,
+        ctx: &IndexingContext,
         serializer: &mut FieldSerializer,
     ) -> io::Result<()> {
         let mut buffer_lender = BufferLender::default();
         for (term, addr, _) in term_addrs {
-            let recorder: Rec = indexing_context.term_index.read(*addr);
-            let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
-            serializer.new_term(term.value_bytes(), term_doc_freq)?;
-            recorder.serialize(
-                &indexing_context.arena,
+            Self::serialize_one_term(
+                term,
+                *addr,
                 doc_id_map,
-                serializer,
                 &mut buffer_lender,
-            );
-            serializer.close_term()?;
+                ctx,
+                serializer,
+            )?;
         }
         Ok(())
     }
@@ -76,7 +76,7 @@ impl InvertedIndexSerializer {
         field: Field,
         total_num_tokens: u64,
         fieldnorm_reader: Option<FieldNormReader>,
-    ) -> io::Result<FieldSerializer<'_>> {
+    ) -> io::Result<FieldSerializer> {
         let field_entry: &FieldEntry = self.schema.get_field_entry(field);
         let term_dictionary_write = self.terms_write.for_field(field);
         let postings_write = self.postings_write.for_field(field);
@@ -203,6 +203,7 @@ impl<'a> FieldSerializer<'a> {
         self.current_term_info.doc_freq += 1;
         self.postings_serializer.write_doc(doc_id, term_freq);
         if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
+            assert_eq!(term_freq as usize, position_deltas.len());
            positions_serializer.write_positions_delta(position_deltas);
        }
    }
@@ -372,6 +372,9 @@ impl QueryParser {
                 let term = Term::from_field_bytes(field, &bytes);
                 Ok(vec![(0, term)])
             }
+            FieldType::JsonObject(_) => {
+                unimplemented!()
+            }
         }
     }
@@ -661,7 +664,7 @@ mod test {
         let query = query_parser.parse_query("facet:/root/branch/leaf").unwrap();
         assert_eq!(
             format!("{:?}", query),
-            r#"TermQuery(Term(type=Facet, field=11, val="/root/branch/leaf"))"#
+            r#"TermQuery(Term(type=Facet, field=11, "/root/branch/leaf"))"#
         );
     }
@@ -674,7 +677,7 @@ mod test {
         let query = query_parser.parse_query("text:hello").unwrap();
         assert_eq!(
             format!("{:?}", query),
-            r#"Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2)"#
+            r#"Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2)"#
         );
     }
@@ -701,7 +704,7 @@ mod test {
         let query = query_parser.parse_query("text:hello^2").unwrap();
         assert_eq!(
             format!("{:?}", query),
-            r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2), boost=2)"#
+            r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2), boost=2)"#
         );
     }
@@ -736,7 +739,7 @@ mod test {
     pub fn test_parse_query_untokenized() {
         test_parse_query_to_logical_ast_helper(
             "nottokenized:\"wordone wordtwo\"",
-            r#"Term(type=Str, field=7, val="wordone wordtwo")"#,
+            r#"Term(type=Str, field=7, "wordone wordtwo")"#,
             false,
         );
     }
@@ -779,7 +782,7 @@ mod test {
         .is_ok());
         test_parse_query_to_logical_ast_helper(
             "unsigned:2324",
-            "Term(type=U64, field=3, val=2324)",
+            "Term(type=U64, field=3, 2324)",
             false,
         );
 
@@ -806,7 +809,7 @@ mod test {
     fn test_parse_bytes() {
         test_parse_query_to_logical_ast_helper(
             "bytes:YnVidQ==",
-            "Term(type=Bytes, field=12, val=[98, 117, 98, 117])",
+            "Term(type=Bytes, field=12, [98, 117, 98, 117])",
             false,
         );
     }
@@ -821,7 +824,7 @@ mod test {
     fn test_parse_bytes_phrase() {
         test_parse_query_to_logical_ast_helper(
             "bytes:\"YnVidQ==\"",
-            "Term(type=Bytes, field=12, val=[98, 117, 98, 117])",
+            "Term(type=Bytes, field=12, [98, 117, 98, 117])",
             false,
         );
     }
@@ -837,12 +840,12 @@ mod test {
     fn test_parse_query_to_ast_ab_c() {
         test_parse_query_to_logical_ast_helper(
             "(+title:a +title:b) title:c",
-            r#"((+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) Term(type=Str, field=0, val="c"))"#,
+            r#"((+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) Term(type=Str, field=0, "c"))"#,
             false,
         );
         test_parse_query_to_logical_ast_helper(
             "(+title:a +title:b) title:c",
-            r#"(+(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) +Term(type=Str, field=0, val="c"))"#,
+            r#"(+(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) +Term(type=Str, field=0, "c"))"#,
             true,
         );
     }
@@ -851,17 +854,17 @@ mod test {
     pub fn test_parse_query_to_ast_single_term() {
         test_parse_query_to_logical_ast_helper(
             "title:toto",
-            r#"Term(type=Str, field=0, val="toto")"#,
+            r#"Term(type=Str, field=0, "toto")"#,
             false,
         );
         test_parse_query_to_logical_ast_helper(
             "+title:toto",
-            r#"Term(type=Str, field=0, val="toto")"#,
+            r#"Term(type=Str, field=0, "toto")"#,
             false,
         );
         test_parse_query_to_logical_ast_helper(
             "+title:toto -titi",
-            r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#,
+            r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
             false,
         );
     }
@@ -878,12 +881,12 @@ mod test {
     pub fn test_parse_query_to_ast_two_terms() {
         test_parse_query_to_logical_ast_helper(
             "title:a b",
-            r#"(Term(type=Str, field=0, val="a") (Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#,
+            r#"(Term(type=Str, field=0, "a") (Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
             false,
         );
         test_parse_query_to_logical_ast_helper(
             r#"title:"a b""#,
-            r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#,
+            r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
             false,
         );
     }
@@ -892,37 +895,37 @@ mod test {
     pub fn test_parse_query_to_ast_ranges() {
         test_parse_query_to_logical_ast_helper(
             "title:[a TO b]",
-            r#"(Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b")))"#,
+            r#"(Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b")))"#,
             false,
         );
         test_parse_query_to_logical_ast_helper(
             "[a TO b]",
-            r#"((Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b"))) (Included(Term(type=Str, field=1, val="a")) TO Included(Term(type=Str, field=1, val="b"))))"#,
+            r#"((Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b"))) (Included(Term(type=Str, field=1, "a")) TO Included(Term(type=Str, field=1, "b"))))"#,
             false,
         );
         test_parse_query_to_logical_ast_helper(
             "title:{titi TO toto}",
-            r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Excluded(Term(type=Str, field=0, val="toto")))"#,
+            r#"(Excluded(Term(type=Str, field=0, "titi")) TO Excluded(Term(type=Str, field=0, "toto")))"#,
             false,
         );
         test_parse_query_to_logical_ast_helper(
             "title:{* TO toto}",
-            r#"(Unbounded TO Excluded(Term(type=Str, field=0, val="toto")))"#,
+            r#"(Unbounded TO Excluded(Term(type=Str, field=0, "toto")))"#,
             false,
         );
         test_parse_query_to_logical_ast_helper(
             "title:{titi TO *}",
-            r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Unbounded)"#,
+            r#"(Excluded(Term(type=Str, field=0, "titi")) TO Unbounded)"#,
             false,
         );
         test_parse_query_to_logical_ast_helper(
             "signed:{-5 TO 3}",
-            r#"(Excluded(Term(type=I64, field=2, val=-5)) TO Excluded(Term(type=I64, field=2, val=3)))"#,
+            r#"(Excluded(Term(type=I64, field=2, -5)) TO Excluded(Term(type=I64, field=2, 3)))"#,
             false,
         );
         test_parse_query_to_logical_ast_helper(
             "float:{-1.5 TO 1.5}",
-            r#"(Excluded(Term(type=F64, field=10, val=-1.5)) TO Excluded(Term(type=F64, field=10, val=1.5)))"#,
+            r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#,
             false,
         );
         test_parse_query_to_logical_ast_helper("*", "*", false);
@@ -1051,27 +1054,27 @@ mod test {
     pub fn test_parse_query_to_ast_conjunction() {
         test_parse_query_to_logical_ast_helper(
             "title:toto",
-            r#"Term(type=Str, field=0, val="toto")"#,
+            r#"Term(type=Str, field=0, "toto")"#,
             true,
         );
         test_parse_query_to_logical_ast_helper(
             "+title:toto",
-            r#"Term(type=Str, field=0, val="toto")"#,
+            r#"Term(type=Str, field=0, "toto")"#,
             true,
         );
         test_parse_query_to_logical_ast_helper(
             "+title:toto -titi",
-            r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#,
+            r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
             true,
         );
         test_parse_query_to_logical_ast_helper(
             "title:a b",
-            r#"(+Term(type=Str, field=0, val="a") +(Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#,
+            r#"(+Term(type=Str, field=0, "a") +(Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
             true,
         );
         test_parse_query_to_logical_ast_helper(
             "title:\"a b\"",
-            r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#,
+            r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
             true,
         );
     }
@@ -1080,7 +1083,7 @@ mod test {
     pub fn test_query_parser_hyphen() {
         test_parse_query_to_logical_ast_helper(
             "title:www-form-encoded",
-            r#""[(0, Term(type=Str, field=0, val="www")), (1, Term(type=Str, field=0, val="form")), (2, Term(type=Str, field=0, val="encoded"))]""#,
+            r#""[(0, Term(type=Str, field=0, "www")), (1, Term(type=Str, field=0, "form")), (2, Term(type=Str, field=0, "encoded"))]""#,
             false,
         );
     }
@@ -1090,7 +1093,7 @@ mod test {
         for &default_conjunction in &[false, true] {
             test_parse_query_to_logical_ast_helper(
                 "title:a AND title:b",
-                r#"(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b"))"#,
+                r#"(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b"))"#,
                 default_conjunction,
             );
         }
@@ -1101,7 +1104,7 @@ mod test {
         for &default_conjunction in &[false, true] {
             test_parse_query_to_logical_ast_helper(
                 "title:a OR title:b",
-                r#"(Term(type=Str, field=0, val="a") Term(type=Str, field=0, val="b"))"#,
+                r#"(Term(type=Str, field=0, "a") Term(type=Str, field=0, "b"))"#,
                 default_conjunction,
             );
         }
@@ -174,7 +174,7 @@ mod tests {
         );
         assert_eq!(
             format!("{:?}", term_query),
-            r#"TermQuery(Term(type=Str, field=1, val="hello"))"#
+            r#"TermQuery(Term(type=Str, field=1, "hello"))"#
         );
     }
@@ -137,6 +137,7 @@ impl FieldEntry {
             FieldType::Str(ref options) => options.is_stored(),
             FieldType::Facet(ref options) => options.is_stored(),
             FieldType::Bytes(ref options) => options.is_stored(),
+            FieldType::JsonObject(ref options) => options.is_stored(),
         }
     }
 }
@@ -1,25 +1,33 @@
-use crate::schema::bytes_options::BytesOptions;
-use crate::schema::facet_options::FacetOptions;
-use crate::schema::Facet;
-use crate::schema::IndexRecordOption;
-use crate::schema::TextFieldIndexing;
-use crate::schema::Value;
-use crate::schema::{IntOptions, TextOptions};
-use crate::tokenizer::PreTokenizedString;
 use chrono::{FixedOffset, Utc};
 use serde::{Deserialize, Serialize};
 use serde_json::Value as JsonValue;
+use thiserror::Error;
+
+use crate::schema::bytes_options::BytesOptions;
+use crate::schema::facet_options::FacetOptions;
+use crate::schema::JsonObjectOptions;
+use crate::schema::{Facet, IndexRecordOption, IntOptions, TextFieldIndexing, TextOptions, Value};
+use crate::tokenizer::PreTokenizedString;
 
 /// Possible error that may occur while parsing a field value
 /// At this point the JSON is known to be valid.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Error)]
 pub enum ValueParsingError {
-    /// Encountered a numerical value that overflows or underflows its integer type.
-    OverflowError(String),
-    /// The json node is not of the correct type.
-    /// (e.g. 3 for a `Str` type or `"abc"` for a u64 type)
-    /// Tantivy will try to autocast values.
-    TypeError(String),
-    /// The json node is a string whose content is not valid base64.
-    InvalidBase64(String),
+    #[error("Overflow error. Expected {expected}, got {json}")]
+    OverflowError {
+        expected: &'static str,
+        json: serde_json::Value,
+    },
+    #[error("Type error. Expected {expected}, got {json}")]
+    TypeError {
+        expected: &'static str,
+        json: serde_json::Value,
+    },
+    #[error("Invalid base64: {base64}")]
+    InvalidBase64 { base64: String },
 }
 
 /// Type of the value that a field can take.
@@ -67,6 +75,18 @@ impl Type {
         *self as u8
     }
 
+    pub fn name(&self) -> &'static str {
+        match self {
+            Type::Str => "Str",
+            Type::U64 => "U64",
+            Type::I64 => "I64",
+            Type::F64 => "F64",
+            Type::Date => "Date",
+            Type::Facet => "Facet",
+            Type::Bytes => "Bytes",
+        }
+    }
+
     /// Interprets a 1byte code as a type.
     /// Returns None if the code is invalid.
     pub fn from_code(code: u8) -> Option<Self> {
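For illustration (not part of the diff), `name` lines up with the existing `to_code`/`from_code` pair, whose round-trip follows from `to_code` being `*self as u8`:

    // Illustrative sketch only.
    assert_eq!(Type::Str.name(), "Str");
    assert_eq!(Type::from_code(Type::U64.to_code()), Some(Type::U64));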
@@ -104,6 +124,8 @@ pub enum FieldType {
     Facet(FacetOptions),
     /// Bytes (one per document)
     Bytes(BytesOptions),
+    /// Json object
+    JsonObject(JsonObjectOptions),
 }
 
 impl FieldType {
@@ -117,6 +139,9 @@ impl FieldType {
             FieldType::Date(_) => Type::Date,
             FieldType::Facet(_) => Type::Facet,
             FieldType::Bytes(_) => Type::Bytes,
+            FieldType::JsonObject(_) => {
+                unimplemented!()
+            }
         }
     }
@@ -130,6 +155,7 @@ impl FieldType {
             FieldType::Date(ref date_options) => date_options.is_indexed(),
             FieldType::Facet(ref _facet_options) => true,
             FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
+            FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(),
         }
     }
@@ -146,12 +172,17 @@ impl FieldType {
             | FieldType::Date(ref int_options) => int_options.fieldnorms(),
             FieldType::Facet(_) => false,
             FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
+            FieldType::JsonObject(ref json_object_options) => false,
         }
     }
 
     /// Given a field configuration, return the maximal possible
     /// `IndexRecordOption` available.
     ///
+    /// For the Json object, this does not necessarily mean that this index record
+    /// option level is available for all terms.
+    /// (Non string terms have the Basic indexing option at most.)
+    ///
     /// If the field is not indexed, then returns `None`.
     pub fn get_index_record_option(&self) -> Option<IndexRecordOption> {
         match *self {
@@ -176,6 +207,10 @@ impl FieldType {
                     None
                 }
             }
+            FieldType::JsonObject(ref json_obj_options) => json_obj_options
+                .indexing
+                .as_ref()
+                .map(TextFieldIndexing::index_option),
         }
     }
@@ -189,25 +224,30 @@ impl FieldType {
             JsonValue::String(ref field_text) => match *self {
                 FieldType::Date(_) => {
                     let dt_with_fixed_tz: chrono::DateTime<FixedOffset> =
-                        chrono::DateTime::parse_from_rfc3339(field_text).map_err(|err| {
-                            ValueParsingError::TypeError(format!(
-                                "Failed to parse date from JSON. Expected rfc3339 format, got {}. \
-                                 {:?}",
-                                field_text, err
-                            ))
+                        chrono::DateTime::parse_from_rfc3339(field_text).map_err(|_err| {
+                            ValueParsingError::TypeError {
+                                expected: "rfc3339 format",
+                                json: json.clone(),
+                            }
                         })?;
                     Ok(Value::Date(dt_with_fixed_tz.with_timezone(&Utc)))
                 }
                 FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
-                FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => Err(
-                    ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)),
-                ),
+                FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
+                    Err(ValueParsingError::TypeError {
+                        expected: "an integer",
+                        json: json.clone(),
+                    })
+                }
                 FieldType::Facet(_) => Ok(Value::Facet(Facet::from(field_text))),
                 FieldType::Bytes(_) => base64::decode(field_text).map(Value::Bytes).map_err(|_| {
-                    ValueParsingError::InvalidBase64(format!(
-                        "Expected base64 string, got {:?}",
-                        field_text
-                    ))
+                    ValueParsingError::InvalidBase64 {
+                        base64: field_text.clone(),
+                    }
                 }),
+                FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
+                    expected: "a json object",
+                    json: json.clone(),
+                }),
             },
             JsonValue::Number(ref field_val_num) => match *self {
@@ -215,30 +255,42 @@ impl FieldType {
                     if let Some(field_val_i64) = field_val_num.as_i64() {
                         Ok(Value::I64(field_val_i64))
                     } else {
-                        let msg = format!("Expected an i64 int, got {:?}", json);
-                        Err(ValueParsingError::OverflowError(msg))
+                        Err(ValueParsingError::OverflowError {
+                            expected: "an i64 int",
+                            json: json.clone(),
+                        })
                     }
                 }
                 FieldType::U64(_) => {
                     if let Some(field_val_u64) = field_val_num.as_u64() {
                         Ok(Value::U64(field_val_u64))
                     } else {
-                        let msg = format!("Expected a u64 int, got {:?}", json);
-                        Err(ValueParsingError::OverflowError(msg))
+                        Err(ValueParsingError::OverflowError {
+                            expected: "u64",
+                            json: json.clone(),
+                        })
                     }
                 }
                 FieldType::F64(_) => {
                     if let Some(field_val_f64) = field_val_num.as_f64() {
                         Ok(Value::F64(field_val_f64))
                     } else {
-                        let msg = format!("Expected a f64 int, got {:?}", json);
-                        Err(ValueParsingError::OverflowError(msg))
+                        Err(ValueParsingError::OverflowError {
+                            expected: "a f64",
+                            json: json.clone(),
+                        })
                     }
                 }
                 FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => {
-                    let msg = format!("Expected a string, got {:?}", json);
-                    Err(ValueParsingError::TypeError(msg))
+                    Err(ValueParsingError::TypeError {
+                        expected: "a string",
+                        json: json.clone(),
+                    })
                 }
+                FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
+                    expected: "a json object",
+                    json: json.clone(),
+                }),
             },
             JsonValue::Object(_) => match *self {
                 FieldType::Str(_) => {
@@ -247,28 +299,21 @@ impl FieldType {
                 {
                     Ok(Value::PreTokStr(tok_str_val))
                 } else {
-                    let msg = format!(
-                        "Json value {:?} cannot be translated to PreTokenizedString.",
-                        json
-                    );
-                    Err(ValueParsingError::TypeError(msg))
+                    Err(ValueParsingError::TypeError {
+                        expected: "a string or a pretokenized string",
+                        json: json.clone(),
+                    })
                 }
             }
-            _ => {
-                let msg = format!(
-                    "Json value not supported error {:?}. Expected {:?}",
-                    json, self
-                );
-                Err(ValueParsingError::TypeError(msg))
-            }
+            _ => Err(ValueParsingError::TypeError {
+                expected: self.value_type().name(),
+                json: json.clone(),
+            }),
             },
-            _ => {
-                let msg = format!(
-                    "Json value not supported error {:?}. Expected {:?}",
-                    json, self
-                );
-                Err(ValueParsingError::TypeError(msg))
-            }
+            _ => Err(ValueParsingError::TypeError {
+                expected: self.value_type().name(),
+                json: json.clone(),
+            }),
         }
     }
 }
@@ -317,13 +362,13 @@ mod tests {
 
         let result = FieldType::Bytes(Default::default()).value_from_json(&json!(521));
         match result {
-            Err(ValueParsingError::TypeError(_)) => {}
+            Err(ValueParsingError::TypeError { .. }) => {}
             _ => panic!("Expected parse failure for wrong type"),
         }
 
         let result = FieldType::Bytes(Default::default()).value_from_json(&json!("-"));
         match result {
-            Err(ValueParsingError::InvalidBase64(_)) => {}
+            Err(ValueParsingError::InvalidBase64 { .. }) => {}
             _ => panic!("Expected parse failure for invalid base64"),
         }
     }
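As a hedged sketch of how the new structured errors read at a call site (not part of the commit; `IntOptions::default()` is assumed to describe a plain, unconfigured u64 field):

    use serde_json::json;

    let field_type = FieldType::U64(IntOptions::default());
    match field_type.value_from_json(&json!(-1)) {
        // -1 does not fit in a u64, so the overflow variant fires,
        // carrying the offending JSON value along.
        Err(ValueParsingError::OverflowError { expected, json }) => {
            assert_eq!(expected, "u64");
            assert_eq!(json, serde_json::json!(-1));
        }
        _ => panic!("expected an overflow error"),
    }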
src/schema/json_object_options.rs (new file, 25 lines)
@@ -0,0 +1,25 @@
+use serde::{Deserialize, Serialize};
+
+use crate::schema::TextFieldIndexing;
+
+#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
+pub struct JsonObjectOptions {
+    stored: bool,
+    // If set to some, int, date, f64 and text will be indexed.
+    // Text will use the TextFieldIndexing setting for indexing.
+    pub(crate) indexing: Option<TextFieldIndexing>,
+}
+
+impl JsonObjectOptions {
+    pub fn is_stored(&self) -> bool {
+        self.stored
+    }
+
+    pub fn is_indexed(&self) -> bool {
+        self.indexing.is_some()
+    }
+
+    pub fn get_text_indexing_option(&self) -> Option<&TextFieldIndexing> {
+        self.indexing.as_ref()
+    }
+}
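A minimal usage sketch (not part of the commit) for the accessors above: a default-constructed options object is neither stored nor indexed, since `stored` defaults to `false` and `indexing` to `None`:

    let opts = JsonObjectOptions::default();
    assert!(!opts.is_stored());
    assert!(!opts.is_indexed());
    assert!(opts.get_text_indexing_option().is_none());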
src/schema/json_object_utils.rs (new file, 25 lines)
@@ -0,0 +1,25 @@
+use serde_json::Number;
+
+use crate::schema::Value;
+
+pub fn infer_type_from_string(string: &str) -> Value {
+    // TODO can we avoid the copy?
+    Value::Str(string.to_string())
+}
+
+pub fn infer_type_from_number(number: &Number) -> Value {
+    if let Some(num_val) = number.as_u64() {
+        return Value::U64(num_val);
+    }
+    if let Some(num_val) = number.as_i64() {
+        return Value::I64(num_val);
+    }
+    if let Some(num_val) = number.as_f64() {
+        return Value::F64(num_val);
+    }
+    // This should never happen as long as we
+    // don't use the serde_json feature = "arbitrary_precision".
+    Value::Str(number.to_string())
+}
+
+// TODO add unit tests
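A sketch of the unit tests the trailing `TODO` asks for (illustrative only, assuming `Value` implements `PartialEq` and `Debug` as it does elsewhere in the test suite):

    #[cfg(test)]
    mod tests {
        use serde_json::Number;

        use super::{infer_type_from_number, infer_type_from_string};
        use crate::schema::Value;

        #[test]
        fn infer_number_prefers_u64_then_i64_then_f64() {
            // Non-negative integers are answered by `as_u64` first.
            assert_eq!(infer_type_from_number(&Number::from(3u64)), Value::U64(3));
            // Negative integers fail `as_u64` and fall through to i64.
            assert_eq!(infer_type_from_number(&Number::from(-2i64)), Value::I64(-2));
            // Non-integral numbers end up as f64.
            let one_half = Number::from_f64(1.5).unwrap();
            assert_eq!(infer_type_from_number(&one_half), Value::F64(1.5));
        }

        #[test]
        fn infer_string_is_currently_always_str() {
            assert_eq!(infer_type_from_string("red"), Value::Str("red".to_string()));
        }
    }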
@@ -104,7 +104,7 @@ mod document;
 mod facet;
 mod facet_options;
 mod schema;
-mod term;
+pub(crate) mod term;
 
 mod field_entry;
 mod field_type;
@@ -114,11 +114,14 @@ mod bytes_options;
 mod field;
 mod index_record_option;
 mod int_options;
+mod json_object_options;
 mod named_field_document;
 mod text_options;
 mod value;
 
 mod flags;
+mod json_object_utils;
+mod term_writer;
 
 pub use self::bytes_options::BytesOptions;
 pub use self::document::Document;
@@ -138,6 +141,8 @@ pub use self::term::Term;
 pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
 pub use self::value::Value;
 
+pub use self::json_object_options::JsonObjectOptions;
+
 /// Validator for a potential `field_name`.
 /// Returns true if the name can be used for a field name.
 ///
@@ -666,7 +666,7 @@ mod tests {
             json_err,
             Err(DocParsingError::ValueError(
                 _,
-                ValueParsingError::TypeError(_)
+                ValueParsingError::TypeError { .. }
             ))
         );
     }
@@ -684,7 +684,7 @@ mod tests {
             json_err,
             Err(DocParsingError::ValueError(
                 _,
-                ValueParsingError::OverflowError(_)
+                ValueParsingError::OverflowError { .. }
             ))
         );
     }
@@ -702,7 +702,7 @@ mod tests {
             json_err,
             Err(DocParsingError::ValueError(
                 _,
-                ValueParsingError::OverflowError(_)
+                ValueParsingError::OverflowError { .. }
             ))
         ));
     }
@@ -720,7 +720,7 @@ mod tests {
             json_err,
             Err(DocParsingError::ValueError(
                 _,
-                ValueParsingError::OverflowError(_)
+                ValueParsingError::OverflowError { .. }
             ))
         );
     }
@@ -8,8 +8,27 @@ use crate::DateTime;
 
 /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
 /// <field> + <type byte> + <value>
+///
+/// - <field> is a big endian encoded u32 field id
+/// - <type_byte>'s most significant bit expresses whether the term is a json term or not.
+///   The remaining 7 bits are used to encode the type of the value.
+///   If this is a JSON term, the type is the type of the leaf of the json.
+/// - <value> is, if this is not a json term, a binary representation specific to the type.
+///   If it is a JSON term, then it is prepended with the path that leads to this leaf value.
 const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
 
+/// The first bit of the `type_byte`
+/// is used to encode whether a Term is for
+/// json or not.
+pub const JSON_MARKER_BIT: u8 = 128u8;
+/// Separates the different segments of
+/// the json path.
+pub const JSON_PATH_SEGMENT_SEP: u8 = 0u8;
+/// Separates the json path and the value in
+/// a JSON term binary representation.
+pub const JSON_END_OF_PATH: u8 = 30u8;
+
 /// Term represents the value that the token can take.
 ///
 /// It actually wraps a `Vec<u8>`.
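To make the layout concrete, a worked example (not part of the commit): building the JSON term for `attributes.color = "red"` on field 7 with the `JsonTermWriter` introduced later in this commit produces, conceptually:

    // [0, 0, 0, 7]                          <field>: big-endian u32 field id
    // [JSON_MARKER_BIT | code of Type::Str]  <type byte>: marker bit plus leaf type
    // b"attributes"                          first path segment
    // [JSON_PATH_SEGMENT_SEP]                = 0u8
    // b"color"                               second path segment
    // [JSON_END_OF_PATH]                     = 30u8
    // b"red"                                 the value bytes themselves
    let mut writer = JsonTermWriter::with_field(Field::from_field_id(7));
    writer.push_path_segment("attributes");
    writer.push_path_segment("color");
    writer.set_text("red");
    assert!(writer.term().is_json());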
@@ -164,13 +183,23 @@ where B: AsRef<[u8]>
         Term(data)
     }
 
-    /// Return the type of the term.
-    pub fn typ(&self) -> Type {
+    /// Returns true iff the term is a JSON term.
+    pub fn is_json(&self) -> bool {
+        self.typ_code() & JSON_MARKER_BIT == JSON_MARKER_BIT
+    }
+
+    fn typ_code(&self) -> u8 {
         assert!(
             self.as_slice().len() >= 5,
-            "the type does byte representation is too short"
+            "the byte representation is too short"
         );
-        Type::from_code(self.as_slice()[4]).expect("The term has an invalid type code")
+        self.as_slice()[4]
+    }
+
+    /// Return the type of the term.
+    pub fn typ(&self) -> Type {
+        Type::from_code(self.typ_code() & (JSON_MARKER_BIT - 1))
+            .expect("The term has an invalid type code")
     }
 
     /// Returns the field.
@@ -189,11 +218,15 @@ where B: AsRef<[u8]>
     }
 
     fn get_fast_type<T: FastValue>(&self) -> Option<T> {
-        if self.typ() != T::to_type() || self.as_slice().len() != FAST_VALUE_TERM_LEN {
+        if self.typ() != T::to_type() {
             return None;
         }
         let mut value_bytes = [0u8; 8];
-        value_bytes.copy_from_slice(self.value_bytes());
+        let bytes = self.value_bytes_without_path();
+        if bytes.len() != 8 {
+            return None;
+        }
+        value_bytes.copy_from_slice(self.value_bytes_without_path());
         let value_u64 = u64::from_be_bytes(value_bytes);
         Some(FastValue::from_u64(value_u64))
     }
@@ -206,6 +239,35 @@ where B: AsRef<[u8]>
         self.get_fast_type::<i64>()
     }
 
+    fn json_path_value(&self) -> Option<&str> {
+        if !self.is_json() {
+            return None;
+        }
+        let value_bytes = self.value_bytes();
+        let pos = value_bytes
+            .iter()
+            .cloned()
+            .position(|b| b == JSON_END_OF_PATH)
+            .expect("Could not find end of path");
+        let json_path =
+            str::from_utf8(&value_bytes[..pos]).expect("JSON value path is invalid utf-8");
+        Some(json_path)
+    }
+
+    fn value_bytes_without_path(&self) -> &[u8] {
+        let value_bytes = self.value_bytes();
+        if self.is_json() {
+            let pos = value_bytes
+                .iter()
+                .cloned()
+                .position(|b| b == JSON_END_OF_PATH)
+                .expect("Could not find end of path");
+            &value_bytes[pos + 1..]
+        } else {
+            value_bytes
+        }
+    }
+
     /// Returns the `f64` value stored in a term.
     ///
     /// Returns None if the term is not of the f64 type, or if the term byte representation
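A round-trip sketch (not in the commit) showing why the 8-byte length check matters: for a JSON i64 leaf, exactly eight value bytes follow the end-of-path separator, and `as_i64` (which goes through `get_fast_type`) recovers them:

    let mut writer = JsonTermWriter::with_field(Field::from_field_id(1));
    writer.push_path_segment("width");
    writer.set_i64(400);
    assert_eq!(writer.term().as_i64(), Some(400));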
@@ -233,7 +295,7 @@ where B: AsRef<[u8]>
         if self.typ() != Type::Str {
             return None;
         }
-        str::from_utf8(self.value_bytes()).ok()
+        str::from_utf8(self.value_bytes_without_path()).ok()
     }
 
     /// Returns the facet associated with the term.
@@ -247,7 +309,7 @@ where B: AsRef<[u8]>
         if self.typ() != Type::Facet {
             return None;
         }
-        let facet_encode_str = str::from_utf8(self.value_bytes()).ok()?;
+        let facet_encode_str = str::from_utf8(self.value_bytes_without_path()).ok()?;
         Some(Facet::from_encoded_string(facet_encode_str.to_string()))
     }
@@ -261,7 +323,7 @@ where B: AsRef<[u8]>
         if self.typ() != Type::Bytes {
             return None;
         }
-        Some(self.value_bytes())
+        Some(self.value_bytes_without_path())
     }
 
     /// Returns the serialized value of the term.
@@ -290,14 +352,24 @@ fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) ->
     Ok(())
 }
 
-impl fmt::Debug for Term {
+impl<B> fmt::Debug for Term<B>
+where
+    B: AsRef<[u8]>,
+{
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let field_id = self.field().field_id();
         let typ = self.typ();
-        write!(f, "Term(type={:?}, field={}, val=", typ, field_id,)?;
+        write!(f, "Term(type={:?}, field={}, ", typ, field_id)?;
+        if let Some(path) = self.json_path_value() {
+            write!(
+                f,
+                "path={}, ",
+                path.replace(std::str::from_utf8(&[JSON_PATH_SEGMENT_SEP]).unwrap(), ".")
+            )?;
+        }
         match typ {
             Type::Str => {
-                let s = str::from_utf8(self.value_bytes()).ok();
+                let s = self.as_str();
                 write_opt(f, s)?;
             }
             Type::U64 => {
src/schema/term_writer.rs (new file, 131 lines)
@@ -0,0 +1,131 @@
+use crate::fastfield::FastValue;
+use crate::schema::{Field, Type};
+use crate::Term;
+
+// Copyright (C) 2021 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+use super::term::JSON_MARKER_BIT;
+
+pub struct JsonTermWriter {
+    buffer: Vec<u8>,
+    path_stack: Vec<usize>,
+}
+
+impl JsonTermWriter {
+    pub fn new() -> Self {
+        let mut buffer = Vec::with_capacity(100);
+        buffer.resize(5, 0);
+        let mut path_stack = Vec::with_capacity(10);
+        path_stack.push(5);
+        Self { buffer, path_stack }
+    }
+
+    pub fn with_field(field: Field) -> Self {
+        let mut json_term_writer: JsonTermWriter = Self::new();
+        json_term_writer.set_field(field);
+        json_term_writer
+    }
+
+    fn trim_to_end_of_path(&mut self) {
+        let end_of_path = *self.path_stack.last().unwrap();
+        self.buffer.resize(end_of_path, 0u8);
+        self.buffer[end_of_path - 1] = super::term::JSON_PATH_SEGMENT_SEP;
+    }
+
+    fn close_path_and_set_type(&mut self, typ: Type) {
+        self.trim_to_end_of_path();
+        self.buffer[4] = JSON_MARKER_BIT | typ.to_code();
+        let end_of_path = self.buffer.len();
+        self.buffer[end_of_path - 1] = super::term::JSON_END_OF_PATH;
+    }
+
+    pub fn push_path_segment(&mut self, segment: &str) {
+        // the path stack should never be empty.
+        self.trim_to_end_of_path();
+        self.buffer.extend_from_slice(segment.as_bytes());
+        self.buffer.push(super::term::JSON_PATH_SEGMENT_SEP);
+        self.path_stack.push(self.buffer.len());
+    }
+
+    pub fn pop_path_segment(&mut self) {
+        self.path_stack.pop();
+        assert!(self.path_stack.len() > 0);
+        self.trim_to_end_of_path();
+    }
+
+    pub fn set_text(&mut self, text: &str) {
+        self.close_path_and_set_type(Type::Str);
+        self.buffer.extend_from_slice(text.as_bytes());
+    }
+
+    pub fn set_i64(&mut self, val: i64) {
+        self.close_path_and_set_type(Type::I64);
+        self.buffer
+            .extend_from_slice(val.to_u64().to_be_bytes().as_slice());
+    }
+
+    pub fn set_field(&mut self, field: Field) {
+        self.buffer[0..4].copy_from_slice(field.field_id().to_be_bytes().as_ref());
+    }
+
+    pub fn term(&self) -> Term<&[u8]> {
+        Term::wrap(&self.buffer[..])
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::JsonTermWriter;
+    use crate::schema::Field;
+
+    #[test]
+    fn test_json_writer() {
+        let field = Field::from_field_id(1);
+        let mut json_writer = JsonTermWriter::with_field(field);
+        json_writer.push_path_segment("attributes");
+        json_writer.push_path_segment("color");
+        json_writer.set_text("red");
+        assert_eq!(
+            format!("{:?}", json_writer.term()),
+            "Term(type=Str, field=1, path=attributes.color, \"red\")"
+        );
+        json_writer.set_text("blue");
+        assert_eq!(
+            format!("{:?}", json_writer.term()),
+            "Term(type=Str, field=1, path=attributes.color, \"blue\")"
+        );
+        json_writer.pop_path_segment();
+        json_writer.push_path_segment("dimensions");
+        json_writer.push_path_segment("width");
+        json_writer.set_i64(400);
+        assert_eq!(
+            format!("{:?}", json_writer.term()),
+            "Term(type=I64, field=1, path=attributes.dimensions.width, 400)"
+        );
+        json_writer.pop_path_segment();
+        json_writer.push_path_segment("height");
+        json_writer.set_i64(300);
+        assert_eq!(
+            format!("{:?}", json_writer.term()),
+            "Term(type=I64, field=1, path=attributes.dimensions.height, 300)"
+        );
+    }
+}
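Beyond the unit test above, a hypothetical sketch (not part of the commit) of how an indexing loop could drive this API over a `serde_json` object, emitting one term per leaf; `index_term` stands in for whatever consumes the terms:

    fn emit_json_terms(
        writer: &mut JsonTermWriter,
        obj: &serde_json::Map<String, serde_json::Value>,
        index_term: &mut impl FnMut(Term<&[u8]>),
    ) {
        for (key, value) in obj {
            writer.push_path_segment(key);
            match value {
                serde_json::Value::String(text) => {
                    writer.set_text(text);
                    index_term(writer.term());
                }
                // Nested objects extend the path recursively.
                serde_json::Value::Object(sub_obj) => emit_json_terms(writer, sub_obj, index_term),
                // Numbers, booleans and arrays are left out of this sketch.
                _ => {}
            }
            writer.pop_path_segment();
        }
    }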
@@ -46,7 +46,7 @@ impl TextOptions {
 /// Essentially, should we store the term frequency and/or the positions (See
 /// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
 /// - the name of the `Tokenizer` that should be used to process the field.
-#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
+#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
 pub struct TextFieldIndexing {
     record: IndexRecordOption,
     fieldnorms: bool,
@@ -27,6 +27,7 @@ pub enum Value {
     Facet(Facet),
     /// Arbitrarily sized byte array
     Bytes(Vec<u8>),
+    JsonObject(serde_json::Map<String, serde_json::Value>),
 }
 
 impl Eq for Value {}
@@ -43,6 +44,7 @@ impl Serialize for Value {
             Value::Date(ref date) => serializer.serialize_str(&date.to_rfc3339()),
             Value::Facet(ref facet) => facet.serialize(serializer),
             Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
+            Value::JsonObject(ref obj) => obj.serialize(serializer),
         }
     }
 }
@@ -248,6 +250,7 @@ mod binary_serialize {
     const DATE_CODE: u8 = 5;
     const F64_CODE: u8 = 6;
     const EXT_CODE: u8 = 7;
+    const JSON_OBJ_CODE: u8 = 8;
 
     // extended types
 
@@ -296,8 +299,14 @@
                 BYTES_CODE.serialize(writer)?;
                 bytes.serialize(writer)
             }
+            Value::JsonObject(ref map) => {
+                JSON_OBJ_CODE.serialize(writer)?;
+                serde_json::to_writer(writer, &map)?;
+                Ok(())
+            }
         }
     }
 
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
         let type_code = u8::deserialize(reader)?;
         match type_code {
@@ -347,6 +356,10 @@
                     )),
                 }
             }
+            JSON_OBJ_CODE => {
+                let map = serde_json::from_reader(reader)?;
+                Ok(Value::JsonObject(map))
+            }
             _ => Err(io::Error::new(
                 io::ErrorKind::InvalidData,
                 format!("No field type is associated with code {:?}", type_code),
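Assuming `Value` implements the crate's `BinarySerializable` trait (which the surrounding `binary_serialize` module suggests), a round-trip sketch, not part of the commit:

    use crate::common::BinarySerializable;

    let original = Value::JsonObject(serde_json::from_str(r#"{"color": "red"}"#).unwrap());
    let mut buffer: Vec<u8> = Vec::new();
    original.serialize(&mut buffer).unwrap();
    // Deserializing reads the JSON_OBJ_CODE tag, then the JSON payload.
    let restored = Value::deserialize(&mut &buffer[..]).unwrap();
    assert_eq!(original, restored);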