refactor Term (#2006)

* refactor Term

add ValueBytes for serialized term values
add missing debug for ip
skip unnecessary json path validation
remove code duplication
add DATE_TIME_PRECISION_INDEXED constant
add missing Term clarification
remove weird value_bytes_mut() API

* fix naming
PSeitz
2023-04-20 21:31:43 +08:00
committed by GitHub
parent ff3d3313c4
commit 74f9eafefc
18 changed files with 394 additions and 273 deletions
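For context, a minimal sketch of the renamed `Term` accessors introduced in the diff below (`as_slice()` -> `serialized_term()`, `value_bytes()` -> `serialized_value_bytes()`, plus the new `ValueBytes` view). Illustration only, not part of the change set:

use tantivy::schema::{Schema, Term, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    let term = Term::from_field_text(title, "hello");

    // Full serialized term: 4-byte field id + 1 type byte + value bytes.
    assert_eq!(term.serialized_term().len(), 4 + 1 + "hello".len());
    // Value bytes only, i.e. everything after the 5 metadata bytes.
    assert_eq!(term.serialized_value_bytes(), b"hello");
    // Typed view (ValueBytes) over the type byte + value bytes.
    assert_eq!(term.value().as_str(), Some("hello"));
}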

View File

@@ -1,5 +1,5 @@
//! SIMD filtering of a vector as described in the following blog post.
//! https://quickwit.io/blog/filtering%20a%20vector%20with%20simd%20instructions%20avx-2%20and%20avx-512
//! <https://quickwit.io/blog/filtering%20a%20vector%20with%20simd%20instructions%20avx-2%20and%20avx-512>
use std::arch::x86_64::{
__m256i as DataType, _mm256_add_epi32 as op_add, _mm256_cmpgt_epi32 as op_greater,
_mm256_lddqu_si256 as load_unaligned, _mm256_or_si256 as op_or, _mm256_set1_epi32 as set1,

View File

@@ -61,7 +61,7 @@ impl InvertedIndexReader {
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> io::Result<Option<TermInfo>> {
self.termdict.get(term.value_bytes())
self.termdict.get(term.serialized_value_bytes())
}
/// Return the term dictionary datastructure.
@@ -203,7 +203,7 @@ impl InvertedIndexReader {
#[cfg(feature = "quickwit")]
impl InvertedIndexReader {
pub(crate) async fn get_term_info_async(&self, term: &Term) -> io::Result<Option<TermInfo>> {
self.termdict.get_async(term.value_bytes()).await
self.termdict.get_async(term.serialized_value_bytes()).await
}
/// Returns a block postings given a `Term`.

View File

@@ -5,12 +5,12 @@ use rustc_hash::FxHashMap;
use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{Field, Type};
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
use crate::tokenizer::TextAnalyzer;
use crate::{DatePrecision, DateTime, DocId, Term};
use crate::{DateTime, DocId, Term};
/// This object is a map storing the last position for a given path for the current document
/// being indexed.
@@ -59,7 +59,7 @@ struct IndexingPositionsPerPath {
impl IndexingPositionsPerPath {
fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
self.positions_per_path
.entry(murmurhash2(term.as_slice()))
.entry(murmurhash2(term.serialized_term()))
.or_insert_with(Default::default)
}
}
@@ -257,6 +257,9 @@ pub(crate) fn set_string_and_get_terms(
positions_and_terms
}
/// Writes a value of a JSON field to a `Term`.
/// The Term format is as follows:
/// [JSON_TYPE][JSON_PATH][JSON_END_OF_PATH][VALUE_BYTES]
pub struct JsonTermWriter<'a> {
term_buffer: &'a mut Term,
path_stack: Vec<usize>,
@@ -355,27 +358,23 @@ impl<'a> JsonTermWriter<'a> {
pub fn close_path_and_set_type(&mut self, typ: Type) {
self.trim_to_end_of_path();
let buffer = self.term_buffer.value_bytes_mut();
let buffer_len = buffer.len();
buffer[buffer_len - 1] = JSON_END_OF_PATH;
self.term_buffer.set_json_path_end();
self.term_buffer.append_bytes(&[typ.to_code()]);
}
pub fn push_path_segment(&mut self, segment: &str) {
// the path stack should never be empty.
self.trim_to_end_of_path();
let buffer = self.term_buffer.value_bytes_mut();
let buffer_len = buffer.len();
if self.path_stack.len() > 1 {
buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
self.term_buffer.set_json_path_separator();
}
let appended_segment = self.term_buffer.append_bytes(segment.as_bytes());
if self.expand_dots_enabled {
// We need to replace `.` by JSON_PATH_SEGMENT_SEP.
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment);
}
self.term_buffer.push_byte(JSON_PATH_SEGMENT_SEP);
self.term_buffer.add_json_path_separator();
self.path_stack.push(self.term_buffer.len_bytes());
}
@@ -389,14 +388,14 @@ impl<'a> JsonTermWriter<'a> {
#[cfg(test)]
pub(crate) fn path(&self) -> &[u8] {
let end_of_path = self.path_stack.last().cloned().unwrap_or(1);
&self.term().value_bytes()[..end_of_path - 1]
&self.term().serialized_value_bytes()[..end_of_path - 1]
}
pub(crate) fn set_fast_value<T: FastValue>(&mut self, val: T) {
self.close_path_and_set_type(T::to_type());
let value = if T::to_type() == Type::Date {
DateTime::from_u64(val.to_u64())
.truncate(DatePrecision::Seconds)
.truncate(DATE_TIME_PRECISION_INDEXED)
.to_u64()
} else {
val.to_u64()
@@ -431,12 +430,12 @@ mod tests {
json_writer.set_str("red");
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Json, field=1, path=attributes.color, vtype=Str, \"red\")"
"Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")"
);
json_writer.set_str("blue");
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Json, field=1, path=attributes.color, vtype=Str, \"blue\")"
"Term(field=1, type=Json, path=attributes.color, type=Str, \"blue\")"
);
json_writer.pop_path_segment();
json_writer.push_path_segment("dimensions");
@@ -444,14 +443,14 @@ mod tests {
json_writer.set_fast_value(400i64);
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Json, field=1, path=attributes.dimensions.width, vtype=I64, 400)"
"Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)"
);
json_writer.pop_path_segment();
json_writer.push_path_segment("height");
json_writer.set_fast_value(300i64);
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Json, field=1, path=attributes.dimensions.height, vtype=I64, 300)"
"Term(field=1, type=Json, path=attributes.dimensions.height, type=I64, 300)"
);
}
@@ -463,7 +462,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00sred"
)
}
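For reference, the expected byte string above decodes according to the Term layout documented on JsonTermWriter (illustrative breakdown, using the type codes b'j' for Json and b's' for Str that appear in these tests):

// b"\x00\x00\x00\x01jcolor\x00sred"
//   \x00\x00\x00\x01 -> field id = 1 (4 bytes, big endian)
//   b'j'             -> field type code (Type::Json)
//   "color"          -> JSON path
//   \x00             -> JSON_END_OF_PATH
//   b's'             -> value type code (Type::Str)
//   "red"            -> value bytes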
@@ -476,7 +475,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(-4i64);
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
)
}
@@ -489,7 +488,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(4u64);
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
)
}
@@ -502,7 +501,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(4.0f64);
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
)
}
@@ -515,7 +514,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(true);
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01"
)
}
@@ -530,7 +529,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jattribute\x01color\x00sred"
)
}
@@ -545,7 +544,7 @@ mod tests {
json_writer.pop_path_segment();
json_writer.set_str("red");
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00sred"
)
}

View File

@@ -14,7 +14,7 @@
//! Fields have to be declared as `FAST` in the schema.
//! Currently supported fields are: u64, i64, f64, bytes, ip and text.
//!
//! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected
//! Fast fields are stored with [different codecs](columnar). The best codec is detected
//! automatically, when serializing.
//!
//! Read access performance is comparable to that of an array lookup.

View File

@@ -12,10 +12,10 @@ use crate::postings::{
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value};
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DatePrecision, DocId, Document, Opstamp, SegmentComponent};
use crate::{DocId, Document, Opstamp, SegmentComponent};
/// Computes the initial size of the hash table.
///
@@ -246,7 +246,8 @@ impl SegmentWriter {
for value in values {
num_vals += 1;
let date_val = value.as_date().ok_or_else(make_schema_error)?;
term_buffer.set_u64(date_val.truncate(DatePrecision::Seconds).to_u64());
term_buffer
.set_u64(date_val.truncate(DATE_TIME_PRECISION_INDEXED).to_u64());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
}
if field_entry.has_fieldnorms() {
@@ -551,14 +552,20 @@ mod tests {
json_term_writer.push_path_segment("bool");
json_term_writer.set_fast_value(true);
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.pop_path_segment();
json_term_writer.push_path_segment("complexobject");
json_term_writer.push_path_segment("field.with.dot");
json_term_writer.set_fast_value(1i64);
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.pop_path_segment();
json_term_writer.pop_path_segment();
@@ -567,55 +574,85 @@ mod tests {
OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
));
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.pop_path_segment();
json_term_writer.push_path_segment("float");
json_term_writer.set_fast_value(-0.2f64);
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.pop_path_segment();
json_term_writer.push_path_segment("my_arr");
json_term_writer.set_fast_value(2i64);
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.set_fast_value(3i64);
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.set_fast_value(4i64);
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.push_path_segment("my_key");
json_term_writer.set_str("tokens");
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.set_str("two");
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.pop_path_segment();
json_term_writer.pop_path_segment();
json_term_writer.push_path_segment("signed");
json_term_writer.set_fast_value(-2i64);
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.pop_path_segment();
json_term_writer.push_path_segment("toto");
json_term_writer.set_str("titi");
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
json_term_writer.pop_path_segment();
json_term_writer.push_path_segment("unsigned");
json_term_writer.set_fast_value(1i64);
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
assert_eq!(
term_stream.key(),
json_term_writer.term().serialized_value_bytes()
);
assert!(!term_stream.advance());
}

View File

@@ -6,7 +6,6 @@ use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::term::as_json_path_type_value_bytes;
use crate::schema::Type;
use crate::tokenizer::TokenStream;
use crate::{DocId, Term};
@@ -61,8 +60,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (term, addr) in term_addrs {
// TODO optimization opportunity here.
if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) {
if let Some(json_value) = term.value().as_json_value_bytes() {
let typ = json_value.typ();
if typ == Type::Str {
SpecializedPostingsWriter::<Rec>::serialize_one_term(
term,

View File

@@ -171,7 +171,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
) -> io::Result<()> {
let recorder: Rec = ctx.term_index.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
serializer.new_term(term.value_bytes(), term_doc_freq)?;
serializer.new_term(term.serialized_value_bytes(), term_doc_freq)?;
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
serializer.close_term()?;
Ok(())
@@ -180,10 +180,10 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
fn subscribe(&mut self, doc: DocId, position: u32, term: &Term, ctx: &mut IndexingContext) {
debug_assert!(term.as_slice().len() >= 4);
debug_assert!(term.serialized_term().len() >= 4);
self.total_num_tokens += 1;
let (term_index, arena) = (&mut ctx.term_index, &mut ctx.arena);
term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option<Rec>| {
term_index.mutate_or_create(term.serialized_term(), |opt_recorder: Option<Rec>| {
if let Some(mut recorder) = opt_recorder {
let current_doc = recorder.current_doc();
if current_doc != doc {

View File

@@ -131,7 +131,8 @@ impl FuzzyTermQuery {
LevenshteinAutomatonBuilder::new(self.distance, self.transposition_cost_one)
});
let term_text = self.term.as_str().ok_or_else(|| {
let term_value = self.term.value();
let term_text = term_value.as_str().ok_or_else(|| {
InvalidArgument("The fuzzy term query requires a string term.".to_string())
})?;
let automaton = if self.prefix {

View File

@@ -138,14 +138,15 @@ impl Query for PhrasePrefixQuery {
Ok(Box::new(phrase_weight))
} else {
// There is no prefix. Let's just match the suffix.
let end_term = if let Some(end_value) = prefix_end(self.prefix.1.value_bytes()) {
let mut end_term = Term::with_capacity(end_value.len());
end_term.set_field_and_type(self.field, self.prefix.1.typ());
end_term.append_bytes(&end_value);
Bound::Excluded(end_term)
} else {
Bound::Unbounded
};
let end_term =
if let Some(end_value) = prefix_end(self.prefix.1.serialized_value_bytes()) {
let mut end_term = Term::with_capacity(end_value.len());
end_term.set_field_and_type(self.field, self.prefix.1.typ());
end_term.append_bytes(&end_value);
Bound::Excluded(end_term)
} else {
Bound::Unbounded
};
let mut range_query = RangeQuery::new_term_bounds(
enable_scoring

View File

@@ -78,8 +78,11 @@ impl PhrasePrefixWeight {
}
let inv_index = reader.inverted_index(self.prefix.1.field())?;
let mut stream = inv_index.terms().range().ge(self.prefix.1.value_bytes());
if let Some(end) = prefix_end(self.prefix.1.value_bytes()) {
let mut stream = inv_index
.terms()
.range()
.ge(self.prefix.1.serialized_value_bytes());
if let Some(end) = prefix_end(self.prefix.1.serialized_value_bytes()) {
stream = stream.lt(&end);
}

View File

@@ -952,7 +952,7 @@ mod test {
let query = query_parser.parse_query("facet:/root/branch/leaf").unwrap();
assert_eq!(
format!("{:?}", query),
r#"TermQuery(Term(type=Facet, field=11, "/root/branch/leaf"))"#
r#"TermQuery(Term(field=11, type=Facet, Facet(/root/branch/leaf)))"#
);
}
@@ -965,7 +965,7 @@ mod test {
let query = query_parser.parse_query("text:hello").unwrap();
assert_eq!(
format!("{:?}", query),
r#"Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2)"#
r#"Boost(query=TermQuery(Term(field=1, type=Str, "hello")), boost=2)"#
);
}
@@ -988,7 +988,7 @@ mod test {
let query = query_parser.parse_query("text:hello^2").unwrap();
assert_eq!(
format!("{:?}", query),
r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2), boost=2)"#
r#"Boost(query=Boost(query=TermQuery(Term(field=1, type=Str, "hello")), boost=2), boost=2)"#
);
}
@@ -1027,7 +1027,7 @@ mod test {
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper(
"nottokenized:\"wordone wordtwo\"",
r#"Term(type=Str, field=7, "wordone wordtwo")"#,
r#"Term(field=7, type=Str, "wordone wordtwo")"#,
false,
);
}
@@ -1070,7 +1070,7 @@ mod test {
.is_ok());
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term(type=U64, field=3, 2324)",
"Term(field=3, type=U64, 2324)",
false,
);
@@ -1097,7 +1097,7 @@ mod test {
fn test_parse_bytes() {
test_parse_query_to_logical_ast_helper(
"bytes:YnVidQ==",
"Term(type=Bytes, field=12, [98, 117, 98, 117])",
"Term(field=12, type=Bytes, [98, 117, 98, 117])",
false,
);
}
@@ -1124,7 +1124,7 @@ mod test {
fn test_json_field() {
test_parse_query_to_logical_ast_helper(
"json.titi:hello",
"Term(type=Json, field=14, path=titi, vtype=Str, \"hello\")",
"Term(field=14, type=Json, path=titi, type=Str, \"hello\")",
false,
);
}
@@ -1136,7 +1136,9 @@ mod test {
let LogicalLiteral::Term(term) = *literal else {
panic!();
};
std::str::from_utf8(term.value_bytes()).unwrap().to_string()
std::str::from_utf8(term.serialized_value_bytes())
.unwrap()
.to_string()
}
#[test]
@@ -1155,17 +1157,17 @@ mod test {
fn test_json_field_possibly_a_number() {
test_parse_query_to_logical_ast_helper(
"json.titi:5",
r#"(Term(type=Json, field=14, path=titi, vtype=U64, 5) Term(type=Json, field=14, path=titi, vtype=Str, "5"))"#,
r#"(Term(field=14, type=Json, path=titi, type=U64, 5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"json.titi:-5",
r#"(Term(type=Json, field=14, path=titi, vtype=I64, -5) Term(type=Json, field=14, path=titi, vtype=Str, "5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-".
r#"(Term(field=14, type=Json, path=titi, type=I64, -5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-".
true,
);
test_parse_query_to_logical_ast_helper(
"json.titi:-5.2",
r#"(Term(type=Json, field=14, path=titi, vtype=F64, -5.2) "[(0, Term(type=Json, field=14, path=titi, vtype=Str, "5")), (1, Term(type=Json, field=14, path=titi, vtype=Str, "2"))]")"#,
r#"(Term(field=14, type=Json, path=titi, type=F64, -5.2) "[(0, Term(field=14, type=Json, path=titi, type=Str, "5")), (1, Term(field=14, type=Json, path=titi, type=Str, "2"))]")"#,
true,
);
}
@@ -1174,7 +1176,7 @@ mod test {
fn test_json_field_possibly_a_date() {
test_parse_query_to_logical_ast_helper(
r#"json.date:"2019-10-12T07:20:50.52Z""#,
r#"(Term(type=Json, field=14, path=date, vtype=Date, 2019-10-12T07:20:50Z) "[(0, Term(type=Json, field=14, path=date, vtype=Str, "2019")), (1, Term(type=Json, field=14, path=date, vtype=Str, "10")), (2, Term(type=Json, field=14, path=date, vtype=Str, "12t07")), (3, Term(type=Json, field=14, path=date, vtype=Str, "20")), (4, Term(type=Json, field=14, path=date, vtype=Str, "50")), (5, Term(type=Json, field=14, path=date, vtype=Str, "52z"))]")"#,
r#"(Term(field=14, type=Json, path=date, type=Date, 2019-10-12T07:20:50Z) "[(0, Term(field=14, type=Json, path=date, type=Str, "2019")), (1, Term(field=14, type=Json, path=date, type=Str, "10")), (2, Term(field=14, type=Json, path=date, type=Str, "12t07")), (3, Term(field=14, type=Json, path=date, type=Str, "20")), (4, Term(field=14, type=Json, path=date, type=Str, "50")), (5, Term(field=14, type=Json, path=date, type=Str, "52z"))]")"#,
true,
);
}
@@ -1183,7 +1185,7 @@ mod test {
fn test_json_field_possibly_a_bool() {
test_parse_query_to_logical_ast_helper(
"json.titi:true",
r#"(Term(type=Json, field=14, path=titi, vtype=Bool, true) Term(type=Json, field=14, path=titi, vtype=Str, "true"))"#,
r#"(Term(field=14, type=Json, path=titi, type=Bool, true) Term(field=14, type=Json, path=titi, type=Str, "true"))"#,
true,
);
}
@@ -1212,8 +1214,8 @@ mod test {
fn test_json_default() {
test_query_to_logical_ast_with_default_json(
"titi:4",
"(Term(type=Json, field=14, path=titi, vtype=U64, 4) Term(type=Json, field=14, \
path=titi, vtype=Str, \"4\"))",
"(Term(field=14, type=Json, path=titi, type=U64, 4) Term(field=14, type=Json, \
path=titi, type=Str, \"4\"))",
false,
);
}
@@ -1223,7 +1225,7 @@ mod test {
for conjunction in [false, true] {
test_query_to_logical_ast_with_default_json(
"text:4",
r#"Term(type=Str, field=1, "4")"#,
r#"Term(field=1, type=Str, "4")"#,
conjunction,
);
}
@@ -1234,7 +1236,7 @@ mod test {
for conjunction in [false, true] {
test_query_to_logical_ast_with_default_json(
"json:4",
r#"(Term(type=Json, field=14, path=, vtype=U64, 4) Term(type=Json, field=14, path=, vtype=Str, "4"))"#,
r#"(Term(field=14, type=Json, path=, type=U64, 4) Term(field=14, type=Json, path=, type=Str, "4"))"#,
conjunction,
);
}
@@ -1244,7 +1246,7 @@ mod test {
fn test_parse_bytes_phrase() {
test_parse_query_to_logical_ast_helper(
"bytes:\"YnVidQ==\"",
"Term(type=Bytes, field=12, [98, 117, 98, 117])",
"Term(field=12, type=Bytes, [98, 117, 98, 117])",
false,
);
}
@@ -1260,12 +1262,12 @@ mod test {
fn test_parse_query_to_ast_ab_c() {
test_parse_query_to_logical_ast_helper(
"(+title:a +title:b) title:c",
r#"((+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) Term(type=Str, field=0, "c"))"#,
r#"((+Term(field=0, type=Str, "a") +Term(field=0, type=Str, "b")) Term(field=0, type=Str, "c"))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"(+title:a +title:b) title:c",
r#"(+(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) +Term(type=Str, field=0, "c"))"#,
r#"(+(+Term(field=0, type=Str, "a") +Term(field=0, type=Str, "b")) +Term(field=0, type=Str, "c"))"#,
true,
);
}
@@ -1274,17 +1276,17 @@ mod test {
pub fn test_parse_query_to_ast_single_term() {
test_parse_query_to_logical_ast_helper(
"title:toto",
r#"Term(type=Str, field=0, "toto")"#,
r#"Term(field=0, type=Str, "toto")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
r#"Term(type=Str, field=0, "toto")"#,
r#"Term(field=0, type=Str, "toto")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
r#"(+Term(field=0, type=Str, "toto") -(Term(field=0, type=Str, "titi") Term(field=1, type=Str, "titi")))"#,
false,
);
}
@@ -1301,12 +1303,12 @@ mod test {
pub fn test_parse_query_to_ast_two_terms() {
test_parse_query_to_logical_ast_helper(
"title:a b",
r#"(Term(type=Str, field=0, "a") (Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
r#"(Term(field=0, type=Str, "a") (Term(field=0, type=Str, "b") Term(field=1, type=Str, "b")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
r#"title:"a b""#,
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
r#""[(0, Term(field=0, type=Str, "a")), (1, Term(field=0, type=Str, "b"))]""#,
false,
);
}
@@ -1329,37 +1331,37 @@ mod test {
pub fn test_parse_query_to_ast_ranges() {
test_parse_query_to_logical_ast_helper(
"title:[a TO b]",
r#"(Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b")))"#,
r#"(Included(Term(field=0, type=Str, "a")) TO Included(Term(field=0, type=Str, "b")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO toto}",
r#"(Excluded(Term(type=Str, field=0, "titi")) TO Excluded(Term(type=Str, field=0, "toto")))"#,
r#"(Excluded(Term(field=0, type=Str, "titi")) TO Excluded(Term(field=0, type=Str, "toto")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{* TO toto}",
r#"(Unbounded TO Excluded(Term(type=Str, field=0, "toto")))"#,
r#"(Unbounded TO Excluded(Term(field=0, type=Str, "toto")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO *}",
r#"(Excluded(Term(type=Str, field=0, "titi")) TO Unbounded)"#,
r#"(Excluded(Term(field=0, type=Str, "titi")) TO Unbounded)"#,
false,
);
test_parse_query_to_logical_ast_helper(
"signed:{-5 TO 3}",
r#"(Excluded(Term(type=I64, field=2, -5)) TO Excluded(Term(type=I64, field=2, 3)))"#,
r#"(Excluded(Term(field=2, type=I64, -5)) TO Excluded(Term(field=2, type=I64, 3)))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"float:{-1.5 TO 1.5}",
r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#,
r#"(Excluded(Term(field=10, type=F64, -1.5)) TO Excluded(Term(field=10, type=F64, 1.5)))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"u64_ff:[7 TO 77]",
r#"(Included(Term(type=U64, field=18, 7)) TO Included(Term(type=U64, field=18, 77)))"#,
r#"(Included(Term(field=18, type=U64, 7)) TO Included(Term(field=18, type=U64, 77)))"#,
false,
);
}
@@ -1462,12 +1464,12 @@ mod test {
);
test_parse_query_to_logical_ast_helper(
r#"date:"2010-11-21T09:55:06.000000000+02:00""#,
r#"Term(type=Date, field=9, 2010-11-21T07:55:06Z)"#,
r#"Term(field=9, type=Date, 2010-11-21T07:55:06Z)"#,
true,
);
test_parse_query_to_logical_ast_helper(
r#"date:"1985-04-12T23:20:50.52Z""#,
r#"Term(type=Date, field=9, 1985-04-12T23:20:50Z)"#,
r#"Term(field=9, type=Date, 1985-04-12T23:20:50Z)"#,
true,
);
}
@@ -1508,27 +1510,27 @@ mod test {
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper(
"title:toto",
r#"Term(type=Str, field=0, "toto")"#,
r#"Term(field=0, type=Str, "toto")"#,
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
r#"Term(type=Str, field=0, "toto")"#,
r#"Term(field=0, type=Str, "toto")"#,
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
r#"(+Term(field=0, type=Str, "toto") -(Term(field=0, type=Str, "titi") Term(field=1, type=Str, "titi")))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"title:a b",
r#"(+Term(type=Str, field=0, "a") +(Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
r#"(+Term(field=0, type=Str, "a") +(Term(field=0, type=Str, "b") Term(field=1, type=Str, "b")))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
r#""[(0, Term(field=0, type=Str, "a")), (1, Term(field=0, type=Str, "b"))]""#,
true,
);
}
@@ -1537,7 +1539,7 @@ mod test {
pub fn test_query_parser_hyphen() {
test_parse_query_to_logical_ast_helper(
"title:www-form-encoded",
r#""[(0, Term(type=Str, field=0, "www")), (1, Term(type=Str, field=0, "form")), (2, Term(type=Str, field=0, "encoded"))]""#,
r#""[(0, Term(field=0, type=Str, "www")), (1, Term(field=0, type=Str, "form")), (2, Term(field=0, type=Str, "encoded"))]""#,
false,
);
}
@@ -1547,7 +1549,7 @@ mod test {
for &default_conjunction in &[false, true] {
test_parse_query_to_logical_ast_helper(
"title:a AND title:b",
r#"(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b"))"#,
r#"(+Term(field=0, type=Str, "a") +Term(field=0, type=Str, "b"))"#,
default_conjunction,
);
}
@@ -1558,7 +1560,7 @@ mod test {
for &default_conjunction in &[false, true] {
test_parse_query_to_logical_ast_helper(
"title:a OR title:b",
r#"(Term(type=Str, field=0, "a") Term(type=Str, field=0, "b"))"#,
r#"(Term(field=0, type=Str, "a") Term(field=0, type=Str, "b"))"#,
default_conjunction,
);
}
@@ -1573,7 +1575,7 @@ mod test {
let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
assert_eq!(
format!("{:?}", query),
"TermQuery(Term(type=Str, field=0, \"hello\"))"
"TermQuery(Term(field=0, type=Str, \"hello\"))"
);
}
@@ -1614,17 +1616,17 @@ mod test {
pub fn test_phrase_slop() {
test_parse_query_to_logical_ast_helper(
"\"a b\"~0",
r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]" "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]")"#,
r#"("[(0, Term(field=0, type=Str, "a")), (1, Term(field=0, type=Str, "b"))]" "[(0, Term(field=1, type=Str, "a")), (1, Term(field=1, type=Str, "b"))]")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"\"a b\"~2",
r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]"~2 "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]"~2)"#,
r#"("[(0, Term(field=0, type=Str, "a")), (1, Term(field=0, type=Str, "b"))]"~2 "[(0, Term(field=1, type=Str, "a")), (1, Term(field=1, type=Str, "b"))]"~2)"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b~4\"~2",
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b")), (2, Term(type=Str, field=0, "4"))]"~2"#,
r#""[(0, Term(field=0, type=Str, "a")), (1, Term(field=0, type=Str, "b")), (2, Term(field=0, type=Str, "4"))]"~2"#,
false,
);
}
@@ -1633,23 +1635,23 @@ mod test {
pub fn test_term_set_query() {
test_parse_query_to_logical_ast_helper(
"title: IN [a b cd]",
r#"IN [Term(type=Str, field=0, "a"), Term(type=Str, field=0, "b"), Term(type=Str, field=0, "cd")]"#,
r#"IN [Term(field=0, type=Str, "a"), Term(field=0, type=Str, "b"), Term(field=0, type=Str, "cd")]"#,
false,
);
test_parse_query_to_logical_ast_helper(
"bytes: IN [AA== ABA= ABCD]",
r#"IN [Term(type=Bytes, field=12, [0]), Term(type=Bytes, field=12, [0, 16]), Term(type=Bytes, field=12, [0, 16, 131])]"#,
r#"IN [Term(field=12, type=Bytes, [0]), Term(field=12, type=Bytes, [0, 16]), Term(field=12, type=Bytes, [0, 16, 131])]"#,
false,
);
test_parse_query_to_logical_ast_helper(
"signed: IN [1 2 -3]",
r#"IN [Term(type=I64, field=2, 1), Term(type=I64, field=2, 2), Term(type=I64, field=2, -3)]"#,
r#"IN [Term(field=2, type=I64, 1), Term(field=2, type=I64, 2), Term(field=2, type=I64, -3)]"#,
false,
);
test_parse_query_to_logical_ast_helper(
"float: IN [1.1 2.2 -3.3]",
r#"IN [Term(type=F64, field=10, 1.1), Term(type=F64, field=10, 2.2), Term(type=F64, field=10, -3.3)]"#,
r#"IN [Term(field=10, type=F64, 1.1), Term(field=10, type=F64, 2.2), Term(field=10, type=F64, -3.3)]"#,
false,
);
}
@@ -1667,9 +1669,9 @@ mod test {
let query = query_parser.parse_query("abc").unwrap();
assert_eq!(
format!("{:?}", query),
"BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(type=Str, \
field=0, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
(Should, TermQuery(Term(type=Str, field=1, \"abc\")))] }"
"BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(field=0, \
type=Str, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))] }"
);
}
@@ -1684,8 +1686,8 @@ mod test {
let query = query_parser.parse_query("abc").unwrap();
assert_eq!(
format!("{:?}", query),
"BooleanQuery { subqueries: [(Should, TermQuery(Term(type=Str, field=0, \
\"abc\"))), (Should, FuzzyTermQuery { term: Term(type=Str, field=1, \"abc\"), \
"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \
\"abc\"))), (Should, FuzzyTermQuery { term: Term(field=1, type=Str, \"abc\"), \
distance: 2, transposition_cost_one: false, prefix: true })] }"
);
}
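As a usage note, the reordered Debug layout is what query debugging now prints end to end; a small sketch assuming an in-RAM index with a single text field (mirroring the updated expectations above):

use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let query_parser = QueryParser::for_index(&index, vec![title]);

    let query = query_parser.parse_query("title:hello").unwrap();
    // The field id now comes before the value type in the Debug output.
    assert_eq!(
        format!("{:?}", query),
        r#"TermQuery(Term(field=0, type=Str, "hello"))"#
    );
}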

View File

@@ -85,7 +85,7 @@ impl RangeQuery {
left_bound: &Bound<Term>,
right_bound: &Bound<Term>,
) -> RangeQuery {
let verify_and_unwrap_term = |val: &Term| val.value_bytes().to_owned();
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
RangeQuery {
field,
value_type,
@@ -121,7 +121,7 @@ impl RangeQuery {
) -> RangeQuery {
let make_term_val = |val: &i64| {
Term::from_field_i64(Field::from_field_id(0), *val)
.value_bytes()
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
@@ -159,7 +159,7 @@ impl RangeQuery {
) -> RangeQuery {
let make_term_val = |val: &f64| {
Term::from_field_f64(Field::from_field_id(0), *val)
.value_bytes()
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
@@ -185,7 +185,7 @@ impl RangeQuery {
) -> RangeQuery {
let make_term_val = |val: &u64| {
Term::from_field_u64(Field::from_field_id(0), *val)
.value_bytes()
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
@@ -208,7 +208,7 @@ impl RangeQuery {
) -> RangeQuery {
let make_term_val = |val: &Ipv6Addr| {
Term::from_field_ip_addr(Field::from_field_id(0), *val)
.value_bytes()
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
@@ -246,7 +246,7 @@ impl RangeQuery {
) -> RangeQuery {
let make_term_val = |val: &DateTime| {
Term::from_field_date(Field::from_field_id(0), *val)
.value_bytes()
.serialized_value_bytes()
.to_owned()
};
RangeQuery {

View File

@@ -47,8 +47,12 @@ impl TermSetQuery {
// In practice this won't fail because:
// - we are writing to memory, so no IoError
// - Terms are ordered
let map = Map::from_iter(sorted_terms.iter().map(|key| (key.value_bytes(), 0)))
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
let map = Map::from_iter(
sorted_terms
.iter()
.map(|key| (key.serialized_value_bytes(), 0)),
)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
sub_queries.push((
Occur::Should,

View File

@@ -175,7 +175,7 @@ mod tests {
);
assert_eq!(
format!("{:?}", term_query),
r#"TermQuery(Term(type=Str, field=1, "hello"))"#
r#"TermQuery(Term(field=1, type=Str, "hello"))"#
);
}

View File

@@ -5,6 +5,9 @@ use serde::{Deserialize, Serialize};
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
/// The precision of the indexed date/time values in the inverted index.
pub const DATE_TIME_PRECISION_INDEXED: DatePrecision = DatePrecision::Seconds;
/// Defines how a DateTime field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct DateOptions {
@@ -85,7 +88,8 @@ impl DateOptions {
self
}
/// Sets the precision for this DateTime field.
/// Sets the precision for this DateTime field on the fast field.
/// Indexed precision is always [`DATE_TIME_PRECISION_INDEXED`].
///
/// Internal storage precision, used to optimize storage
/// compression on fast fields.
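To make the indexed/fast-field split concrete: indexed date terms are always truncated to DATE_TIME_PRECISION_INDEXED (seconds), whatever precision the fast field is configured with. A sketch, assuming tantivy's DateTime::from_timestamp_micros constructor:

use tantivy::schema::{Schema, Term, INDEXED};
use tantivy::DateTime;

fn main() {
    let mut schema_builder = Schema::builder();
    let occurred_at = schema_builder.add_date_field("occurred_at", INDEXED);
    let _schema = schema_builder.build();

    // Two instants inside the same second, differing only in microseconds.
    let t1 = DateTime::from_timestamp_micros(1_000_000_123);
    let t2 = DateTime::from_timestamp_micros(1_000_000_456);

    // Term::from_field_date truncates to DATE_TIME_PRECISION_INDEXED (seconds),
    // so both instants produce the same indexed term.
    assert_eq!(
        Term::from_field_date(occurred_at, t1),
        Term::from_field_date(occurred_at, t2)
    );
}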

View File

@@ -129,7 +129,7 @@ mod value;
use columnar::ColumnType;
pub use self::bytes_options::BytesOptions;
pub use self::date_time_options::{DateOptions, DatePrecision};
pub use self::date_time_options::{DateOptions, DatePrecision, DATE_TIME_PRECISION_INDEXED};
pub use self::document::Document;
pub(crate) use self::facet::FACET_SEP_BYTE;
pub use self::facet::{Facet, FacetParseError};
@@ -147,7 +147,7 @@ pub use self::named_field_document::NamedFieldDocument;
pub use self::numeric_options::IntOptions;
pub use self::numeric_options::NumericOptions;
pub use self::schema::{DocParsingError, Schema, SchemaBuilder};
pub use self::term::Term;
pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
pub use self::value::Value;

View File

@@ -5,10 +5,11 @@ use std::{fmt, str};
use columnar::MonotonicallyMappableToU128;
use super::date_time_options::DATE_TIME_PRECISION_INDEXED;
use super::Field;
use crate::fastfield::FastValue;
use crate::schema::{Facet, Type};
use crate::{DatePrecision, DateTime};
use crate::DateTime;
/// Separates the different segments of a json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
@@ -20,8 +21,12 @@ pub const JSON_PATH_SEGMENT_SEP_STR: &str =
pub const JSON_END_OF_PATH: u8 = 0u8;
/// Term represents the value that the token can take.
/// It's a serialized representation over different types.
///
/// It actually wraps a `Vec<u8>`.
/// It actually wraps a `Vec<u8>`. The first 5 bytes are metadata:
/// 4 bytes for the field id, and the last byte for the value type.
///
/// The serialized value `ValueBytes` is everything after the first 4 bytes (the field id).
#[derive(Clone)]
pub struct Term<B = Vec<u8>>(B)
where B: AsRef<[u8]>;
@@ -100,7 +105,7 @@ impl Term {
/// Builds a term given a field, and a `DateTime` value
pub fn from_field_date(field: Field, val: DateTime) -> Term {
Term::from_fast_value(field, &val.truncate(DatePrecision::Seconds))
Term::from_fast_value(field, &val.truncate(DATE_TIME_PRECISION_INDEXED))
}
/// Creates a `Term` given a facet.
@@ -186,11 +191,6 @@ impl Term {
self.0.truncate(len + TERM_METADATA_LENGTH);
}
/// Returns the value bytes as mutable slice
pub fn value_bytes_mut(&mut self) -> &mut [u8] {
&mut self.0[TERM_METADATA_LENGTH..]
}
/// The length of the bytes.
pub fn len_bytes(&self) -> usize {
self.0.len() - TERM_METADATA_LENGTH
@@ -206,44 +206,25 @@ impl Term {
&mut self.0[len_before..]
}
/// Appends a single byte to the term.
/// Appends a JSON_PATH_SEGMENT_SEP to the term.
/// Only used for JSON type.
#[inline]
pub fn push_byte(&mut self, byte: u8) {
self.0.push(byte);
pub fn add_json_path_separator(&mut self) {
self.0.push(JSON_PATH_SEGMENT_SEP);
}
}
impl<B> Ord for Term<B>
where B: AsRef<[u8]>
{
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.as_slice().cmp(other.as_slice())
/// Sets the current end to JSON_END_OF_PATH.
/// Only used for JSON type.
#[inline]
pub fn set_json_path_end(&mut self) {
let buffer_len = self.0.len();
self.0[buffer_len - 1] = JSON_END_OF_PATH;
}
}
impl<B> PartialOrd for Term<B>
where B: AsRef<[u8]>
{
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl<B> PartialEq for Term<B>
where B: AsRef<[u8]>
{
fn eq(&self, other: &Self) -> bool {
self.as_slice() == other.as_slice()
}
}
impl<B> Eq for Term<B> where B: AsRef<[u8]> {}
impl<B> Hash for Term<B>
where B: AsRef<[u8]>
{
fn hash<H: Hasher>(&self, state: &mut H) {
self.0.as_ref().hash(state)
/// Sets the current end to JSON_PATH_SEGMENT_SEP.
/// Only used for JSON type.
#[inline]
pub fn set_json_path_separator(&mut self) {
let buffer_len = self.0.len();
self.0[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
}
}
@@ -255,11 +236,68 @@ where B: AsRef<[u8]>
Term(data)
}
/// Return the type of the term.
pub fn typ(&self) -> Type {
self.value().typ()
}
/// Returns the field.
pub fn field(&self) -> Field {
let field_id_bytes: [u8; 4] = (&self.0.as_ref()[..4]).try_into().unwrap();
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
}
/// Returns the serialized representation of the value.
/// (this includes neither the field id nor the value type.)
///
/// If the term is a string, its value is utf-8 encoded.
/// If the term is a u64, its value is encoded according
/// to `byteorder::BigEndian`.
pub fn serialized_value_bytes(&self) -> &[u8] {
&self.0.as_ref()[TERM_METADATA_LENGTH..]
}
/// Returns the value of the term as a `ValueBytes` view:
/// the value type and value, or, for a JSON term, the path plus value. (This does not include the field.)
pub fn value(&self) -> ValueBytes<&[u8]> {
ValueBytes::wrap(&self.0.as_ref()[4..])
}
/// Returns the serialized representation of Term.
/// This includes field_id, value type and value.
///
/// Do NOT rely on this byte representation in the index.
/// This value is likely to change in the future.
pub fn serialized_term(&self) -> &[u8] {
self.0.as_ref()
}
}
/// ValueBytes represents a serialized value.
/// The value can be of any [`Type`] (e.g. string, u64, f64, bool, date, JSON).
/// The serialized representation preserves the natural ordering of the type under
/// lexicographical byte comparison.
///
/// The `ValueBytes` format is as follows:
/// `[type code: u8][serialized value]`
///
/// For JSON, `ValueBytes` is:
/// `[type code=JSON][JSON path][JSON_END_OF_PATH][ValueBytes]`
///
/// The nested `ValueBytes` in JSON is never of type JSON (there is no recursion).
#[derive(Clone)]
pub struct ValueBytes<B>(B)
where B: AsRef<[u8]>;
impl<B> ValueBytes<B>
where B: AsRef<[u8]>
{
/// Wraps an object holding bytes.
pub fn wrap(data: B) -> ValueBytes<B> {
ValueBytes(data)
}
fn typ_code(&self) -> u8 {
*self
.as_slice()
.get(4)
.expect("the byte representation is too short")
self.0.as_ref()[0]
}
/// Return the type of the term.
@@ -267,13 +305,6 @@ where B: AsRef<[u8]>
Type::from_code(self.typ_code()).expect("The term has an invalid type code")
}
/// Returns the field.
pub fn field(&self) -> Field {
let mut field_id_bytes = [0u8; 4];
field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]);
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
}
/// Returns the `u64` value stored in a term.
///
/// Returns `None` if the term is not of the u64 type, or if the term byte representation
@@ -286,13 +317,8 @@ where B: AsRef<[u8]>
if self.typ() != T::to_type() {
return None;
}
let mut value_bytes = [0u8; 8];
let bytes = self.value_bytes();
if bytes.len() != 8 {
return None;
}
value_bytes.copy_from_slice(self.value_bytes());
let value_u64 = u64::from_be_bytes(value_bytes);
let value_bytes = self.value_bytes();
let value_u64 = u64::from_be_bytes(value_bytes.try_into().ok()?);
Some(T::from_u64(value_u64))
}
@@ -361,23 +387,133 @@ where B: AsRef<[u8]>
Some(self.value_bytes())
}
/// Returns the serialized value of the term.
/// (this does not include the field.)
///
/// If the term is a string, its value is utf-8 encoded.
/// If the term is a u64, its value is encoded according
/// to `byteorder::BigEndian`.
pub fn value_bytes(&self) -> &[u8] {
&self.0.as_ref()[TERM_METADATA_LENGTH..]
/// Returns a `Ipv6Addr` value from the term.
pub fn as_ip_addr(&self) -> Option<Ipv6Addr> {
if self.typ() != Type::IpAddr {
return None;
}
let ip_u128 = u128::from_be_bytes(self.value_bytes().try_into().ok()?);
Some(Ipv6Addr::from_u128(ip_u128))
}
/// Returns the underlying `&[u8]`.
/// Returns the json path (without non-human friendly separators),
/// and the encoded ValueBytes after the json path.
///
/// Returns `None` if the value is not JSON.
pub(crate) fn as_json(&self) -> Option<(&str, ValueBytes<&[u8]>)> {
if self.typ() != Type::Json {
return None;
}
let bytes = self.value_bytes();
let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?;
let (json_path_bytes, term) = bytes.split_at(pos);
let json_path = str::from_utf8(json_path_bytes).ok()?;
Some((json_path, ValueBytes::wrap(&term[1..])))
}
/// Returns the encoded ValueBytes after the json path.
///
/// Returns `None` if the value is not JSON.
pub(crate) fn as_json_value_bytes(&self) -> Option<ValueBytes<&[u8]>> {
if self.typ() != Type::Json {
return None;
}
let bytes = self.value_bytes();
let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?;
Some(ValueBytes::wrap(&bytes[pos + 1..]))
}
/// Returns the serialized value of ValueBytes without the type.
fn value_bytes(&self) -> &[u8] {
&self.0.as_ref()[1..]
}
/// Returns the serialized representation of Term.
///
/// Do NOT rely on this byte representation in the index.
/// This value is likely to change in the future.
pub fn as_slice(&self) -> &[u8] {
pub fn as_serialized(&self) -> &[u8] {
self.0.as_ref()
}
fn debug_value_bytes(&self, f: &mut fmt::Formatter) -> fmt::Result {
let typ = self.typ();
write!(f, "type={typ:?}, ")?;
match typ {
Type::Str => {
let s = self.as_str();
write_opt(f, s)?;
}
Type::U64 => {
write_opt(f, self.as_u64())?;
}
Type::I64 => {
write_opt(f, self.as_i64())?;
}
Type::F64 => {
write_opt(f, self.as_f64())?;
}
Type::Bool => {
write_opt(f, self.as_bool())?;
}
// TODO pretty print these types too.
Type::Date => {
write_opt(f, self.as_date())?;
}
Type::Facet => {
write_opt(f, self.as_facet())?;
}
Type::Bytes => {
write_opt(f, self.as_bytes())?;
}
Type::Json => {
if let Some((path, sub_value_bytes)) = self.as_json() {
let path_pretty = path.replace(JSON_PATH_SEGMENT_SEP_STR, ".");
write!(f, "path={path_pretty}, ")?;
sub_value_bytes.debug_value_bytes(f)?;
}
}
Type::IpAddr => {
write_opt(f, self.as_ip_addr())?;
}
}
Ok(())
}
}
impl<B> Ord for Term<B>
where B: AsRef<[u8]>
{
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.serialized_term().cmp(other.serialized_term())
}
}
impl<B> PartialOrd for Term<B>
where B: AsRef<[u8]>
{
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl<B> PartialEq for Term<B>
where B: AsRef<[u8]>
{
fn eq(&self, other: &Self) -> bool {
self.serialized_term() == other.serialized_term()
}
}
impl<B> Eq for Term<B> where B: AsRef<[u8]> {}
impl<B> Hash for Term<B>
where B: AsRef<[u8]>
{
fn hash<H: Hasher>(&self, state: &mut H) {
self.0.as_ref().hash(state)
}
}
fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) -> fmt::Result {
@@ -387,80 +523,14 @@ fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) ->
Ok(())
}
fn as_str(value_bytes: &[u8]) -> Option<&str> {
std::str::from_utf8(value_bytes).ok()
}
fn get_fast_type<T: FastValue>(bytes: &[u8]) -> Option<T> {
let value_u64 = u64::from_be_bytes(bytes.try_into().ok()?);
Some(T::from_u64(value_u64))
}
/// Returns the json path (without non-human friendly separators, the type of the value, and the
/// value bytes). Returns `None` if the value is not JSON or is not valid.
pub(crate) fn as_json_path_type_value_bytes(bytes: &[u8]) -> Option<(&str, Type, &[u8])> {
let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?;
let json_path = str::from_utf8(&bytes[..pos]).ok()?;
let type_code = *bytes.get(pos + 1)?;
let typ = Type::from_code(type_code)?;
Some((json_path, typ, &bytes[pos + 2..]))
}
fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Result {
match typ {
Type::Str => {
let s = as_str(bytes);
write_opt(f, s)?;
}
Type::U64 => {
write_opt(f, get_fast_type::<u64>(bytes))?;
}
Type::I64 => {
write_opt(f, get_fast_type::<i64>(bytes))?;
}
Type::F64 => {
write_opt(f, get_fast_type::<f64>(bytes))?;
}
Type::Bool => {
write_opt(f, get_fast_type::<bool>(bytes))?;
}
// TODO pretty print these types too.
Type::Date => {
write_opt(f, get_fast_type::<DateTime>(bytes))?;
}
Type::Facet => {
let facet_str = str::from_utf8(bytes)
.ok()
.map(ToString::to_string)
.map(Facet::from_encoded_string)
.map(|facet| facet.to_path_string());
write_opt(f, facet_str)?;
}
Type::Bytes => {
write_opt(f, Some(bytes))?;
}
Type::Json => {
if let Some((path, typ, bytes)) = as_json_path_type_value_bytes(bytes) {
let path_pretty = path.replace(JSON_PATH_SEGMENT_SEP_STR, ".");
write!(f, "path={path_pretty}, vtype={typ:?}, ")?;
debug_value_bytes(typ, bytes, f)?;
}
}
Type::IpAddr => {
write!(f, "")?; // TODO change once we actually have IP address terms.
}
}
Ok(())
}
impl<B> fmt::Debug for Term<B>
where B: AsRef<[u8]>
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let field_id = self.field().field_id();
let typ = self.typ();
write!(f, "Term(type={typ:?}, field={field_id}, ")?;
debug_value_bytes(typ, self.value_bytes(), f)?;
write!(f, "Term(field={field_id}, ")?;
let value_bytes = ValueBytes::wrap(&self.0.as_ref()[4..]);
value_bytes.debug_value_bytes(f)?;
write!(f, ")",)?;
Ok(())
}
@@ -479,7 +549,7 @@ mod tests {
let term = Term::from_field_text(title_field, "test");
assert_eq!(term.field(), title_field);
assert_eq!(term.typ(), Type::Str);
assert_eq!(term.as_str(), Some("test"))
assert_eq!(term.value().as_str(), Some("test"))
}
/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
@@ -501,8 +571,8 @@ mod tests {
let term = Term::from_field_u64(count_field, 983u64);
assert_eq!(term.field(), count_field);
assert_eq!(term.typ(), Type::U64);
assert_eq!(term.as_slice().len(), FAST_VALUE_TERM_LEN);
assert_eq!(term.as_u64(), Some(983u64))
assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN);
assert_eq!(term.value().as_u64(), Some(983u64))
}
#[test]
@@ -512,7 +582,7 @@ mod tests {
let term = Term::from_field_bool(bool_field, true);
assert_eq!(term.field(), bool_field);
assert_eq!(term.typ(), Type::Bool);
assert_eq!(term.as_slice().len(), FAST_VALUE_TERM_LEN);
assert_eq!(term.as_bool(), Some(true))
assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN);
assert_eq!(term.value().as_bool(), Some(true))
}
}
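A short sketch of the ValueBytes view and the reordered Debug output introduced in this file, using only the accessors shown above (field id 0 because count is the first field added):

use tantivy::schema::{Schema, Term, Type, INDEXED};

fn main() {
    let mut schema_builder = Schema::builder();
    let count = schema_builder.add_u64_field("count", INDEXED);
    let _schema = schema_builder.build();

    let term = Term::from_field_u64(count, 983u64);

    // value() exposes the ValueBytes view: the type byte plus the value bytes.
    let value = term.value();
    assert_eq!(value.typ(), Type::U64);
    assert_eq!(value.as_u64(), Some(983u64));

    // Debug now prints the field first, then the value type.
    assert_eq!(format!("{:?}", term), "Term(field=0, type=U64, 983)");
}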

View File

@@ -310,7 +310,8 @@ impl SnippetGenerator {
});
let mut terms_text: BTreeMap<String, Score> = Default::default();
for term in terms {
let term_str = if let Some(term_str) = term.as_str() {
let term_value = term.value();
let term_str = if let Some(term_str) = term_value.as_str() {
term_str
} else {
continue;