Merge pull request #1614 from quickwit-oss/remove_superfluous_steps

refactor Term
This commit is contained in:
PSeitz
2022-10-17 18:25:31 +08:00
committed by GitHub
5 changed files with 117 additions and 112 deletions

View File

@@ -242,10 +242,12 @@ pub(crate) fn set_string_and_get_terms(
) -> Vec<(usize, Term)> {
let mut positions_and_terms = Vec::<(usize, Term)>::new();
json_term_writer.close_path_and_set_type(Type::Str);
let term_num_bytes = json_term_writer.term_buffer.as_slice().len();
let term_num_bytes = json_term_writer.term_buffer.len_bytes();
let mut token_stream = text_analyzer.token_stream(value);
token_stream.process(&mut |token| {
json_term_writer.term_buffer.truncate(term_num_bytes);
json_term_writer
.term_buffer
.truncate_value_bytes(term_num_bytes);
json_term_writer
.term_buffer
.append_bytes(token.text.as_bytes());
@@ -265,7 +267,7 @@ impl<'a> JsonTermWriter<'a> {
json_path: &str,
term_buffer: &'a mut Term,
) -> Self {
term_buffer.set_field(Type::Json, field);
term_buffer.set_field_and_type(field, Type::Json);
let mut json_term_writer = Self::wrap(term_buffer);
for segment in json_path.split('.') {
json_term_writer.push_path_segment(segment);
@@ -276,7 +278,7 @@ impl<'a> JsonTermWriter<'a> {
pub fn wrap(term_buffer: &'a mut Term) -> Self {
term_buffer.clear_with_type(Type::Json);
let mut path_stack = Vec::with_capacity(10);
path_stack.push(5);
path_stack.push(0);
Self {
term_buffer,
path_stack,
@@ -285,28 +287,28 @@ impl<'a> JsonTermWriter<'a> {
fn trim_to_end_of_path(&mut self) {
let end_of_path = *self.path_stack.last().unwrap();
self.term_buffer.truncate(end_of_path);
self.term_buffer.truncate_value_bytes(end_of_path);
}
pub fn close_path_and_set_type(&mut self, typ: Type) {
self.trim_to_end_of_path();
let buffer = self.term_buffer.as_mut();
let buffer = self.term_buffer.value_bytes_mut();
let buffer_len = buffer.len();
buffer[buffer_len - 1] = JSON_END_OF_PATH;
buffer.push(typ.to_code());
self.term_buffer.append_bytes(&[typ.to_code()]);
}
pub fn push_path_segment(&mut self, segment: &str) {
// the path stack should never be empty.
self.trim_to_end_of_path();
let buffer = self.term_buffer.as_mut();
let buffer = self.term_buffer.value_bytes_mut();
let buffer_len = buffer.len();
if self.path_stack.len() > 1 {
buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
}
buffer.extend(segment.as_bytes());
buffer.push(JSON_PATH_SEGMENT_SEP);
self.path_stack.push(buffer.len());
self.term_buffer.append_bytes(segment.as_bytes());
self.term_buffer.append_bytes(&[JSON_PATH_SEGMENT_SEP]);
self.path_stack.push(self.term_buffer.len_bytes());
}
pub fn pop_path_segment(&mut self) {
@@ -318,8 +320,8 @@ impl<'a> JsonTermWriter<'a> {
/// Returns the json path of the term being currently built.
#[cfg(test)]
pub(crate) fn path(&self) -> &[u8] {
let end_of_path = self.path_stack.last().cloned().unwrap_or(6);
&self.term().as_slice()[5..end_of_path - 1]
let end_of_path = self.path_stack.last().cloned().unwrap_or(1);
&self.term().value_bytes()[..end_of_path - 1]
}
pub fn set_fast_value<T: FastValue>(&mut self, val: T) {
@@ -332,14 +334,13 @@ impl<'a> JsonTermWriter<'a> {
val.to_u64()
};
self.term_buffer
.as_mut()
.extend_from_slice(value.to_be_bytes().as_slice());
.append_bytes(value.to_be_bytes().as_slice());
}
#[cfg(test)]
pub(crate) fn set_str(&mut self, text: &str) {
self.close_path_and_set_type(Type::Str);
self.term_buffer.as_mut().extend_from_slice(text.as_bytes());
self.term_buffer.append_bytes(text.as_bytes());
}
pub fn term(&self) -> &Term {
@@ -356,8 +357,7 @@ mod tests {
#[test]
fn test_json_writer() {
let field = Field::from_field_id(1);
let mut term = Term::new();
term.set_field(Type::Json, field);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("attributes");
json_writer.push_path_segment("color");
@@ -391,8 +391,7 @@ mod tests {
#[test]
fn test_string_term() {
let field = Field::from_field_id(1);
let mut term = Term::new();
term.set_field(Type::Json, field);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_str("red");
@@ -405,8 +404,7 @@ mod tests {
#[test]
fn test_i64_term() {
let field = Field::from_field_id(1);
let mut term = Term::new();
term.set_field(Type::Json, field);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_fast_value(-4i64);
@@ -419,8 +417,7 @@ mod tests {
#[test]
fn test_u64_term() {
let field = Field::from_field_id(1);
let mut term = Term::new();
term.set_field(Type::Json, field);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_fast_value(4u64);
@@ -433,8 +430,7 @@ mod tests {
#[test]
fn test_f64_term() {
let field = Field::from_field_id(1);
let mut term = Term::new();
term.set_field(Type::Json, field);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_fast_value(4.0f64);
@@ -447,8 +443,7 @@ mod tests {
#[test]
fn test_bool_term() {
let field = Field::from_field_id(1);
let mut term = Term::new();
term.set_field(Type::Json, field);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_fast_value(true);
@@ -461,8 +456,7 @@ mod tests {
#[test]
fn test_push_after_set_path_segment() {
let field = Field::from_field_id(1);
let mut term = Term::new();
term.set_field(Type::Json, field);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("attribute");
json_writer.set_str("something");
@@ -477,8 +471,7 @@ mod tests {
#[test]
fn test_pop_segment() {
let field = Field::from_field_id(1);
let mut term = Term::new();
term.set_field(Type::Json, field);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.push_path_segment("hue");
@@ -493,8 +486,7 @@ mod tests {
#[test]
fn test_json_writer_path() {
let field = Field::from_field_id(1);
let mut term = Term::new();
term.set_field(Type::Json, field);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
assert_eq!(json_writer.path(), b"color");

View File

@@ -114,7 +114,7 @@ impl SegmentWriter {
fast_field_writers: FastFieldsWriter::from_schema(&schema),
doc_opstamps: Vec::with_capacity(1_000),
per_field_text_analyzers,
term_buffer: Term::new(),
term_buffer: Term::with_capacity(16),
schema,
})
}
@@ -178,7 +178,7 @@ impl SegmentWriter {
let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx);
let postings_writer: &mut dyn PostingsWriter =
self.per_field_postings_writers.get_for_field_mut(field);
term_buffer.set_field(field_entry.field_type().value_type(), field);
term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field);
match *field_entry.field_type() {
FieldType::Facet(_) => {
@@ -220,7 +220,7 @@ impl SegmentWriter {
}
};
assert_eq!(term_buffer.as_slice().len(), 5);
assert!(term_buffer.is_empty());
postings_writer.index_text(
doc_id,
&mut *token_stream,
@@ -543,8 +543,7 @@ mod tests {
let inv_idx = segment_reader.inverted_index(json_field).unwrap();
let term_dict = inv_idx.terms();
let mut term = Term::new();
term.set_field(Type::Json, json_field);
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut term_stream = term_dict.stream().unwrap();
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
@@ -637,8 +636,7 @@ mod tests {
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inv_index = segment_reader.inverted_index(json_field).unwrap();
let mut term = Term::new();
term.set_field(Type::Json, json_field);
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
json_term_writer.push_path_segment("mykey");
json_term_writer.set_str("token");
@@ -682,8 +680,7 @@ mod tests {
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inv_index = segment_reader.inverted_index(json_field).unwrap();
let mut term = Term::new();
term.set_field(Type::Json, json_field);
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
json_term_writer.push_path_segment("mykey");
json_term_writer.set_str("two tokens");
@@ -728,8 +725,7 @@ mod tests {
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let mut term = Term::new();
term.set_field(Type::Json, json_field);
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
json_term_writer.push_path_segment("mykey");
json_term_writer.push_path_segment("field");

View File

@@ -153,7 +153,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
indexing_position: &mut IndexingPosition,
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
) {
let end_of_path_idx = term_buffer.as_slice().len();
let end_of_path_idx = term_buffer.len_bytes();
let mut num_tokens = 0;
let mut end_position = 0;
token_stream.process(&mut |token: &Token| {
@@ -167,7 +167,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
);
return;
}
term_buffer.truncate(end_of_path_idx);
term_buffer.truncate_value_bytes(end_of_path_idx);
term_buffer.append_bytes(token.text.as_bytes());
let start_position = indexing_position.end_position + token.position as u32;
end_position = start_position + token.position_length as u32;
@@ -181,7 +181,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
indexing_position.end_position = end_position + POSITION_GAP;
indexing_position.num_tokens += num_tokens;
term_buffer.truncate(end_of_path_idx);
term_buffer.truncate_value_bytes(end_of_path_idx);
}
fn total_num_tokens(&self) -> u64;

View File

@@ -734,7 +734,7 @@ fn generate_literals_for_json_object(
index_record_option: IndexRecordOption,
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
let mut logical_literals = Vec::new();
let mut term = Term::new();
let mut term = Term::with_capacity(100);
let mut json_term_writer =
JsonTermWriter::from_field_and_json_path(field, json_path, &mut term);
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {

View File

@@ -7,18 +7,6 @@ use crate::fastfield::FastValue;
use crate::schema::{Facet, Type};
use crate::{DatePrecision, DateTime};
/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
/// <field> + <type byte> + <value len>
///
/// - <field> is a big endian encoded u32 field id
/// - <type_byte>'s most significant bit expresses whether the term is a json term or not
/// The remaining 7 bits are used to encode the type of the value.
/// If this is a JSON term, the type is the type of the leaf of the json.
///
/// - <value> is, if this is not the json term, a binary representation specific to the type.
/// If it is a JSON Term, then it is prepended with the path that leads to this leaf value.
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
/// Separates the different segments of
/// the json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
@@ -36,24 +24,50 @@ pub const JSON_END_OF_PATH: u8 = 0u8;
pub struct Term<B = Vec<u8>>(B)
where B: AsRef<[u8]>;
impl AsMut<Vec<u8>> for Term {
fn as_mut(&mut self) -> &mut Vec<u8> {
&mut self.0
}
}
/// The number of bytes used as metadata by `Term`.
const TERM_METADATA_LENGTH: usize = 5;
impl Term {
pub(crate) fn new() -> Term {
Term(Vec::with_capacity(100))
pub(crate) fn with_capacity(capacity: usize) -> Term {
let mut data = Vec::with_capacity(TERM_METADATA_LENGTH + capacity);
data.resize(TERM_METADATA_LENGTH, 0u8);
Term(data)
}
pub(crate) fn with_type_and_field(typ: Type, field: Field) -> Term {
let mut term = Self::with_capacity(8);
term.set_field_and_type(field, typ);
term
}
fn with_bytes_and_field_and_payload(typ: Type, field: Field, bytes: &[u8]) -> Term {
let mut term = Self::with_capacity(bytes.len());
term.set_field_and_type(field, typ);
term.0.extend_from_slice(bytes);
term
}
fn from_fast_value<T: FastValue>(field: Field, val: &T) -> Term {
let mut term = Term(vec![0u8; FAST_VALUE_TERM_LEN]);
term.set_field(T::to_type(), field);
let mut term = Self::with_type_and_field(T::to_type(), field);
term.set_u64(val.to_u64());
term
}
/// Panics when the term is not empty... ie: some value is set.
/// Use `clear_with_field_and_type` in that case.
///
/// Sets field and the type.
pub(crate) fn set_field_and_type(&mut self, field: Field, typ: Type) {
assert!(self.is_empty());
self.0[0..4].clone_from_slice(field.field_id().to_be_bytes().as_ref());
self.0[4] = typ.to_code();
}
/// Is empty if there are no value bytes.
pub fn is_empty(&self) -> bool {
self.0.len() == TERM_METADATA_LENGTH
}
/// Builds a term given a field, and a `u64`-value
pub fn from_field_u64(field: Field, val: u64) -> Term {
Term::from_fast_value(field, &val)
@@ -82,31 +96,29 @@ impl Term {
/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let facet_encoded_str = facet.encoded_str();
Term::create_bytes_term(Type::Facet, field, facet_encoded_str.as_bytes())
Term::with_bytes_and_field_and_payload(Type::Facet, field, facet_encoded_str.as_bytes())
}
/// Builds a term given a field, and a string value
pub fn from_field_text(field: Field, text: &str) -> Term {
Term::create_bytes_term(Type::Str, field, text.as_bytes())
}
fn create_bytes_term(typ: Type, field: Field, bytes: &[u8]) -> Term {
let mut term = Term(vec![0u8; 5 + bytes.len()]);
term.set_field(typ, field);
term.0.extend_from_slice(bytes);
term
Term::with_bytes_and_field_and_payload(Type::Str, field, text.as_bytes())
}
/// Builds a term bytes.
pub fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
Term::create_bytes_term(Type::Bytes, field, bytes)
Term::with_bytes_and_field_and_payload(Type::Bytes, field, bytes)
}
pub(crate) fn set_field(&mut self, typ: Type, field: Field) {
self.0.clear();
self.0
.extend_from_slice(field.field_id().to_be_bytes().as_ref());
self.0.push(typ.to_code());
/// Removes the value_bytes and set the field and type code.
pub(crate) fn clear_with_field_and_type(&mut self, typ: Type, field: Field) {
self.truncate_value_bytes(0);
self.set_field_and_type(field, typ);
}
/// Removes the value_bytes and set the type code.
pub fn clear_with_type(&mut self, typ: Type) {
self.truncate_value_bytes(0);
self.0[4] = typ.to_code();
}
/// Sets a u64 value in the term.
@@ -117,12 +129,6 @@ impl Term {
/// the natural order of the values.
pub fn set_u64(&mut self, val: u64) {
self.set_fast_value(val);
self.set_bytes(val.to_be_bytes().as_ref());
}
fn set_fast_value<T: FastValue>(&mut self, val: T) {
self.0.resize(FAST_VALUE_TERM_LEN, 0u8);
self.set_bytes(val.to_u64().to_be_bytes().as_ref());
}
/// Sets a `i64` value in the term.
@@ -145,9 +151,13 @@ impl Term {
self.set_fast_value(val);
}
fn set_fast_value<T: FastValue>(&mut self, val: T) {
self.set_bytes(val.to_u64().to_be_bytes().as_ref());
}
/// Sets the value of a `Bytes` field.
pub fn set_bytes(&mut self, bytes: &[u8]) {
self.0.resize(5, 0u8);
self.truncate_value_bytes(0);
self.0.extend(bytes);
}
@@ -156,18 +166,22 @@ impl Term {
self.set_bytes(text.as_bytes());
}
/// Removes the value_bytes and set the type code.
pub fn clear_with_type(&mut self, typ: Type) {
self.truncate(5);
self.0[4] = typ.to_code();
/// Truncates the value bytes of the term. Value and field type stays the same.
pub fn truncate_value_bytes(&mut self, len: usize) {
self.0.truncate(len + TERM_METADATA_LENGTH);
}
/// Truncate the term right after the field and the type code.
pub fn truncate(&mut self, len: usize) {
self.0.truncate(len);
/// Returns the value bytes as mutable slice
pub fn value_bytes_mut(&mut self) -> &mut [u8] {
&mut self.0[TERM_METADATA_LENGTH..]
}
/// Truncate the term right after the field and the type code.
/// The length of the bytes.
pub fn len_bytes(&self) -> usize {
self.0.len() - TERM_METADATA_LENGTH
}
/// Appends value bytes to the Term.
pub fn append_bytes(&mut self, bytes: &[u8]) {
self.0.extend_from_slice(bytes);
}
@@ -293,9 +307,6 @@ where B: AsRef<[u8]>
/// Returns `None` if the field is not of string type
/// or if the bytes are not valid utf-8.
pub fn as_str(&self) -> Option<&str> {
if self.as_slice().len() < 5 {
return None;
}
if self.typ() != Type::Str {
return None;
}
@@ -307,9 +318,6 @@ where B: AsRef<[u8]>
/// Returns `None` if the field is not of facet type
/// or if the bytes are not valid utf-8.
pub fn as_facet(&self) -> Option<Facet> {
if self.as_slice().len() < 5 {
return None;
}
if self.typ() != Type::Facet {
return None;
}
@@ -321,9 +329,6 @@ where B: AsRef<[u8]>
///
/// Returns `None` if the field is not of bytes type.
pub fn as_bytes(&self) -> Option<&[u8]> {
if self.as_slice().len() < 5 {
return None;
}
if self.typ() != Type::Bytes {
return None;
}
@@ -337,7 +342,7 @@ where B: AsRef<[u8]>
/// If the term is a u64, its value is encoded according
/// to `byteorder::LittleEndian`.
pub fn value_bytes(&self) -> &[u8] {
&self.0.as_ref()[5..]
&self.0.as_ref()[TERM_METADATA_LENGTH..]
}
/// Returns the underlying `&[u8]`.
@@ -451,6 +456,18 @@ mod tests {
assert_eq!(term.as_str(), Some("test"))
}
/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
/// <field> + <type byte> + <value len>
///
/// - <field> is a big endian encoded u32 field id
/// - <type_byte>'s most significant bit expresses whether the term is a json term or not
/// The remaining 7 bits are used to encode the type of the value.
/// If this is a JSON term, the type is the type of the leaf of the json.
///
/// - <value> is, if this is not the json term, a binary representation specific to the type.
/// If it is a JSON Term, then it is prepended with the path that leads to this leaf value.
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
#[test]
pub fn test_term_u64() {
let mut schema_builder = Schema::builder();
@@ -458,7 +475,7 @@ mod tests {
let term = Term::from_field_u64(count_field, 983u64);
assert_eq!(term.field(), count_field);
assert_eq!(term.typ(), Type::U64);
assert_eq!(term.as_slice().len(), super::FAST_VALUE_TERM_LEN);
assert_eq!(term.as_slice().len(), FAST_VALUE_TERM_LEN);
assert_eq!(term.as_u64(), Some(983u64))
}
@@ -469,7 +486,7 @@ mod tests {
let term = Term::from_field_bool(bool_field, true);
assert_eq!(term.field(), bool_field);
assert_eq!(term.typ(), Type::Bool);
assert_eq!(term.as_slice().len(), super::FAST_VALUE_TERM_LEN);
assert_eq!(term.as_slice().len(), FAST_VALUE_TERM_LEN);
assert_eq!(term.as_bool(), Some(true))
}
}