From 07bf66a19728247865b37ad6cc514f450157d900 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 24 Oct 2023 09:45:50 +0200 Subject: [PATCH] json path writer (#2224) * refactor logic to JsonPathWriter * use in encode_column_name * add inlines * move unsafe block --- common/src/json_path_writer.rs | 112 +++++++++++++++++++++++++++++++++ common/src/lib.rs | 2 + src/core/json_utils.rs | 20 +++--- src/fastfield/writer.rs | 48 +++++--------- 4 files changed, 137 insertions(+), 45 deletions(-) create mode 100644 common/src/json_path_writer.rs diff --git a/common/src/json_path_writer.rs b/common/src/json_path_writer.rs new file mode 100644 index 000000000..43a5da8eb --- /dev/null +++ b/common/src/json_path_writer.rs @@ -0,0 +1,112 @@ +use crate::replace_in_place; + +/// Separates the different segments of a json path. +pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8; +pub const JSON_PATH_SEGMENT_SEP_STR: &str = + unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) }; + +/// Create a new JsonPathWriter, that creates flattened json paths for tantivy. +#[derive(Clone, Debug, Default)] +pub struct JsonPathWriter { + path: String, + indices: Vec<usize>, + expand_dots: bool, +} + +impl JsonPathWriter { + pub fn new() -> Self { + JsonPathWriter { + path: String::new(), + indices: Vec::new(), + expand_dots: false, + } + } + + /// When expand_dots is enabled, json object like + /// `{"k8s.node.id": 5}` is processed as if it was + /// `{"k8s": {"node": {"id": 5}}}`. + /// This option has the merit of allowing users to + /// write queries like `k8s.node.id:5`. + /// On the other hand, enabling that feature can lead to + /// ambiguity. + #[inline] + pub fn set_expand_dots(&mut self, expand_dots: bool) { + self.expand_dots = expand_dots; + } + + /// Push a new segment to the path. 
+ #[inline] + pub fn push(&mut self, segment: &str) { + let len_path = self.path.len(); + self.indices.push(len_path); + if !self.path.is_empty() { + self.path.push_str(JSON_PATH_SEGMENT_SEP_STR); + } + self.path.push_str(segment); + if self.expand_dots { + // This might include the separation byte, which is ok because it is not a dot. + let appended_segment = &mut self.path[len_path..]; + // The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are + // valid single byte utf-8 strings. + // By utf-8 design, they cannot be part of another codepoint. + unsafe { + replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment.as_bytes_mut()) + }; + } + } + + /// Remove the last segment. Does nothing if the path is empty. + #[inline] + pub fn pop(&mut self) { + if let Some(last_idx) = self.indices.pop() { + self.path.truncate(last_idx); + } + } + + /// Clear the path. + #[inline] + pub fn clear(&mut self) { + self.path.clear(); + self.indices.clear(); + } + + /// Get the current path. 
+ #[inline] + pub fn as_str(&self) -> &str { + &self.path + } +} + +impl From<JsonPathWriter> for String { + #[inline] + fn from(value: JsonPathWriter) -> Self { + value.path + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn json_path_writer_test() { + let mut writer = JsonPathWriter::new(); + + writer.push("root"); + assert_eq!(writer.as_str(), "root"); + + writer.push("child"); + assert_eq!(writer.as_str(), "root\u{1}child"); + + writer.pop(); + assert_eq!(writer.as_str(), "root"); + + writer.push("k8s.node.id"); + assert_eq!(writer.as_str(), "root\u{1}k8s.node.id"); + + writer.set_expand_dots(true); + writer.pop(); + writer.push("k8s.node.id"); + assert_eq!(writer.as_str(), "root\u{1}k8s\u{1}node\u{1}id"); + } +} diff --git a/common/src/lib.rs b/common/src/lib.rs index 3ea1fedf1..9dcdc5a46 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -9,6 +9,7 @@ mod byte_count; mod datetime; pub mod file_slice; mod group_by; +mod json_path_writer; mod serialize; mod vint; mod writer; @@ -18,6 +19,7 @@ pub use byte_count::ByteCount; pub use datetime::DatePrecision; pub use datetime::{DateTime, DateTimePrecision}; pub use group_by::GroupByIteratorExtended; +pub use json_path_writer::JsonPathWriter; pub use ownedbytes::{OwnedBytes, StableDeref}; pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize}; pub use vint::{ diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index 86fe08fd5..ae8db931a 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -1,12 +1,12 @@ use columnar::MonotonicallyMappableToU64; -use common::replace_in_place; +use common::{replace_in_place, JsonPathWriter}; use murmurhash32::murmurhash2; use rustc_hash::FxHashMap; use crate::fastfield::FastValue; use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value}; -use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR}; +use 
crate::schema::term::JSON_PATH_SEGMENT_SEP; use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED}; use crate::time::format_description::well_known::Rfc3339; use crate::time::{OffsetDateTime, UtcOffset}; @@ -315,17 +315,13 @@ pub(crate) fn encode_column_name( json_path: &str, expand_dots_enabled: bool, ) -> String { - let mut column_key: String = String::with_capacity(field_name.len() + json_path.len() + 1); - column_key.push_str(field_name); - for mut segment in split_json_path(json_path) { - column_key.push_str(JSON_PATH_SEGMENT_SEP_STR); - if expand_dots_enabled { - // We need to replace `.` by JSON_PATH_SEGMENT_SEP. - unsafe { replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, segment.as_bytes_mut()) }; - } - column_key.push_str(&segment); + let mut path = JsonPathWriter::default(); + path.push(field_name); + path.set_expand_dots(expand_dots_enabled); + for segment in split_json_path(json_path) { + path.push(&segment); } - column_key + path.into() } impl<'a> JsonTermWriter<'a> { diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 4ea6ea291..ca0da8145 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,12 +1,11 @@ use std::io; use columnar::{ColumnarWriter, NumericalValue}; -use common::replace_in_place; +use common::JsonPathWriter; use tokenizer_api::Token; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value}; -use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR}; use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type}; use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::{DateTimePrecision, DocId, TantivyError}; @@ -24,7 +23,7 @@ pub struct FastFieldsWriter { expand_dots: Vec, num_docs: DocId, // Buffer that we recycle to avoid allocation. 
- json_path_buffer: String, + json_path_buffer: JsonPathWriter, } impl FastFieldsWriter { @@ -98,7 +97,7 @@ impl FastFieldsWriter { num_docs: 0u32, date_precisions, expand_dots, - json_path_buffer: String::new(), + json_path_buffer: JsonPathWriter::default(), }) } @@ -212,14 +211,16 @@ impl FastFieldsWriter { ReferenceValue::Object(val) => { let expand_dots = self.expand_dots[field.field_id() as usize]; self.json_path_buffer.clear(); - self.json_path_buffer.push_str(field_name); + // First field should not be expanded. + self.json_path_buffer.set_expand_dots(false); + self.json_path_buffer.push(field_name); + self.json_path_buffer.set_expand_dots(expand_dots); let text_analyzer = &mut self.per_field_tokenizer[field.field_id() as usize]; record_json_obj_to_columnar_writer::( doc_id, val, - expand_dots, JSON_DEPTH_LIMIT, &mut self.json_path_buffer, &mut self.columnar_writer, @@ -250,48 +251,30 @@ impl FastFieldsWriter { fn record_json_obj_to_columnar_writer<'a, V: Value<'a>>( doc: DocId, json_visitor: V::ObjectIter, - expand_dots: bool, remaining_depth_limit: usize, - json_path_buffer: &mut String, + json_path_buffer: &mut JsonPathWriter, columnar_writer: &mut columnar::ColumnarWriter, tokenizer: &mut Option, ) { for (key, child) in json_visitor { - let len_path = json_path_buffer.len(); - if !json_path_buffer.is_empty() { - json_path_buffer.push_str(JSON_PATH_SEGMENT_SEP_STR); - } - json_path_buffer.push_str(key); - if expand_dots { - // This might include the separation byte, which is ok because it is not a dot. - let appended_segment = &mut json_path_buffer[len_path..]; - // The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are - // valid single byte ut8 strings. - // By utf-8 design, they cannot be part of another codepoint. 
- replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, unsafe { - appended_segment.as_bytes_mut() - }); - } + json_path_buffer.push(key); record_json_value_to_columnar_writer( doc, child, - expand_dots, remaining_depth_limit, json_path_buffer, columnar_writer, tokenizer, ); - // popping our sub path. - json_path_buffer.truncate(len_path); + json_path_buffer.pop(); } } fn record_json_value_to_columnar_writer<'a, V: Value<'a>>( doc: DocId, json_val: V, - expand_dots: bool, mut remaining_depth_limit: usize, - json_path_writer: &mut String, + json_path_writer: &mut JsonPathWriter, columnar_writer: &mut columnar::ColumnarWriter, tokenizer: &mut Option, ) { @@ -335,7 +318,7 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>( ); } ReferenceValueLeaf::Bool(val) => { - columnar_writer.record_bool(doc, json_path_writer, val); + columnar_writer.record_bool(doc, json_path_writer.as_str(), val); } ReferenceValueLeaf::Date(val) => { columnar_writer.record_datetime(doc, json_path_writer.as_str(), val); @@ -362,7 +345,6 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>( record_json_value_to_columnar_writer( doc, el, - expand_dots, remaining_depth_limit, json_path_writer, columnar_writer, @@ -374,7 +356,6 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>( record_json_obj_to_columnar_writer::( doc, object, - expand_dots, remaining_depth_limit, json_path_writer, columnar_writer, @@ -387,6 +368,7 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>( #[cfg(test)] mod tests { use columnar::{Column, ColumnarReader, ColumnarWriter, StrColumn}; + use common::JsonPathWriter; use super::record_json_value_to_columnar_writer; use crate::fastfield::writer::JSON_DEPTH_LIMIT; @@ -397,12 +379,12 @@ mod tests { expand_dots: bool, ) -> ColumnarReader { let mut columnar_writer = ColumnarWriter::default(); - let mut json_path = String::new(); + let mut json_path = JsonPathWriter::default(); + json_path.set_expand_dots(expand_dots); for (doc, json_doc) in 
json_docs.iter().enumerate() { record_json_value_to_columnar_writer( doc as u32, json_doc, - expand_dots, JSON_DEPTH_LIMIT, &mut json_path, &mut columnar_writer,