Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2025-12-23 02:29:57 +00:00
json path writer (#2224)
* refactor logic to JsonPathWriter
* use in encode_column_name
* add inlines
* move unsafe block
common/src/json_path_writer.rs (new file, 112 lines)
@@ -0,0 +1,112 @@
use crate::replace_in_place;

/// Separates the different segments of a json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
pub const JSON_PATH_SEGMENT_SEP_STR: &str =
    unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };

/// `JsonPathWriter` builds the flattened json paths used by tantivy.
#[derive(Clone, Debug, Default)]
pub struct JsonPathWriter {
    path: String,
    indices: Vec<usize>,
    expand_dots: bool,
}

impl JsonPathWriter {
    pub fn new() -> Self {
        JsonPathWriter {
            path: String::new(),
            indices: Vec::new(),
            expand_dots: false,
        }
    }

    /// When expand_dots is enabled, a json object like
    /// `{"k8s.node.id": 5}` is processed as if it were
    /// `{"k8s": {"node": {"id": 5}}}`.
    /// This option has the merit of allowing users to
    /// write queries like `k8s.node.id:5`.
    /// On the other hand, enabling that feature can lead to
    /// ambiguity.
    #[inline]
    pub fn set_expand_dots(&mut self, expand_dots: bool) {
        self.expand_dots = expand_dots;
    }

    /// Push a new segment to the path.
    #[inline]
    pub fn push(&mut self, segment: &str) {
        let len_path = self.path.len();
        self.indices.push(len_path);
        if !self.path.is_empty() {
            self.path.push_str(JSON_PATH_SEGMENT_SEP_STR);
        }
        self.path.push_str(segment);
        if self.expand_dots {
            // This might include the separation byte, which is ok because it is not a dot.
            let appended_segment = &mut self.path[len_path..];
            // The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
            // valid single-byte utf-8 strings.
            // By utf-8 design, they cannot be part of another codepoint.
            unsafe {
                replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment.as_bytes_mut())
            };
        }
    }

    /// Remove the last segment. Does nothing if the path is empty.
    #[inline]
    pub fn pop(&mut self) {
        if let Some(last_idx) = self.indices.pop() {
            self.path.truncate(last_idx);
        }
    }

    /// Clear the path.
    #[inline]
    pub fn clear(&mut self) {
        self.path.clear();
        self.indices.clear();
    }

    /// Get the current path.
    #[inline]
    pub fn as_str(&self) -> &str {
        &self.path
    }
}

impl From<JsonPathWriter> for String {
    #[inline]
    fn from(value: JsonPathWriter) -> Self {
        value.path
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn json_path_writer_test() {
        let mut writer = JsonPathWriter::new();

        writer.push("root");
        assert_eq!(writer.as_str(), "root");

        writer.push("child");
        assert_eq!(writer.as_str(), "root\u{1}child");

        writer.pop();
        assert_eq!(writer.as_str(), "root");

        writer.push("k8s.node.id");
        assert_eq!(writer.as_str(), "root\u{1}k8s.node.id");

        writer.set_expand_dots(true);
        writer.pop();
        writer.push("k8s.node.id");
        assert_eq!(writer.as_str(), "root\u{1}k8s\u{1}node\u{1}id");
    }
}
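The ambiguity mentioned in the `set_expand_dots` docs can be made concrete with a short standalone sketch (not part of the diff; it only uses the API defined above): once dots are expanded, a dotted key and the equivalent nested object flatten to the same path, so the two document shapes become indistinguishable at query time.

use common::JsonPathWriter;

fn main() {
    // The dotted key of `{"k8s.node.id": 5}`, with expand_dots enabled...
    let mut dotted = JsonPathWriter::default();
    dotted.set_expand_dots(true);
    dotted.push("k8s.node.id");

    // ...and the nested keys of `{"k8s": {"node": {"id": 5}}}`...
    let mut nested = JsonPathWriter::default();
    nested.push("k8s");
    nested.push("node");
    nested.push("id");

    // ...both flatten to "k8s\u{1}node\u{1}id".
    assert_eq!(dotted.as_str(), nested.as_str());
}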
@@ -9,6 +9,7 @@ mod byte_count;
 mod datetime;
 pub mod file_slice;
 mod group_by;
+mod json_path_writer;
 mod serialize;
 mod vint;
 mod writer;
@@ -18,6 +19,7 @@ pub use byte_count::ByteCount;
 pub use datetime::DatePrecision;
 pub use datetime::{DateTime, DateTimePrecision};
 pub use group_by::GroupByIteratorExtended;
+pub use json_path_writer::JsonPathWriter;
 pub use ownedbytes::{OwnedBytes, StableDeref};
 pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
 pub use vint::{
@@ -1,12 +1,12 @@
 use columnar::MonotonicallyMappableToU64;
-use common::replace_in_place;
+use common::{replace_in_place, JsonPathWriter};
 use murmurhash32::murmurhash2;
 use rustc_hash::FxHashMap;

 use crate::fastfield::FastValue;
 use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
 use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
-use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
+use crate::schema::term::JSON_PATH_SEGMENT_SEP;
 use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED};
 use crate::time::format_description::well_known::Rfc3339;
 use crate::time::{OffsetDateTime, UtcOffset};
@@ -315,17 +315,13 @@ pub(crate) fn encode_column_name(
     json_path: &str,
     expand_dots_enabled: bool,
 ) -> String {
-    let mut column_key: String = String::with_capacity(field_name.len() + json_path.len() + 1);
-    column_key.push_str(field_name);
-    for mut segment in split_json_path(json_path) {
-        column_key.push_str(JSON_PATH_SEGMENT_SEP_STR);
-        if expand_dots_enabled {
-            // We need to replace `.` by JSON_PATH_SEGMENT_SEP.
-            unsafe { replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, segment.as_bytes_mut()) };
-        }
-        column_key.push_str(&segment);
+    let mut path = JsonPathWriter::default();
+    path.push(field_name);
+    path.set_expand_dots(expand_dots_enabled);
+    for segment in split_json_path(json_path) {
+        path.push(&segment);
     }
-    column_key
+    path.into()
 }

 impl<'a> JsonTermWriter<'a> {
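As a sanity check, here is a standalone sketch (not taken from the diff) showing that the rewritten body produces the same column key as the old manual concatenation; it treats the json path as a single segment and uses the literal `\u{1}` separator, ignoring whatever escaping `split_json_path` performs.

use common::JsonPathWriter;

fn main() {
    // New style: what encode_column_name now builds for field "attributes"
    // and the segment "k8s.node.id" with expand_dots enabled.
    let mut path = JsonPathWriter::default();
    path.push("attributes");
    path.set_expand_dots(true);
    path.push("k8s.node.id");
    let new_key: String = path.into();

    // Old style: field name, separator byte, then the segment with '.'
    // replaced by the separator byte.
    let old_key = format!("attributes\u{1}{}", "k8s.node.id".replace('.', "\u{1}"));

    assert_eq!(new_key, old_key); // "attributes\u{1}k8s\u{1}node\u{1}id"
}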
@@ -1,12 +1,11 @@
 use std::io;

 use columnar::{ColumnarWriter, NumericalValue};
-use common::replace_in_place;
+use common::JsonPathWriter;
 use tokenizer_api::Token;

 use crate::indexer::doc_id_mapping::DocIdMapping;
 use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
-use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
 use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
 use crate::tokenizer::{TextAnalyzer, TokenizerManager};
 use crate::{DateTimePrecision, DocId, TantivyError};
@@ -24,7 +23,7 @@ pub struct FastFieldsWriter {
     expand_dots: Vec<bool>,
     num_docs: DocId,
     // Buffer that we recycle to avoid allocation.
-    json_path_buffer: String,
+    json_path_buffer: JsonPathWriter,
 }

 impl FastFieldsWriter {
@@ -98,7 +97,7 @@ impl FastFieldsWriter {
             num_docs: 0u32,
             date_precisions,
             expand_dots,
-            json_path_buffer: String::new(),
+            json_path_buffer: JsonPathWriter::default(),
         })
     }

@@ -212,14 +211,16 @@ impl FastFieldsWriter {
             ReferenceValue::Object(val) => {
                 let expand_dots = self.expand_dots[field.field_id() as usize];
                 self.json_path_buffer.clear();
-                self.json_path_buffer.push_str(field_name);
+                // First field should not be expanded.
+                self.json_path_buffer.set_expand_dots(false);
+                self.json_path_buffer.push(field_name);
+                self.json_path_buffer.set_expand_dots(expand_dots);

                 let text_analyzer = &mut self.per_field_tokenizer[field.field_id() as usize];

                 record_json_obj_to_columnar_writer::<V>(
                     doc_id,
                     val,
-                    expand_dots,
                     JSON_DEPTH_LIMIT,
                     &mut self.json_path_buffer,
                     &mut self.columnar_writer,
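The reordering above is the subtle part: expansion is switched off while the field name itself is pushed, and only switched on for the JSON keys that follow. A standalone sketch of the effect (the field name "my.field" is hypothetical, chosen only to show that a dot in the first segment is left alone):

use common::JsonPathWriter;

fn main() {
    let mut buf = JsonPathWriter::default();
    // First segment: the tantivy field name, pushed with expansion off.
    buf.set_expand_dots(false);
    buf.push("my.field");
    // Later segments: JSON keys, expanded when the field is configured that way.
    buf.set_expand_dots(true);
    buf.push("k8s.node.id");
    assert_eq!(buf.as_str(), "my.field\u{1}k8s\u{1}node\u{1}id");
}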
@@ -250,48 +251,30 @@
 fn record_json_obj_to_columnar_writer<'a, V: Value<'a>>(
     doc: DocId,
     json_visitor: V::ObjectIter,
-    expand_dots: bool,
     remaining_depth_limit: usize,
-    json_path_buffer: &mut String,
+    json_path_buffer: &mut JsonPathWriter,
     columnar_writer: &mut columnar::ColumnarWriter,
     tokenizer: &mut Option<TextAnalyzer>,
 ) {
     for (key, child) in json_visitor {
-        let len_path = json_path_buffer.len();
-        if !json_path_buffer.is_empty() {
-            json_path_buffer.push_str(JSON_PATH_SEGMENT_SEP_STR);
-        }
-        json_path_buffer.push_str(key);
-        if expand_dots {
-            // This might include the separation byte, which is ok because it is not a dot.
-            let appended_segment = &mut json_path_buffer[len_path..];
-            // The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
-            // valid single byte ut8 strings.
-            // By utf-8 design, they cannot be part of another codepoint.
-            replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, unsafe {
-                appended_segment.as_bytes_mut()
-            });
-        }
+        json_path_buffer.push(key);
         record_json_value_to_columnar_writer(
             doc,
             child,
-            expand_dots,
             remaining_depth_limit,
             json_path_buffer,
             columnar_writer,
             tokenizer,
         );
-        // popping our sub path.
-        json_path_buffer.truncate(len_path);
+        json_path_buffer.pop();
     }
 }

 fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
     doc: DocId,
     json_val: V,
-    expand_dots: bool,
     mut remaining_depth_limit: usize,
-    json_path_writer: &mut String,
+    json_path_writer: &mut JsonPathWriter,
     columnar_writer: &mut columnar::ColumnarWriter,
     tokenizer: &mut Option<TextAnalyzer>,
 ) {
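This hunk is the heart of the refactor: the manual `len()` / `truncate()` bookkeeping around each key becomes a `push` / `pop` pair, because `JsonPathWriter` tracks segment boundaries itself. A standalone sketch of the same recursion pattern, using `serde_json` values purely for illustration (tantivy's real visitor types differ):

use common::JsonPathWriter;
use serde_json::Value;

// Collect the flattened path of every leaf in a JSON value.
fn collect_paths(value: &Value, path: &mut JsonPathWriter, out: &mut Vec<String>) {
    match value {
        Value::Object(map) => {
            for (key, child) in map {
                path.push(key);              // descend into the key...
                collect_paths(child, path, out);
                path.pop();                  // ...and restore the previous path.
            }
        }
        _ => out.push(path.as_str().to_string()),
    }
}

fn main() {
    let doc: Value = serde_json::json!({"k8s": {"node": {"id": 5}}, "name": "n1"});
    let mut path = JsonPathWriter::default();
    let mut out = Vec::new();
    collect_paths(&doc, &mut path, &mut out);
    assert_eq!(out, vec!["k8s\u{1}node\u{1}id".to_string(), "name".to_string()]);
}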
@@ -335,7 +318,7 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
             );
         }
         ReferenceValueLeaf::Bool(val) => {
-            columnar_writer.record_bool(doc, json_path_writer, val);
+            columnar_writer.record_bool(doc, json_path_writer.as_str(), val);
         }
         ReferenceValueLeaf::Date(val) => {
             columnar_writer.record_datetime(doc, json_path_writer.as_str(), val);
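Since `JsonPathWriter` is not a `String`, call sites that previously passed the buffer directly (relying on deref coercion to `&str`) now go through `as_str()`; the columnar writer's `record_*` methods keep their plain `&str` column-name parameter. A minimal standalone sketch of that hand-off, mirroring the test helper further down (the field and key names are made up):

use columnar::ColumnarWriter;
use common::JsonPathWriter;

fn main() {
    let mut columnar_writer = ColumnarWriter::default();

    let mut path = JsonPathWriter::default();
    path.push("attributes");
    path.push("is_active");

    // Same shape as the call in the hunk above: (doc_id, column name, value).
    columnar_writer.record_bool(0u32, path.as_str(), true);
}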
@@ -362,7 +345,6 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
                 record_json_value_to_columnar_writer(
                     doc,
                     el,
-                    expand_dots,
                     remaining_depth_limit,
                     json_path_writer,
                     columnar_writer,
@@ -374,7 +356,6 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
                 record_json_obj_to_columnar_writer::<V>(
                     doc,
                     object,
-                    expand_dots,
                     remaining_depth_limit,
                     json_path_writer,
                     columnar_writer,
@@ -387,6 +368,7 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
 #[cfg(test)]
 mod tests {
     use columnar::{Column, ColumnarReader, ColumnarWriter, StrColumn};
+    use common::JsonPathWriter;

     use super::record_json_value_to_columnar_writer;
     use crate::fastfield::writer::JSON_DEPTH_LIMIT;
@@ -397,12 +379,12 @@ mod tests {
         expand_dots: bool,
     ) -> ColumnarReader {
         let mut columnar_writer = ColumnarWriter::default();
-        let mut json_path = String::new();
+        let mut json_path = JsonPathWriter::default();
+        json_path.set_expand_dots(expand_dots);
         for (doc, json_doc) in json_docs.iter().enumerate() {
             record_json_value_to_columnar_writer(
                 doc as u32,
                 json_doc,
-                expand_dots,
                 JSON_DEPTH_LIMIT,
                 &mut json_path,
                 &mut columnar_writer,